Commit 40a001dc by Duchaj János

Added GPU usage monitoring and monitoring of all disk I/O reads and writes.

parent 951b1c5c
pika==1.2.0
psutil==2.1.1
pynvml==11.5.0
......@@ -9,6 +9,8 @@ import pika
import psutil
import time
import re
import subprocess
from pynvml import *
logging.basicConfig()
logger = logging.getLogger(__name__)
......@@ -138,6 +140,18 @@ class Client:
'bytes_sent', 'bytes_recv'):
metrics['network.%s-%s' %
(metric, interface)] = getattr(data, metric)
try:
for deviceCounter in range(nvmlDeviceGetCount()):
handle = nvmlDeviceGetHandleByIndex(deviceCounter)
deviceName = nvmlDeviceGetName(handle).replace(" ", "_")
deviceMemoryInfos = nvmlDeviceGetMemoryInfo(handle)
gpu_percent = deviceMemoryInfos.used / deviceMemoryInfos.total * 100
gpu_used_bytes = deviceMemoryInfos.used
metrics['gpu.percent.%s' % deviceName] = gpu_percent
metrics['gpu.used_bytes.%s' % deviceName] = gpu_used_bytes
except NVMLError as error:
logger.error('Something went wrong with GPU Monitoring:')
logger.error('Error: %s' % error)
return ['%(host)s.%(name)s %(val)f %(time)d' % {'host': self.name,
'name': name,
......@@ -218,6 +232,36 @@ class Client:
return metrics
def startIOmonitor(self):
    """Spawn the iotop sampling pipeline and return its process handle.

    The pipeline runs ``iotop`` through ``sudo`` and post-processes its
    text with ``awk``, ``cut`` and ``sort``: whitespace is normalised,
    each line is truncated at the first ``%``, and only the 3rd, 4th and
    6th space-separated fields are kept.  Standard output is captured
    through a pipe so the caller can read it later.

    Returns:
        The ``subprocess.Popen`` object of the ``sh -c`` wrapper.  This
        method does not wait for the process to finish.
    """
    # Kept as one shell string because the stages are joined with pipes.
    pipeline = "sudo iotop -ao -qqq -b -k --iter=2 -d9.5 | awk '{$1=$1};1' | cut -d'%' -f1 | cut -d' ' -f3,4,6 | sort"
    return subprocess.Popen(["sh", "-c", pipeline], stdout=subprocess.PIPE)
def collect_node_IO(self, completedIOShell):
    """Convert the output of a finished I/O sampling process into metrics.

    Every usable output line is expected to look like
    ``<name> <read> <write>``.  Lines sharing the same ``<name>`` are
    summed, and two metric strings are produced per name, in first-seen
    order::

        <host>.io.read.<name> <value> <timestamp>
        <host>.io.write.<name> <value> <timestamp>

    Args:
        completedIOShell: Process-like object whose ``communicate()``
            returns ``(stdout_bytes, stderr)``, e.g. the handle returned
            by ``startIOmonitor``.

    Returns:
        A list of metric strings; empty when there is no usable output.
    """
    now = time.time()
    raw_out, _ = completedIOShell.communicate()
    # stdout is None when the process was not started with a pipe; treat
    # that like empty output.  Undecodable bytes must not abort reporting.
    text = (raw_out or b"").decode(errors="replace")

    totals = {}
    for line in text.splitlines():
        fields = line.split()
        # Blank or truncated lines carry no complete sample.
        if len(fields) < 3:
            continue
        try:
            read_val = float(fields[1])
            write_val = float(fields[2])
        except ValueError:
            # Non-numeric columns (e.g. a stray message line) are skipped
            # rather than crashing the whole collection cycle.
            continue
        pair = totals.setdefault(fields[0], [0.0, 0.0])
        pair[0] += read_val
        pair[1] += write_val

    metrics = []
    for name, pair in totals.items():
        for rw, value in zip(("read", "write"), pair):
            metrics.append(
                '%(host)s.io.%(rw)s.%(name)s %(val)f %(time)d' % {
                    'host': self.name,
                    'name': name,
                    'rw': rw,
                    'val': value,
                    'time': now,
                })
    return metrics
@staticmethod
def _chunker(seq, size):
"""Yield seq in size-long chunks.
......@@ -233,9 +277,14 @@ class Client:
"""
self.connect()
self.processes = {}
nvmlInit()
try:
runningIOshell = self.startIOmonitor()
while True:
metrics = self.collect_node() + self.collect_vms()
if runningIOshell.poll() != None:
metrics += self.collect_node_IO(runningIOshell)
runningIOshell = self.startIOmonitor()
if metrics:
for chunk in self._chunker(metrics, 100):
self.send(chunk)
......@@ -244,4 +293,5 @@ class Client:
except KeyboardInterrupt:
logger.info("Reporting has stopped by the user. Exiting...")
finally:
nvmlShutdown()
self.disconnect()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment