Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
CIRCLE3
/
monitor-client
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
1
Merge Requests
0
Pipelines
Wiki
Snippets
Members
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
40a001dc
authored
Apr 14, 2024
by
Duchaj János
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Added GPU usage monitoring and all disk I/O read/write monitoring.
parent
951b1c5c
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
51 additions
and
1 deletions
+51
-1
requirements/base.txt
+1
-1
src/client.py
+50
-0
No files found.
requirements/base.txt
View file @
40a001dc
pika==1.2.0
psutil==2.1.1
pynvml==11.5.0
src/client.py
View file @
40a001dc
...
...
@@ -9,6 +9,8 @@ import pika
import
psutil
import
time
import
re
import
subprocess
from
pynvml
import
*
logging
.
basicConfig
()
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -138,6 +140,18 @@ class Client:
'bytes_sent'
,
'bytes_recv'
):
metrics
[
'network.
%
s-
%
s'
%
(
metric
,
interface
)]
=
getattr
(
data
,
metric
)
try
:
for
deviceCounter
in
range
(
nvmlDeviceGetCount
()):
handle
=
nvmlDeviceGetHandleByIndex
(
deviceCounter
)
deviceName
=
nvmlDeviceGetName
(
handle
)
.
replace
(
" "
,
"_"
)
deviceMemoryInfos
=
nvmlDeviceGetMemoryInfo
(
handle
)
gpu_percent
=
deviceMemoryInfos
.
used
/
deviceMemoryInfos
.
total
*
100
gpu_used_bytes
=
deviceMemoryInfos
.
used
metrics
[
'gpu.percent.
%
s'
%
deviceName
]
=
gpu_percent
metrics
[
'gpu.used_bytes.
%
s'
%
deviceName
]
=
gpu_used_bytes
except
NVMLError
as
error
:
logger
.
error
(
'Something went wrong with GPU Monitoring:'
)
logger
.
error
(
'Error:
%
s'
%
error
)
return
[
'
%(host)
s.
%(name)
s
%(val)
f
%(time)
d'
%
{
'host'
:
self
.
name
,
'name'
:
name
,
...
...
@@ -218,6 +232,36 @@ class Client:
return
metrics
def startIOmonitor(self):
    """Launch the background shell pipeline that samples disk I/O.

    The pipeline runs ``iotop`` through ``sudo`` in batch mode (see
    iotop(8) for the meaning of the individual flags) and post-processes
    its output with standard text tools:

    * ``awk '{$1=$1};1'`` rebuilds every line, collapsing runs of
      whitespace into single spaces;
    * ``cut -d'%' -f1`` keeps only the text before the first ``%``;
    * ``cut -d' ' -f3,4,6`` keeps the 3rd, 4th and 6th space-separated
      fields;
    * ``sort`` orders the resulting lines.

    The call does not wait for the pipeline to finish.

    Returns:
        The ``subprocess.Popen`` object for the ``sh -c`` process, with
        its standard output connected to a pipe.
    """
    # Adjacent literals are concatenated into the exact command string
    # handed to the shell; one pipeline stage per source line.
    pipeline = (
        "sudo iotop -ao -qqq -b -k --iter=2 -d9.5"
        " | awk '{$1=$1};1'"
        " | cut -d'%' -f1"
        " | cut -d' ' -f3,4,6"
        " | sort"
    )
    return subprocess.Popen(["sh", "-c", pipeline], stdout=subprocess.PIPE)
def collect_node_IO(self, completedIOShell):
    """Convert the output of a finished I/O monitor process into metrics.

    Everything the process wrote to its standard output is read and
    parsed as whitespace-separated records of the form
    ``<name> <read> <write>``, one per line.  Read and write values of
    records sharing the same name are summed.  Lines that do not have
    at least three fields, or whose second/third field is not a number,
    are logged and skipped instead of aborting the collection.

    Args:
        completedIOShell: Process object exposing ``communicate()``
            (e.g. the ``Popen`` returned by ``startIOmonitor``); its
            standard output must be bytes.

    Returns:
        A list of strings, two per distinct name (read first, then
        write), each formatted as
        ``<host>.io.<read|write>.<name> <value> <timestamp>`` where
        ``<host>`` is ``self.name`` and ``<timestamp>`` is the time this
        method was called.  The list is empty when the process produced
        no output.
    """
    import logging  # local import keeps this method self-contained

    now = time.time()
    stdout_data, _ = completedIOShell.communicate()
    if not stdout_data:
        # Nothing was captured (empty output or stdout not piped).
        return []

    # name -> [accumulated read value, accumulated write value]
    totals = {}
    for line in stdout_data.decode().splitlines():
        fields = line.split()
        if not fields:
            continue  # blank line
        try:
            read_val = float(fields[1])
            write_val = float(fields[2])
        except (IndexError, ValueError):
            logging.getLogger(__name__).warning(
                'Skipping unparsable IO monitor line: %r', line)
            continue
        if fields[0] in totals:
            totals[fields[0]][0] += read_val
            totals[fields[0]][1] += write_val
        else:
            totals[fields[0]] = [read_val, write_val]

    metrics = []
    for name, (read_total, write_total) in totals.items():
        # The first accumulated value is reported as "read", the second
        # as "write".
        for rw, value in (("read", read_total), ("write", write_total)):
            metrics.append(
                '%(host)s.io.%(rw)s.%(name)s %(val)f %(time)d' % {
                    'host': self.name,
                    'name': name,
                    'rw': rw,
                    'val': value,
                    'time': now,
                })
    return metrics
@staticmethod
def
_chunker
(
seq
,
size
):
"""Yield seq in size-long chunks.
...
...
@@ -233,9 +277,14 @@ class Client:
"""
self
.
connect
()
self
.
processes
=
{}
nvmlInit
()
try
:
runningIOshell
=
self
.
startIOmonitor
()
while
True
:
metrics
=
self
.
collect_node
()
+
self
.
collect_vms
()
if
runningIOshell
.
poll
()
!=
None
:
metrics
+=
self
.
collect_node_IO
(
runningIOshell
)
runningIOshell
=
self
.
startIOmonitor
()
if
metrics
:
for
chunk
in
self
.
_chunker
(
metrics
,
100
):
self
.
send
(
chunk
)
...
...
@@ -244,4 +293,5 @@ class Client:
except
KeyboardInterrupt
:
logger
.
info
(
"Reporting has stopped by the user. Exiting..."
)
finally
:
nvmlShutdown
()
self
.
disconnect
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment