mirror of
https://github.com/XuehaiPan/nvitop.git
synced 2026-05-15 14:15:55 -06:00
Add gpu-stats.py nvitop logging script
This commit is contained in:
parent
d32d842ede
commit
6a01315846
1 changed file with 82 additions and 0 deletions
82
nvitop/gpu-stats.py
Normal file
82
nvitop/gpu-stats.py
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
# nvitop Python script to log stats for all Nvidia GPUs, physical and MIG.
|
||||
# You must run this within a venv that has nvitop installed.
|
||||
# by Dan MacDonald 2026
|
||||
|
||||
import csv
|
||||
import sys
|
||||
import platform
|
||||
from datetime import datetime
|
||||
from nvitop import PhysicalDevice, MigDevice, NA
|
||||
import pynvml
|
||||
|
||||
def collect_gpu_stats():
    """Write one CSV row per GPU (or per MIG device) to standard output.

    A single timestamp, the kernel release and the NVIDIA driver version are
    captured once and shared by every row of the run. For each physical
    device, one row is written per MIG device when ``mig_devices()`` returns
    any; otherwise a single row is written for the physical device itself.
    Row formatting is delegated to ``write_row``.

    Any ``Exception`` raised while initializing NVML or enumerating devices is
    reported on stderr instead of being propagated.
    """
    writer = csv.writer(sys.stdout)
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    kernel_version = platform.release()

    # Remember whether nvmlInit() succeeded so cleanup only shuts down a
    # library that was actually initialized.
    nvml_initialized = False
    try:
        # Initialize pynvml to get the driver version string correctly
        pynvml.nvmlInit()
        nvml_initialized = True

        driver_version = pynvml.nvmlSystemGetDriverVersion()
        # Some pynvml releases return bytes rather than str here.
        if isinstance(driver_version, bytes):
            driver_version = driver_version.decode('utf-8')

        for p_dev in PhysicalDevice.all():
            # Report the MIG slices when present, else the physical GPU itself.
            # PCI address and temperature always come from the physical parent.
            for dev in p_dev.mig_devices() or [p_dev]:
                write_row(writer, timestamp, kernel_version, driver_version, dev, parent=p_dev)
    except Exception as e:
        print(f"Init error: {e}", file=sys.stderr)
    finally:
        if nvml_initialized:
            try:
                pynvml.nvmlShutdown()
            except Exception:
                # Best-effort cleanup: the process is about to exit and a
                # failed shutdown must not mask the rows already written.
                pass
|
||||
|
||||
def write_row(writer, timestamp, kernel, driver, device, parent):
    """Write one CSV row of statistics for a single device.

    Args:
        writer: Object exposing ``writerow`` (e.g. a ``csv.writer``).
        timestamp: Timestamp string written as the first column.
        kernel: Kernel release string.
        driver: Driver version string.
        device: Device queried for index, processes/utilization, memory and
            UUID.
        parent: Device queried for the PCI bus ID and temperature. Callers
            pass the physical GPU here; it may be the same object as
            ``device``.

    Columns, in order: timestamp, kernel, driver, device index, PCI bus ID,
    temperature, GPU utilization, memory used, memory total, UUID. The two
    memory values are the device's readings divided by 1024**2 and formatted
    with two decimals.

    Any ``Exception`` raised while querying the device or writing the row is
    reported on stderr and the row is skipped; nothing is re-raised.
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not caught.
    """
    try:
        bus_id = parent.pci_info().busId
        # The bus ID may be bytes or str depending on the NVML binding.
        pci_address = bus_id.decode('utf-8') if isinstance(bus_id, bytes) else bus_id
        temp = parent.temperature()

        gpu_util = _process_based_gpu_utilization(device)

        bytes_per_mib = 1024 ** 2
        mem_used = device.memory_used() / bytes_per_mib
        mem_total = device.memory_total() / bytes_per_mib
        device_uuid = device.uuid()

        writer.writerow([
            timestamp,
            kernel,
            driver,
            device.index,
            pci_address,
            temp,
            gpu_util,
            f"{mem_used:.2f}",
            f"{mem_total:.2f}",
            device_uuid,
        ])
    except Exception as e:
        idx = getattr(device, 'index', 'unknown')
        print(f"Error on device {idx}: {e}", file=sys.stderr)


def _process_based_gpu_utilization(device):
    """Return a utilization figure for ``device``, preferring process presence.

    Returns 1 when ``device.processes()`` is non-empty, otherwise the value of
    ``device.gpu_utilization()``, or 0 when that value is ``None`` or renders
    as ``'N/A'``. Any ``Exception`` from either query yields 0.
    """
    try:
        # Active processes are treated as proof that the device is in use.
        if device.processes():
            return 1
        # Fall back to the hardware counter when no processes are listed.
        val = device.gpu_utilization()
        return val if (val is not None and str(val) != 'N/A') else 0
    except Exception:
        # Only ordinary errors are mapped to 0; Ctrl-C / SystemExit propagate.
        return 0
|
||||
|
||||
# Run the collection only when executed as a script, not when imported.
if __name__ == "__main__":
    collect_gpu_stats()
|
||||
Loading…
Add table
Add a link
Reference in a new issue