mirror of
https://github.com/XuehaiPan/nvitop.git
synced 2026-05-21 06:45:24 -06:00
feat: use 'curses' library
This commit is contained in:
parent
b6bf3c20e7
commit
fbc7777221
2 changed files with 172 additions and 57 deletions
228
nvhtop.py
228
nvhtop.py
|
|
@ -29,12 +29,13 @@
|
|||
# To Run:
|
||||
# $ python nvhtop.py
|
||||
|
||||
import curses
|
||||
import datetime
|
||||
import sys
|
||||
import time
|
||||
from collections import OrderedDict
|
||||
|
||||
import psutil
|
||||
from cachetools import cached, TTLCache
|
||||
|
||||
import pynvml as nvml
|
||||
|
||||
|
|
@ -65,10 +66,28 @@ def nvml_query(func, *args, **kwargs):
|
|||
class GProcess(psutil.Process):
|
||||
def __init__(self, pid, device, gpu_memory, type='C'):
|
||||
super(GProcess, self).__init__(pid)
|
||||
super(GProcess, self).cpu_percent()
|
||||
self.device = device
|
||||
self.gpu_memory = gpu_memory
|
||||
self.type = type
|
||||
self.cpu_percent()
|
||||
|
||||
@cached(cache=TTLCache(maxsize=128, ttl=5.0))
|
||||
def as_dict(self):
|
||||
return {
|
||||
'device': self.device,
|
||||
'pid': self.pid,
|
||||
'username': self.username(),
|
||||
'gpu_memory': self.gpu_memory,
|
||||
'cpu_percent': self.cpu_percent(),
|
||||
'memory_percent': self.memory_percent(),
|
||||
'running_time': datetime.datetime.now() - datetime.datetime.fromtimestamp(self.create_time()),
|
||||
'cmdline': self.cmdline()
|
||||
}
|
||||
|
||||
|
||||
@cached(cache=TTLCache(maxsize=128, ttl=30.0))
|
||||
def get_gpu_process(pid, device):
|
||||
return GProcess(pid, device, gpu_memory=0, type='')
|
||||
|
||||
|
||||
class Device(object):
|
||||
|
|
@ -86,20 +105,24 @@ class Device(object):
|
|||
__repr__ = __str__
|
||||
|
||||
@property
|
||||
@cached(cache=TTLCache(maxsize=128, ttl=5.0))
|
||||
def display_active(self):
|
||||
return {0: 'Off', 1: 'On'}.get(nvml_query(nvml.nvmlDeviceGetDisplayActive, self.handle), 'N/A')
|
||||
|
||||
@property
|
||||
@cached(cache=TTLCache(maxsize=128, ttl=5.0))
|
||||
def persistence_mode(self):
|
||||
return {0: 'Off', 1: 'On'}.get(nvml_query(nvml.nvmlDeviceGetPersistenceMode, self.handle), 'N/A')
|
||||
|
||||
@property
|
||||
@cached(cache=TTLCache(maxsize=128, ttl=2.0))
|
||||
def ecc_errors(self):
|
||||
return nvml_query(nvml.nvmlDeviceGetTotalEccErrors, self.handle,
|
||||
nvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
|
||||
nvml.NVML_VOLATILE_ECC)
|
||||
|
||||
@property
|
||||
@cached(cache=TTLCache(maxsize=128, ttl=2.0))
|
||||
def fan_speed(self):
|
||||
fan_speed = nvml_query(nvml.nvmlDeviceGetFanSpeed, self.handle)
|
||||
if fan_speed != 'N/A':
|
||||
|
|
@ -107,6 +130,7 @@ class Device(object):
|
|||
return fan_speed
|
||||
|
||||
@property
|
||||
@cached(cache=TTLCache(maxsize=128, ttl=2.0))
|
||||
def utilization(self):
|
||||
utilization = nvml_query(nvml.nvmlDeviceGetUtilizationRates, self.handle).gpu
|
||||
if utilization != 'N/A':
|
||||
|
|
@ -114,6 +138,7 @@ class Device(object):
|
|||
return utilization
|
||||
|
||||
@property
|
||||
@cached(cache=TTLCache(maxsize=128, ttl=5.0))
|
||||
def compute_mode(self):
|
||||
return {
|
||||
nvml.NVML_COMPUTEMODE_DEFAULT: 'Default',
|
||||
|
|
@ -123,6 +148,7 @@ class Device(object):
|
|||
}.get(nvml_query(nvml.nvmlDeviceGetComputeMode, self.handle), 'N/A')
|
||||
|
||||
@property
|
||||
@cached(cache=TTLCache(maxsize=128, ttl=2.0))
|
||||
def temperature(self):
|
||||
temperature = nvml_query(nvml.nvmlDeviceGetTemperature, self.handle, nvml.NVML_TEMPERATURE_GPU)
|
||||
if temperature != 'N/A':
|
||||
|
|
@ -130,6 +156,7 @@ class Device(object):
|
|||
return temperature
|
||||
|
||||
@property
|
||||
@cached(cache=TTLCache(maxsize=128, ttl=5.0))
|
||||
def performance_state(self):
|
||||
performance_state = nvml_query(nvml.nvmlDeviceGetPerformanceState, self.handle)
|
||||
if performance_state != 'N/A':
|
||||
|
|
@ -137,25 +164,46 @@ class Device(object):
|
|||
return performance_state
|
||||
|
||||
@property
|
||||
@cached(cache=TTLCache(maxsize=128, ttl=1.0))
|
||||
def memory_used(self):
|
||||
return nvml_query(lambda handle: nvml.nvmlDeviceGetMemoryInfo(handle).used, self.handle)
|
||||
|
||||
@property
|
||||
@cached(cache=TTLCache(maxsize=128, ttl=1.0))
|
||||
def power_usage(self):
|
||||
return nvml_query(nvml.nvmlDeviceGetPowerUsage, self.handle)
|
||||
|
||||
@property
|
||||
@cached(cache=TTLCache(maxsize=128, ttl=2.0))
|
||||
def processes(self):
|
||||
process = {}
|
||||
processes = {}
|
||||
|
||||
for proc in nvml.nvmlDeviceGetComputeRunningProcesses(self.handle):
|
||||
process[proc.pid] = GProcess(pid=proc.pid, device=self, gpu_memory=proc.usedGpuMemory, type='C')
|
||||
for proc in nvml.nvmlDeviceGetGraphicsRunningProcesses(self.handle):
|
||||
if proc.pid in process:
|
||||
process[proc.pid].type = 'C+G'
|
||||
for p in nvml.nvmlDeviceGetComputeRunningProcesses(self.handle):
|
||||
proc = processes[p.pid] = get_gpu_process(pid=p.pid, device=self)
|
||||
proc.gpu_memory = p.usedGpuMemory
|
||||
if proc.type == 'G':
|
||||
proc.type = 'C+G'
|
||||
else:
|
||||
process[proc.pid] = GProcess(pid=proc.pid, device=self, gpu_memory=proc.usedGpuMemory, type='G')
|
||||
return OrderedDict(sorted(process.items()))
|
||||
proc.type = 'C'
|
||||
for p in nvml.nvmlDeviceGetGraphicsRunningProcesses(self.handle):
|
||||
proc = processes[p.pid] = get_gpu_process(pid=p.pid, device=self)
|
||||
proc.gpu_memory = p.usedGpuMemory
|
||||
if proc.type == 'C':
|
||||
proc.type = 'C+G'
|
||||
else:
|
||||
proc.type = 'G'
|
||||
return processes
|
||||
|
||||
@cached(cache=TTLCache(maxsize=128, ttl=1.0))
|
||||
def as_dict(self):
|
||||
return {key: getattr(self, key) for key in self._as_dict_keys}
|
||||
|
||||
_as_dict_keys = ['index', 'name',
|
||||
'persistence_mode', 'bus_id', 'display_active', 'ecc_errors',
|
||||
'fan_speed', 'temperature', 'performance_state',
|
||||
'power_usage', 'power_limit',
|
||||
'memory_used', 'memory_total',
|
||||
'utilization', 'compute_mode']
|
||||
|
||||
|
||||
class Top(object):
|
||||
|
|
@ -168,62 +216,104 @@ class Top(object):
|
|||
self.device_count = nvml.nvmlDeviceGetCount()
|
||||
self.devices = list(map(Device, range(self.device_count)))
|
||||
|
||||
def redraw(self, compact=False):
|
||||
lines = [
|
||||
time.strftime('%a %b %d %H:%M:%S %Y'),
|
||||
self.win = curses.initscr()
|
||||
curses.noecho()
|
||||
curses.cbreak()
|
||||
curses.curs_set(False)
|
||||
self.win.nodelay(True)
|
||||
|
||||
self.termsize = None
|
||||
self.n_rows = 0
|
||||
|
||||
def __del__(self):
|
||||
curses.endwin()
|
||||
|
||||
def redraw(self):
|
||||
n_used_devices = 0
|
||||
processes = {}
|
||||
for device in self.devices:
|
||||
device_processes = device.processes
|
||||
if len(device_processes) > 0:
|
||||
processes.update(device.processes)
|
||||
n_used_devices += 1
|
||||
|
||||
n_term_rows, n_term_cols = termsize = self.win.getmaxyx()
|
||||
if n_used_devices > 0:
|
||||
compact = (n_term_rows < 7 + 3 * self.device_count + 6 + len(processes) + n_used_devices - 1)
|
||||
else:
|
||||
compact = (n_term_rows < 7 + 3 * self.device_count + 7)
|
||||
|
||||
rows = [
|
||||
'{:<79}'.format(time.strftime('%a %b %d %H:%M:%S %Y')),
|
||||
'╒═════════════════════════════════════════════════════════════════════════════╕',
|
||||
'│ NVIDIA-SMI {0:<6} Driver Version: {0:<6} CUDA Version: {1:<5} │'.format(self.driver_version,
|
||||
self.cuda_version),
|
||||
'├───────────────────────────────┬──────────────────────┬──────────────────────┤'
|
||||
]
|
||||
if compact:
|
||||
lines.append('│ GPU Temp Perf Pwr:Usage/Cap│ Memory-Usage │ GPU-Util Compute M. │')
|
||||
rows.append('│ GPU Temp Perf Pwr:Usage/Cap│ Memory-Usage │ GPU-Util Compute M. │')
|
||||
else:
|
||||
lines.extend([
|
||||
rows.extend([
|
||||
'│ GPU Name Persistence-M│ Bus-Id Disp.A │ Volatile Uncorr. ECC │',
|
||||
'│ Fan Temp Perf Pwr:Usage/Cap│ Memory-Usage │ GPU-Util Compute M. │'
|
||||
])
|
||||
lines.append('╞═══════════════════════════════╪══════════════════════╪══════════════════════╡')
|
||||
|
||||
processes = {}
|
||||
rows.append('╞═══════════════════════════════╪══════════════════════╪══════════════════════╡')
|
||||
|
||||
for device in self.devices:
|
||||
name = device.name
|
||||
if len(name) > 18:
|
||||
name = name[:15] + '...'
|
||||
device_info = device.as_dict()
|
||||
if len(device_info['name']) > 18:
|
||||
device_info['name'] = device_info['name'][:15] + '...'
|
||||
|
||||
memory = '{} / {}'.format(bytes2human(device.memory_used), bytes2human(device.memory_total))
|
||||
device_info['memory'] = '{} / {}'.format(bytes2human(device_info['memory_used']),
|
||||
bytes2human(device_info['memory_total']))
|
||||
|
||||
power_usage = device.power_usage
|
||||
if power_usage != 'N/A' and device.power_limit != 'N/A':
|
||||
power = '{}W / {}W'.format(power_usage // 1000, device.power_limit // 1000)
|
||||
if device_info['power_usage'] != 'N/A' and device_info['power_limit'] != 'N / A':
|
||||
device_info['power'] = '{}W / {}W'.format(device_info['power_usage'] // 1000,
|
||||
device_info['power_limit'] // 1000)
|
||||
else:
|
||||
power = 'N/A'
|
||||
device_info['power'] = 'N/A'
|
||||
|
||||
if compact:
|
||||
lines.append(
|
||||
'│ {:>3} {:>4} {:>4} {:>12} │ {:>20} │ {:>7} {:>11} │'.format(device.index, device.temperature,
|
||||
device.performance_state,
|
||||
power, memory,
|
||||
device.utilization, device.compute_mode)
|
||||
)
|
||||
rows.append(
|
||||
'│ {:>3} {:>4} {:>4} {:>12} │ {:>20} │ {:>7} {:>11} │'.format(
|
||||
device_info['index'],
|
||||
device_info['temperature'],
|
||||
device_info['performance_state'],
|
||||
device_info['power'],
|
||||
device_info['memory'],
|
||||
device_info['utilization'],
|
||||
device_info['compute_mode']
|
||||
))
|
||||
else:
|
||||
lines.extend([
|
||||
'│ {:>3} {:>18} {:<4} │ {:<16} {:>3} │ {:>20} │'.format(device.index, name, device.persistence_mode,
|
||||
device.bus_id, device.display_active,
|
||||
device.ecc_errors),
|
||||
'│ {:>3} {:>4} {:>4} {:>12} │ {:>20} │ {:>7} {:>11} │'.format(device.fan_speed, device.temperature,
|
||||
device.performance_state,
|
||||
power, memory,
|
||||
device.utilization, device.compute_mode)
|
||||
rows.extend([
|
||||
'│ {:>3} {:>18} {:<4} │ {:<16} {:>3} │ {:>20} │'.format(
|
||||
device_info['index'],
|
||||
device_info['name'],
|
||||
device_info['persistence_mode'],
|
||||
device_info['bus_id'],
|
||||
device_info['display_active'],
|
||||
device_info['ecc_errors']
|
||||
),
|
||||
'│ {:>3} {:>4} {:>4} {:>12} │ {:>20} │ {:>7} {:>11} │'.format(
|
||||
device_info['fan_speed'],
|
||||
device_info['temperature'],
|
||||
device_info['performance_state'],
|
||||
device_info['power'],
|
||||
device_info['memory'],
|
||||
device_info['utilization'],
|
||||
device_info['compute_mode']
|
||||
)
|
||||
])
|
||||
lines.append('├───────────────────────────────┼──────────────────────┼──────────────────────┤')
|
||||
rows.append('├───────────────────────────────┼──────────────────────┼──────────────────────┤')
|
||||
|
||||
processes.update(device.processes)
|
||||
lines.pop()
|
||||
lines.append('╘═══════════════════════════════╧══════════════════════╧══════════════════════╛')
|
||||
device_processes = device.processes
|
||||
if len(device_processes) > 0:
|
||||
processes.update(device.processes)
|
||||
n_used_devices += 1
|
||||
rows.pop()
|
||||
rows.append('╘═══════════════════════════════╧══════════════════════╧══════════════════════╛')
|
||||
|
||||
lines.extend([
|
||||
rows.extend([
|
||||
' ',
|
||||
'╒═════════════════════════════════════════════════════════════════════════════╕',
|
||||
'│ Processes: │',
|
||||
|
|
@ -233,10 +323,10 @@ class Top(object):
|
|||
|
||||
if len(processes) > 0:
|
||||
processes = sorted(processes.values(), key=lambda proc: (proc.device.index, proc.username(), proc.pid))
|
||||
now_time = datetime.datetime.now()
|
||||
prev_device_index = None
|
||||
for proc in processes:
|
||||
device_index = proc.device.index
|
||||
proc_info = proc.as_dict()
|
||||
device_index = proc_info['device'].index
|
||||
cmdline = proc.cmdline()
|
||||
cmdline[0] = proc.name()
|
||||
cmdline = ' '.join(cmdline).strip()
|
||||
|
|
@ -245,33 +335,57 @@ class Top(object):
|
|||
username = proc.username()
|
||||
if len(username) >= 8:
|
||||
username = username[:6] + '+'
|
||||
running_time = now_time - datetime.datetime.fromtimestamp(proc.create_time())
|
||||
running_time = proc_info['running_time']
|
||||
if running_time.days > 1:
|
||||
running_time = '{} days'.format(running_time.days)
|
||||
else:
|
||||
hours, seconds = divmod(86400 * running_time.days + running_time.seconds, 3600)
|
||||
running_time = '{:02d}:{:02d}:{:02d}'.format(hours, *divmod(seconds, 60))
|
||||
if prev_device_index is not None and prev_device_index != device_index:
|
||||
lines.append('├─────────────────────────────────────────────────────────────────────────────┤')
|
||||
rows.append('├─────────────────────────────────────────────────────────────────────────────┤')
|
||||
prev_device_index = device_index
|
||||
lines.append(
|
||||
rows.append(
|
||||
'│ {:>3} {:>6} {:>7} {:>8} {:>5.1f} {:>5.1f} {:>8} {:<24} │'.format(
|
||||
device_index,
|
||||
proc.pid,
|
||||
username,
|
||||
bytes2human(proc.gpu_memory),
|
||||
proc.cpu_percent(),
|
||||
proc.memory_percent(),
|
||||
bytes2human(proc_info['gpu_memory']),
|
||||
proc_info['cpu_percent'],
|
||||
proc_info['memory_percent'],
|
||||
running_time,
|
||||
cmdline
|
||||
)
|
||||
)
|
||||
else:
|
||||
lines.append('│ No running compute processes found │')
|
||||
rows.append('│ No running compute processes found │')
|
||||
|
||||
lines.append('╘═════════════════════════════════════════════════════════════════════════════╛')
|
||||
rows.append('╘═════════════════════════════════════════════════════════════════════════════╛')
|
||||
|
||||
print('\n'.join(lines))
|
||||
if len(rows) < self.n_rows or termsize != self.termsize:
|
||||
self.win.clear()
|
||||
self.n_rows = len(rows)
|
||||
self.termsize = termsize
|
||||
for y, line in enumerate(rows):
|
||||
try:
|
||||
self.win.addstr(y, 0, line)
|
||||
except curses.error:
|
||||
break
|
||||
self.win.refresh()
|
||||
|
||||
def loop(self):
|
||||
key = -1
|
||||
while True:
|
||||
try:
|
||||
self.redraw()
|
||||
for i in range(10):
|
||||
key = self.win.getch()
|
||||
if key == -1 or key == ord('q'):
|
||||
break
|
||||
curses.flushinp()
|
||||
if key == ord('q'):
|
||||
break
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
|
||||
|
||||
def main():
|
||||
|
|
@ -285,7 +399,7 @@ def main():
|
|||
|
||||
top = Top()
|
||||
|
||||
top.redraw()
|
||||
top.loop()
|
||||
|
||||
nvml.nvmlShutdown()
|
||||
|
||||
|
|
|
|||
|
|
@ -1,2 +1,3 @@
|
|||
psutil
|
||||
cachetools
|
||||
nvidia-ml-py
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue