From a306d69a36cec6d0a1554042d85024708dadb400 Mon Sep 17 00:00:00 2001 From: Kyle Date: Wed, 29 Apr 2026 20:13:04 +0800 Subject: [PATCH 1/3] Add mx-smi MetaX GPU backend --- nvitop/__init__.py | 2 + nvitop/api/__init__.py | 2 + nvitop/api/device.py | 248 +++++++++++- nvitop/api/libmxsmi.py | 467 +++++++++++++++++++++++ nvitop/tui/screens/main/panels/device.py | 11 +- 5 files changed, 707 insertions(+), 23 deletions(-) create mode 100644 nvitop/api/libmxsmi.py diff --git a/nvitop/__init__.py b/nvitop/__init__.py index 333b722..4dc28f2 100644 --- a/nvitop/__init__.py +++ b/nvitop/__init__.py @@ -27,6 +27,7 @@ from nvitop.api import ( host, libcuda, libcudart, + libmxsmi, libnvml, process, termcolor, @@ -46,6 +47,7 @@ for submodule in ( host, libcuda, libcudart, + libmxsmi, libnvml, process, termcolor, diff --git a/nvitop/api/__init__.py b/nvitop/api/__init__.py index ba9479d..1954204 100644 --- a/nvitop/api/__init__.py +++ b/nvitop/api/__init__.py @@ -23,6 +23,7 @@ from nvitop.api import ( host, libcuda, libcudart, + libmxsmi, libnvml, process, termcolor, @@ -69,6 +70,7 @@ __all__ = [ # noqa: RUF022 'NVMLError', 'nvmlCheckReturn', 'libnvml', + 'libmxsmi', 'libcuda', 'libcudart', # nvitop.api.device diff --git a/nvitop/api/device.py b/nvitop/api/device.py index af52be7..74df11d 100644 --- a/nvitop/api/device.py +++ b/nvitop/api/device.py @@ -117,7 +117,7 @@ import time from collections import OrderedDict from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple, overload -from nvitop.api import host, libcuda, libcudart, libnvml +from nvitop.api import host, libcuda, libcudart, libmxsmi, libnvml from nvitop.api.process import GpuProcess from nvitop.api.utils import ( NA, @@ -240,6 +240,38 @@ _VALUE_OMITTED: str = ValueOmitted() # type: ignore[assignment] del ValueOmitted +_ACTIVE_BACKEND: str | None = None +_ACTIVE_BACKEND_LOCK: threading.RLock = threading.RLock() + + +def _set_active_backend(backend: str) -> None: + global _ACTIVE_BACKEND # pylint: disable=global-statement + + with _ACTIVE_BACKEND_LOCK: + _ACTIVE_BACKEND = backend + + +def _get_active_backend() -> str | None: + with _ACTIVE_BACKEND_LOCK: + return _ACTIVE_BACKEND + + +def _should_use_mxsmi_backend() -> bool: + return libmxsmi.is_forced() or _get_active_backend() == 'mx-smi' + + +@contextlib.contextmanager +def _nvml_probe() -> Generator[None]: + suppress_logs = libmxsmi.is_available() + logger_disabled = libnvml.LOGGER.disabled + if suppress_logs: + libnvml.LOGGER.disabled = True + try: + yield + finally: + libnvml.LOGGER.disabled = logger_disabled + + class Device: # pylint: disable=too-many-instance-attributes,too-many-public-methods """Live class of the GPU devices, different from the device snapshots. @@ -333,9 +365,33 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me """Test whether there are any devices and the NVML library is successfully loaded.""" try: return cls.count() > 0 - except libnvml.NVMLError: + except (libnvml.NVMLError, libmxsmi.MxSmiError): return False + @staticmethod + def backend() -> str: + """Return the active GPU query backend.""" + active_backend = _get_active_backend() + if libmxsmi.is_forced(): + return 'mx-smi' + if active_backend is not None: + return active_backend + try: + with _nvml_probe(): + device_count = libnvml.nvmlQuery('nvmlDeviceGetCount', default=0) + if device_count > 0: + _set_active_backend('nvml') + return 'nvml' + except libnvml.NVMLError: + if libmxsmi.is_available(): + _set_active_backend('mx-smi') + return 'mx-smi' + raise + if libmxsmi.is_available(): + _set_active_backend('mx-smi') + return 'mx-smi' + return 'nvml' + @staticmethod def driver_version() -> str | NaType: """The version of the installed NVIDIA display driver. This is an alphanumeric string. @@ -355,7 +411,18 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. """ - return libnvml.nvmlQuery('nvmlSystemGetDriverVersion') + if _should_use_mxsmi_backend(): + return libmxsmi.driver_version() + try: + with _nvml_probe(): + driver_version = libnvml.nvmlQuery('nvmlSystemGetDriverVersion') + except libnvml.NVMLError: + if libmxsmi.is_available(): + _set_active_backend('mx-smi') + return libmxsmi.driver_version() + raise + _set_active_backend('nvml') + return driver_version @staticmethod def cuda_driver_version() -> str | NaType: @@ -375,7 +442,17 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. """ - cuda_driver_version = libnvml.nvmlQuery('nvmlSystemGetCudaDriverVersion') + if _should_use_mxsmi_backend(): + return libmxsmi.maca_version() + try: + with _nvml_probe(): + cuda_driver_version = libnvml.nvmlQuery('nvmlSystemGetCudaDriverVersion') + except libnvml.NVMLError: + if libmxsmi.is_available(): + _set_active_backend('mx-smi') + return libmxsmi.maca_version() + raise + _set_active_backend('nvml') if libnvml.nvmlCheckReturn(cuda_driver_version, int): major = cuda_driver_version // 1000 minor = (cuda_driver_version % 1000) // 10 @@ -423,7 +500,22 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. """ - return libnvml.nvmlQuery('nvmlDeviceGetCount', default=0) + if _should_use_mxsmi_backend(): + return libmxsmi.device_count() + try: + with _nvml_probe(): + count = libnvml.nvmlQuery('nvmlDeviceGetCount', default=0) + except libnvml.NVMLError: + if libmxsmi.is_available(): + _set_active_backend('mx-smi') + return libmxsmi.device_count() + raise + if count == 0 and libmxsmi.is_available(): + _set_active_backend('mx-smi') + return libmxsmi.device_count() + if count > 0: + _set_active_backend('nvml') + return count @classmethod def all(cls) -> list[PhysicalDevice]: @@ -700,36 +792,50 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me self._is_mig_device: bool | None = None self._cuda_index: int | None = None self._cuda_compute_capability: tuple[int, int] | NaType | None = None + self._backend: str = 'nvml' self._handle: libnvml.c_nvmlDevice_t | None - if index is not None: + if _should_use_mxsmi_backend(): + self._init_mxsmi(index=index, uuid=uuid, bus_id=bus_id) + elif index is not None: self._nvml_index = index # type: ignore[assignment] try: - self._handle = libnvml.nvmlQuery( - 'nvmlDeviceGetHandleByIndex', - index, - ignore_errors=False, - ) + with _nvml_probe(): + self._handle = libnvml.nvmlQuery( + 'nvmlDeviceGetHandleByIndex', + index, + ignore_errors=False, + ) except libnvml.NVMLError_GpuIsLost: self._handle = None self._name = 'ERROR: GPU is Lost' except libnvml.NVMLError_Unknown: self._handle = None self._name = 'ERROR: Unknown' + except libnvml.NVMLError: + if libmxsmi.is_available(): + _set_active_backend('mx-smi') + self._init_mxsmi(index=index) + else: + raise + else: + _set_active_backend('nvml') else: try: if uuid is not None: - self._handle = libnvml.nvmlQuery( - 'nvmlDeviceGetHandleByUUID', - uuid, - ignore_errors=False, - ) + with _nvml_probe(): + self._handle = libnvml.nvmlQuery( + 'nvmlDeviceGetHandleByUUID', + uuid, + ignore_errors=False, + ) else: - self._handle = libnvml.nvmlQuery( - 'nvmlDeviceGetHandleByPciBusId', - bus_id, - ignore_errors=False, - ) + with _nvml_probe(): + self._handle = libnvml.nvmlQuery( + 'nvmlDeviceGetHandleByPciBusId', + bus_id, + ignore_errors=False, + ) except libnvml.NVMLError_GpuIsLost: self._handle = None self._nvml_index = NA # type: ignore[assignment] @@ -738,7 +844,14 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me self._handle = None self._nvml_index = NA # type: ignore[assignment] self._name = 'ERROR: Unknown' + except libnvml.NVMLError: + if libmxsmi.is_available(): + _set_active_backend('mx-smi') + self._init_mxsmi(uuid=uuid, bus_id=bus_id) + else: + raise else: + _set_active_backend('nvml') self._nvml_index = libnvml.nvmlQuery('nvmlDeviceGetIndex', self._handle) self._max_clock_infos: ClockInfos = ClockInfos(graphics=NA, sm=NA, memory=NA, video=NA) @@ -747,6 +860,36 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me self._ident: tuple[Hashable, str] = (self.index, self.uuid()) self._hash: int | None = None + def _init_mxsmi( + self, + *, + index: int | tuple[int, int] | bytes | None = None, + uuid: bytes | None = None, + bus_id: bytes | None = None, + ) -> None: + """Initialize this device from the MetaX ``mx-smi`` backend.""" + if isinstance(index, tuple): + raise libnvml.NVMLError_NotSupported + try: + info = libmxsmi.get_device(index=index, uuid=uuid, bus_id=bus_id) + except libmxsmi.MxSmiDeviceNotFound as ex: + raise libnvml.NVMLError_NotFound from ex + + _set_active_backend('mx-smi') + self._backend = 'mx-smi' + self._handle = None + self._nvml_index = info.index + self._name = info.name + self._uuid = info.uuid + self._bus_id = info.bus_id + self._memory_total = info.memory_total + + def _is_mxsmi_device(self) -> bool: + return self._backend == 'mx-smi' + + def _mxsmi_info(self) -> libmxsmi.DeviceInfo: + return libmxsmi.get_device(index=self.physical_index) + def __repr__(self) -> str: """Return a string representation of the device.""" return '{}(index={}, name={!r}, total_memory={})'.format( # noqa: UP032 @@ -904,6 +1047,9 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=name """ + if self._is_mxsmi_device(): + self._name = self._mxsmi_info().name + return self._name if self._handle is not None and self._name is NA: self._name = libnvml.nvmlQuery('nvmlDeviceGetName', self._handle) return self._name @@ -922,6 +1068,9 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=name """ + if self._is_mxsmi_device(): + self._uuid = self._mxsmi_info().uuid + return self._uuid if self._handle is not None and self._uuid is NA: self._uuid = libnvml.nvmlQuery('nvmlDeviceGetUUID', self._handle) return self._uuid @@ -938,6 +1087,9 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=pci.bus_id """ + if self._is_mxsmi_device(): + self._bus_id = self._mxsmi_info().bus_id + return self._bus_id if self._handle is not None and self._bus_id is NA: self._bus_id = libnvml.nvmlQuery( lambda handle: libnvml.nvmlDeviceGetPciInfo(handle).busId, @@ -959,6 +1111,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=serial """ + if self._is_mxsmi_device(): + return NA if self._handle is not None: return libnvml.nvmlQuery('nvmlDeviceGetSerial', self._handle) return NA @@ -970,6 +1124,14 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: MemoryInfo(total, free, used, reserved) A named tuple with memory information, the item could be :const:`nvitop.NA` when not applicable. """ + if self._is_mxsmi_device(): + info = self._mxsmi_info() + return MemoryInfo( + total=info.memory_total, + free=info.memory_free, + used=info.memory_used, + reserved=NA, + ) if self._handle is not None: has_unified_memory = False try: @@ -1179,6 +1341,15 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: UtilizationRates(gpu, memory, encoder, decoder) A named tuple with GPU utilization rates (in percentage) for the device, the item could be :const:`nvitop.NA` when not applicable. """ # pylint: disable=line-too-long + if self._is_mxsmi_device(): + info = self._mxsmi_info() + return UtilizationRates( + gpu=info.gpu_utilization, + memory=info.memory_utilization, + encoder=NA, + decoder=NA, + ) + gpu, memory, encoder, decoder = NA, NA, NA, NA if self._handle is not None: @@ -1449,6 +1620,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=fan.speed """ # pylint: disable=line-too-long + if self._is_mxsmi_device(): + return self._mxsmi_info().fan_speed if self._handle is not None: return libnvml.nvmlQuery('nvmlDeviceGetFanSpeed', self._handle) return NA @@ -1465,6 +1638,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=temperature.gpu """ + if self._is_mxsmi_device(): + return self._mxsmi_info().temperature if self._handle is not None: return libnvml.nvmlQuery( 'nvmlDeviceGetTemperature', @@ -1486,6 +1661,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me $(( "$(nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=power.draw)" * 1000 )) """ + if self._is_mxsmi_device(): + return self._mxsmi_info().power_usage if self._handle is not None: return libnvml.nvmlQuery('nvmlDeviceGetPowerUsage', self._handle) return NA @@ -1507,6 +1684,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me $(( "$(nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=power.limit)" * 1000 )) """ + if self._is_mxsmi_device(): + return self._mxsmi_info().power_limit if self._handle is not None: return libnvml.nvmlQuery('nvmlDeviceGetPowerManagementLimit', self._handle) return NA @@ -1547,6 +1726,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: Union[int, NaType] The current PCIe transmit throughput in KiB/s, or :const:`nvitop.NA` when not applicable. """ + if self._is_mxsmi_device(): + return NA if self._handle is not None: return libnvml.nvmlQuery( 'nvmlDeviceGetPcieThroughput', @@ -1565,6 +1746,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: Union[int, NaType] The current PCIe receive throughput in KiB/s, or :const:`nvitop.NA` when not applicable. """ + if self._is_mxsmi_device(): + return NA if self._handle is not None: return libnvml.nvmlQuery( 'nvmlDeviceGetPcieThroughput', @@ -2131,6 +2314,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=persistence_mode """ # pylint: disable=line-too-long + if self._is_mxsmi_device(): + return self._mxsmi_info().persistence_mode if self._handle is not None: return { 0: 'Disabled', @@ -2150,6 +2335,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=pstate """ # pylint: disable=line-too-long + if self._is_mxsmi_device(): + return self._mxsmi_info().performance_state if self._handle is not None: performance_state = libnvml.nvmlQuery('nvmlDeviceGetPerformanceState', self._handle) if libnvml.nvmlCheckReturn(performance_state, int): @@ -2194,6 +2381,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=compute_mode """ # pylint: disable=line-too-long + if self._is_mxsmi_device(): + return 'Default' if self._handle is not None: return { libnvml.NVML_COMPUTEMODE_DEFAULT: 'Default', @@ -2215,6 +2404,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=compute_cap """ + if self._is_mxsmi_device(): + return NA if self._handle is not None: if self._cuda_compute_capability is None: self._cuda_compute_capability = libnvml.nvmlQuery( @@ -2226,6 +2417,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me def is_mig_device(self) -> bool: """Return whether or not the device is a MIG device.""" + if self._is_mxsmi_device(): + return False if self._handle is not None: if self._is_mig_device is None: is_mig_device = libnvml.nvmlQuery( @@ -2253,6 +2446,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=mig.mode.current """ + if self._is_mxsmi_device(): + return NA if self._handle is None: return NA if self.is_mig_device(): @@ -2313,6 +2508,17 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: Dict[int, GpuProcess] A dictionary mapping PID to GPU process instance. """ + if self._is_mxsmi_device(): + processes = {} + for process in libmxsmi.processes(self.physical_index): + processes[process.pid] = self.GPU_PROCESS_CLASS( + pid=process.pid, + device=self, + gpu_memory=process.used_memory, + type='C', + ) + return processes + if self._handle is None: return {} diff --git a/nvitop/api/libmxsmi.py b/nvitop/api/libmxsmi.py new file mode 100644 index 0000000..2b1856e --- /dev/null +++ b/nvitop/api/libmxsmi.py @@ -0,0 +1,467 @@ +# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. +# +# Copyright 2021-2025 Xuehai Pan. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utilities for querying MetaX GPUs through ``mx-smi``.""" + +from __future__ import annotations + +import os +import re +import shutil +import subprocess +import threading +import time +from dataclasses import dataclass, replace + +from nvitop.api.utils import MiB, NA, NaType + + +__all__ = [ + 'DeviceInfo', + 'MxSmiError', + 'MxSmiDeviceNotFound', + 'MxSmiNotFound', + 'MxSmiSnapshot', + 'ProcessInfo', + 'clear_cache', + 'device_count', + 'driver_version', + 'get_device', + 'is_available', + 'is_forced', + 'maca_version', + 'processes', + 'snapshot', +] + + +@dataclass(frozen=True) +class DeviceInfo: + """MetaX GPU device information collected from ``mx-smi``.""" + + index: int + name: str | NaType = NA + uuid: str | NaType = NA + bus_id: str | NaType = NA + state: str | NaType = NA + persistence_mode: str | NaType = NA + performance_state: str | NaType = NA + memory_total: int | NaType = NA + memory_used: int | NaType = NA + memory_free: int | NaType = NA + gpu_utilization: int | NaType = NA + memory_utilization: int | NaType = NA + temperature: int | NaType = NA + power_usage: int | NaType = NA + power_limit: int | NaType = NA + fan_speed: int | NaType = NA + + +@dataclass(frozen=True) +class ProcessInfo: + """MetaX GPU process information collected from ``mx-smi``.""" + + gpu_index: int + pid: int + name: str | NaType = NA + used_memory: int | NaType = NA + + +@dataclass(frozen=True) +class MxSmiSnapshot: + """A single ``mx-smi`` sample.""" + + devices: dict[int, DeviceInfo] + processes: list[ProcessInfo] + driver_version: str | NaType = NA + maca_version: str | NaType = NA + mxsmi_version: str | NaType = NA + + +class MxSmiError(RuntimeError): + """Base exception for ``mx-smi`` query errors.""" + + +class MxSmiNotFound(MxSmiError): + """Raised when the ``mx-smi`` executable is not available.""" + + +class MxSmiDeviceNotFound(MxSmiError): + """Raised when a MetaX GPU device cannot be found.""" + + +_BACKEND_ENVVAR = 'NVITOP_GPU_BACKEND' +_CACHE_TTL = 0.25 +_CACHE_LOCK = threading.RLock() +_CACHE: MxSmiSnapshot | None = None +_CACHE_EXPIRES_AT = 0.0 + +_LIST_RE = re.compile( + r'^GPU#(?P\d+)\s+' + r'(?P.+?)\s+' + r'(?P[0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.[0-9a-fA-F])\s+' + r'(?P.*?)\s+' + r'\(UUID:\s*(?P[^)]+)\)\s*$', +) +_MXSMI_VERSION_RE = re.compile(r'\bmx-smi\s+version:\s*(?P\S+)', flags=re.IGNORECASE) +_DRIVER_VERSION_RE = re.compile(r'Kernel Mode Driver Version:\s*(?P[^\s|]+)') +_MACA_VERSION_RE = re.compile(r'MACA Version:\s*(?P[^\s|]+)') +_SUMMARY_FIRST_RE = re.compile( + r'^(?P\d+)\s+(?P.+?)\s+(?POn|Off|Enable|Disable|Enabled|Disabled)\s*$', +) +_GPU_UTIL_RE = re.compile(r'(?P\d+(?:\.\d+)?)\s*%') +_SUMMARY_SECOND_RE = re.compile( + r'^(?P\d+(?:\.\d+)?)C\s+' + r'(?P\d+(?:\.\d+)?)W\s*/\s*' + r'(?P\d+(?:\.\d+)?)W\s+' + r'(?P\S+)', +) +_MEMORY_RE = re.compile( + r'(?P\d+(?:\.\d+)?)\s*/\s*(?P\d+(?:\.\d+)?)\s*MiB', + flags=re.IGNORECASE, +) +_PROCESS_RE = re.compile( + r'^\|\s*(?P\d+)\s+' + r'(?P\d+)\s+' + r'(?P.*?)\s+' + r'(?P\d+(?:\.\d+)?|N/A)\s*\|\s*$', +) + + +def is_forced() -> bool: + """Return whether the MetaX backend was explicitly requested.""" + backend = os.getenv(_BACKEND_ENVVAR, default='').strip().lower().replace('_', '-') + return backend in {'mx-smi', 'mxsmi', 'metax'} + + +def is_available() -> bool: + """Return whether ``mx-smi`` can see at least one MetaX GPU.""" + if shutil.which('mx-smi') is None: + return False + try: + return device_count() > 0 + except MxSmiError: + return False + + +def device_count() -> int: + """Return the number of MetaX GPUs visible to ``mx-smi``.""" + return len(snapshot().devices) + + +def driver_version() -> str | NaType: + """Return the MetaX kernel mode driver version.""" + return snapshot().driver_version + + +def maca_version() -> str | NaType: + """Return the MACA runtime version reported by ``mx-smi``.""" + return snapshot().maca_version + + +def get_device( + *, + index: int | bytes | None = None, + uuid: str | bytes | None = None, + bus_id: str | bytes | None = None, +) -> DeviceInfo: + """Return a MetaX device by index, UUID, or PCI bus ID.""" + if sum(arg is not None for arg in (index, uuid, bus_id)) != 1: + raise TypeError('get_device() expects exactly one identifier.') + + devices = snapshot().devices + if index is not None: + try: + return devices[int(index)] + except (KeyError, TypeError, ValueError) as ex: + raise MxSmiDeviceNotFound(f'MetaX GPU index {index!r} was not found.') from ex + + identifier = _normalize_identifier(uuid if uuid is not None else bus_id) + for device in devices.values(): + if identifier in {_normalize_identifier(device.uuid), _normalize_identifier(device.bus_id)}: + return device + + raise MxSmiDeviceNotFound(f'MetaX GPU {identifier!r} was not found.') + + +def processes(index: int) -> list[ProcessInfo]: + """Return processes reported by ``mx-smi`` for the given GPU index.""" + return [process for process in snapshot().processes if process.gpu_index == index] + + +def snapshot(*, ttl: float = _CACHE_TTL) -> MxSmiSnapshot: + """Take or return a cached ``mx-smi`` snapshot.""" + global _CACHE, _CACHE_EXPIRES_AT # pylint: disable=global-statement + + now = time.monotonic() + with _CACHE_LOCK: + if _CACHE is not None and now < _CACHE_EXPIRES_AT: + return _CACHE + + current = _take_snapshot() + + with _CACHE_LOCK: + _CACHE = current + _CACHE_EXPIRES_AT = time.monotonic() + ttl + return _CACHE + + +def clear_cache() -> None: + """Clear the cached ``mx-smi`` snapshot.""" + global _CACHE, _CACHE_EXPIRES_AT # pylint: disable=global-statement + + with _CACHE_LOCK: + _CACHE = None + _CACHE_EXPIRES_AT = 0.0 + + +def _take_snapshot() -> MxSmiSnapshot: + listed_devices, listed_mxsmi_version = _parse_list_output(_run_mxsmi('-L')) + summary = _parse_summary_output(_run_mxsmi()) + + devices = listed_devices.copy() + for index, device in summary.devices.items(): + base = devices.get(index, DeviceInfo(index=index)) + devices[index] = replace( + base, + name=device.name if device.name is not NA else base.name, + bus_id=device.bus_id if device.bus_id is not NA else base.bus_id, + state=device.state if device.state is not NA else base.state, + persistence_mode=( + device.persistence_mode + if device.persistence_mode is not NA + else base.persistence_mode + ), + performance_state=( + device.performance_state + if device.performance_state is not NA + else base.performance_state + ), + memory_total=device.memory_total, + memory_used=device.memory_used, + memory_free=device.memory_free, + gpu_utilization=device.gpu_utilization, + memory_utilization=device.memory_utilization, + temperature=device.temperature, + power_usage=device.power_usage, + power_limit=device.power_limit, + fan_speed=device.fan_speed, + ) + + return MxSmiSnapshot( + devices=devices, + processes=summary.processes, + driver_version=summary.driver_version, + maca_version=summary.maca_version, + mxsmi_version=summary.mxsmi_version if summary.mxsmi_version is not NA else listed_mxsmi_version, + ) + + +def _run_mxsmi(*args: str) -> str: + executable = shutil.which('mx-smi') + if executable is None: + raise MxSmiNotFound('The `mx-smi` executable was not found.') + + command = [executable, *args] + try: + completed = subprocess.run( # noqa: S603 + command, + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + encoding='utf-8', + errors='replace', + timeout=10.0, + ) + except (OSError, subprocess.SubprocessError) as ex: + raise MxSmiError(f'Failed to run `{_command_to_string(command)}`.') from ex + + if completed.returncode != 0: + output = completed.stdout.strip() + message = f'`{_command_to_string(command)}` exited with status {completed.returncode}.' + if output: + message = f'{message}\n{output}' + raise MxSmiError(message) + + return completed.stdout + + +def _parse_list_output(output: str) -> tuple[dict[int, DeviceInfo], str | NaType]: + devices: dict[int, DeviceInfo] = {} + mxsmi_version: str | NaType = NA + + for line in output.splitlines(): + version_match = _MXSMI_VERSION_RE.search(line) + if version_match is not None: + mxsmi_version = version_match.group('version') + continue + + match = _LIST_RE.match(line.strip()) + if match is None: + continue + + index = int(match.group('index')) + devices[index] = DeviceInfo( + index=index, + name=match.group('name').strip(), + uuid=match.group('uuid').strip(), + bus_id=match.group('bus_id').strip(), + state=match.group('state').strip() or NA, + ) + + return devices, mxsmi_version + + +def _parse_summary_output(output: str) -> MxSmiSnapshot: + devices: dict[int, DeviceInfo] = {} + processes: list[ProcessInfo] = [] + driver_version: str | NaType = NA + maca_version: str | NaType = NA + mxsmi_version: str | NaType = NA + lines = output.splitlines() + + for lineno, line in enumerate(lines): + version_match = _MXSMI_VERSION_RE.search(line) + if version_match is not None: + mxsmi_version = version_match.group('version') + + driver_match = _DRIVER_VERSION_RE.search(line) + if driver_match is not None: + driver_version = driver_match.group('version') + + maca_match = _MACA_VERSION_RE.search(line) + if maca_match is not None: + maca_version = maca_match.group('version') + + parts = _split_table_line(line) + if len(parts) != 3: + continue + + first_match = _SUMMARY_FIRST_RE.match(parts[0]) + if first_match is None: + continue + + try: + next_parts = _split_table_line(lines[lineno + 1]) + except IndexError: + continue + if len(next_parts) != 3: + continue + + second_match = _SUMMARY_SECOND_RE.match(next_parts[0]) + memory_match = _MEMORY_RE.search(next_parts[1]) + if second_match is None or memory_match is None: + continue + + index = int(first_match.group('index')) + memory_used = _mib_to_bytes(memory_match.group('used')) + memory_total = _mib_to_bytes(memory_match.group('total')) + memory_free = ( + memory_total - memory_used + if isinstance(memory_total, int) and isinstance(memory_used, int) + else NA + ) + devices[index] = DeviceInfo( + index=index, + name=first_match.group('name').strip(), + bus_id=parts[1].strip(), + state=next_parts[2].strip() or NA, + persistence_mode=_normalize_mode(first_match.group('persistence')), + performance_state=second_match.group('performance_state'), + memory_total=memory_total, + memory_used=memory_used, + memory_free=memory_free, + gpu_utilization=_percent_to_int(parts[2]), + temperature=round(float(second_match.group('temperature'))), + power_usage=_watts_to_milliwatts(second_match.group('power_usage')), + power_limit=_watts_to_milliwatts(second_match.group('power_limit')), + ) + + in_process_table = False + for line in lines: + if '| Process:' in line: + in_process_table = True + continue + if not in_process_table: + continue + if 'no process found' in line.lower(): + continue + + process_match = _PROCESS_RE.match(line) + if process_match is None: + continue + + processes.append( + ProcessInfo( + gpu_index=int(process_match.group('gpu_index')), + pid=int(process_match.group('pid')), + name=process_match.group('name').strip() or NA, + used_memory=_mib_to_bytes(process_match.group('used_memory')), + ), + ) + + return MxSmiSnapshot( + devices=devices, + processes=processes, + driver_version=driver_version, + maca_version=maca_version, + mxsmi_version=mxsmi_version, + ) + + +def _split_table_line(line: str) -> list[str]: + if not line.startswith('|'): + return [] + return [part.strip() for part in line.strip().strip('|').split('|')] + + +def _mib_to_bytes(value: str) -> int | NaType: + if value.upper() == 'N/A': + return NA + return round(float(value) * MiB) + + +def _watts_to_milliwatts(value: str) -> int: + return round(float(value) * 1000) + + +def _percent_to_int(value: str) -> int | NaType: + match = _GPU_UTIL_RE.search(value) + if match is None: + return NA + return round(float(match.group('util'))) + + +def _normalize_mode(value: str) -> str | NaType: + normalized = value.strip().lower() + if normalized in {'on', 'enable', 'enabled'}: + return 'Enabled' + if normalized in {'off', 'disable', 'disabled'}: + return 'Disabled' + return NA + + +def _normalize_identifier(value: str | bytes | NaType | None) -> str: + if isinstance(value, bytes): + value = value.decode('utf-8', errors='replace') + if value is None or value is NA: + return '' + return str(value).strip().lower() + + +def _command_to_string(command: list[str]) -> str: + return ' '.join(command) diff --git a/nvitop/tui/screens/main/panels/device.py b/nvitop/tui/screens/main/panels/device.py index f5d1161..365ed76 100644 --- a/nvitop/tui/screens/main/panels/device.py +++ b/nvitop/tui/screens/main/panels/device.py @@ -79,8 +79,15 @@ class DevicePanel(BasePanel): # pylint: disable=too-many-instance-attributes if self.device_count == 0: self.height = self.full_height = self.compact_height = 6 + self.backend: str = Device.backend() self.driver_version: str = Device.driver_version() self.cuda_driver_version: str = Device.cuda_driver_version() + self.driver_version_label: str = ( + 'KMD Version' if self.backend == 'mx-smi' else 'Driver Version' + ) + self.cuda_driver_version_label: str = ( + 'MACA Version' if self.backend == 'mx-smi' else 'CUDA Driver Version' + ) self._snapshot_buffer: list[Snapshot] = [] self._snapshots: list[Snapshot] = [] @@ -226,8 +233,8 @@ class DevicePanel(BasePanel): # pylint: disable=too-many-instance-attributes version_infos = [ 'NVITOP {}'.format(__version__.partition('+')[0]), - f'Driver Version: {self.driver_version}', - f'CUDA Driver Version: {self.cuda_driver_version}', + f'{self.driver_version_label}: {self.driver_version}', + f'{self.cuda_driver_version_label}: {self.cuda_driver_version}', ] if sum(len(v) for v in version_infos) % 2 == 0: version_infos[0] += ' ' From dd9aeb7bcac7f8df3cfd3b7f3682529ab77f4b34 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 29 Apr 2026 13:20:50 +0000 Subject: [PATCH 2/3] libmxsmi: cache mx-smi -L inventory separately with 60s TTL Agent-Logs-Url: https://github.com/mhson-kyle/nvitop/sessions/9e63a25a-5033-4588-bfdd-3fb0d64c9d9f Co-authored-by: mhson-kyle <72399227+mhson-kyle@users.noreply.github.com> --- nvitop/api/libmxsmi.py | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/nvitop/api/libmxsmi.py b/nvitop/api/libmxsmi.py index 2b1856e..079aee5 100644 --- a/nvitop/api/libmxsmi.py +++ b/nvitop/api/libmxsmi.py @@ -109,6 +109,15 @@ _CACHE_LOCK = threading.RLock() _CACHE: MxSmiSnapshot | None = None _CACHE_EXPIRES_AT = 0.0 +# Inventory data (UUID / name / bus_id) from ``mx-smi -L`` changes very rarely, +# so we keep it in a separate cache with a much longer TTL to avoid spawning an +# extra subprocess on every 0.25 s snapshot refresh. +_LIST_CACHE_TTL = 60.0 +_LIST_CACHE_LOCK = threading.RLock() +_LIST_CACHE: dict[int, DeviceInfo] | None = None +_LIST_CACHE_VERSION: str | NaType = NA +_LIST_CACHE_EXPIRES_AT = 0.0 + _LIST_RE = re.compile( r'^GPU#(?P\d+)\s+' r'(?P.+?)\s+' @@ -220,16 +229,40 @@ def snapshot(*, ttl: float = _CACHE_TTL) -> MxSmiSnapshot: def clear_cache() -> None: - """Clear the cached ``mx-smi`` snapshot.""" + """Clear the cached ``mx-smi`` snapshot and device inventory.""" global _CACHE, _CACHE_EXPIRES_AT # pylint: disable=global-statement + global _LIST_CACHE, _LIST_CACHE_VERSION, _LIST_CACHE_EXPIRES_AT # pylint: disable=global-statement with _CACHE_LOCK: _CACHE = None _CACHE_EXPIRES_AT = 0.0 + with _LIST_CACHE_LOCK: + _LIST_CACHE = None + _LIST_CACHE_VERSION = NA + _LIST_CACHE_EXPIRES_AT = 0.0 + + +def _get_inventory_cache() -> tuple[dict[int, DeviceInfo], str | NaType]: + """Return the cached ``mx-smi -L`` inventory, refreshing when stale.""" + global _LIST_CACHE, _LIST_CACHE_VERSION, _LIST_CACHE_EXPIRES_AT # pylint: disable=global-statement + + now = time.monotonic() + with _LIST_CACHE_LOCK: + if _LIST_CACHE is not None and now < _LIST_CACHE_EXPIRES_AT: + return _LIST_CACHE, _LIST_CACHE_VERSION + + listed_devices, mxsmi_version = _parse_list_output(_run_mxsmi('-L')) + + with _LIST_CACHE_LOCK: + _LIST_CACHE = listed_devices + _LIST_CACHE_VERSION = mxsmi_version + _LIST_CACHE_EXPIRES_AT = time.monotonic() + _LIST_CACHE_TTL + return _LIST_CACHE, _LIST_CACHE_VERSION + def _take_snapshot() -> MxSmiSnapshot: - listed_devices, listed_mxsmi_version = _parse_list_output(_run_mxsmi('-L')) + listed_devices, listed_mxsmi_version = _get_inventory_cache() summary = _parse_summary_output(_run_mxsmi()) devices = listed_devices.copy() From 7336642d29919938e579a377a1cf5d14215cb3e1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 29 Apr 2026 13:24:25 +0000 Subject: [PATCH 3/3] device: replace is_available() in _nvml_probe() with shutil.which check Agent-Logs-Url: https://github.com/mhson-kyle/nvitop/sessions/e5fd1e19-5d52-4ab0-ac60-5b545ffb9632 Co-authored-by: mhson-kyle <72399227+mhson-kyle@users.noreply.github.com> --- nvitop/api/device.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nvitop/api/device.py b/nvitop/api/device.py index 74df11d..e2b678d 100644 --- a/nvitop/api/device.py +++ b/nvitop/api/device.py @@ -109,6 +109,7 @@ import functools import multiprocessing as mp import os import re +import shutil import subprocess import sys import textwrap @@ -262,7 +263,7 @@ def _should_use_mxsmi_backend() -> bool: @contextlib.contextmanager def _nvml_probe() -> Generator[None]: - suppress_logs = libmxsmi.is_available() + suppress_logs = libmxsmi.is_forced() or shutil.which('mx-smi') is not None logger_disabled = libnvml.LOGGER.disabled if suppress_logs: libnvml.LOGGER.disabled = True