Add mx-smi MetaX GPU backend

Author: Kyle (2026-04-29 20:13:04 +08:00)
parent a6761eb5c4
commit a306d69a36
5 changed files with 707 additions and 23 deletions

View file

@ -27,6 +27,7 @@ from nvitop.api import (
host,
libcuda,
libcudart,
libmxsmi,
libnvml,
process,
termcolor,
@ -46,6 +47,7 @@ for submodule in (
host,
libcuda,
libcudart,
libmxsmi,
libnvml,
process,
termcolor,

View file

@ -23,6 +23,7 @@ from nvitop.api import (
host,
libcuda,
libcudart,
libmxsmi,
libnvml,
process,
termcolor,
@ -69,6 +70,7 @@ __all__ = [ # noqa: RUF022
'NVMLError',
'nvmlCheckReturn',
'libnvml',
'libmxsmi',
'libcuda',
'libcudart',
# nvitop.api.device

View file

@ -117,7 +117,7 @@ import time
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple, overload
from nvitop.api import host, libcuda, libcudart, libnvml
from nvitop.api import host, libcuda, libcudart, libmxsmi, libnvml
from nvitop.api.process import GpuProcess
from nvitop.api.utils import (
NA,
@ -240,6 +240,38 @@ _VALUE_OMITTED: str = ValueOmitted() # type: ignore[assignment]
del ValueOmitted
_ACTIVE_BACKEND: str | None = None
_ACTIVE_BACKEND_LOCK: threading.RLock = threading.RLock()
def _set_active_backend(backend: str) -> None:
global _ACTIVE_BACKEND # pylint: disable=global-statement
with _ACTIVE_BACKEND_LOCK:
_ACTIVE_BACKEND = backend
def _get_active_backend() -> str | None:
with _ACTIVE_BACKEND_LOCK:
return _ACTIVE_BACKEND
def _should_use_mxsmi_backend() -> bool:
return libmxsmi.is_forced() or _get_active_backend() == 'mx-smi'
@contextlib.contextmanager
def _nvml_probe() -> Generator[None]:
suppress_logs = libmxsmi.is_available()
logger_disabled = libnvml.LOGGER.disabled
if suppress_logs:
libnvml.LOGGER.disabled = True
try:
yield
finally:
libnvml.LOGGER.disabled = logger_disabled
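# `_nvml_probe()` is wrapped around the NVML calls below so that, on hosts
# where `mx-smi` is available, a failed NVML load stays silent instead of
# logging an error right before the mx-smi fallback takes over.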
class Device: # pylint: disable=too-many-instance-attributes,too-many-public-methods
"""Live class of the GPU devices, different from the device snapshots.
@ -333,9 +365,33 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
"""Test whether there are any devices and the NVML library is successfully loaded."""
try:
return cls.count() > 0
except libnvml.NVMLError:
except (libnvml.NVMLError, libmxsmi.MxSmiError):
return False
@staticmethod
def backend() -> str:
"""Return the active GPU query backend."""
active_backend = _get_active_backend()
if libmxsmi.is_forced():
return 'mx-smi'
if active_backend is not None:
return active_backend
try:
with _nvml_probe():
device_count = libnvml.nvmlQuery('nvmlDeviceGetCount', default=0)
if device_count > 0:
_set_active_backend('nvml')
return 'nvml'
except libnvml.NVMLError:
if libmxsmi.is_available():
_set_active_backend('mx-smi')
return 'mx-smi'
raise
if libmxsmi.is_available():
_set_active_backend('mx-smi')
return 'mx-smi'
return 'nvml'
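# e.g. Device.backend() returns 'mx-smi' on a MetaX-only host and 'nvml'
# when NVML reports at least one device; the answer is cached in
# _ACTIVE_BACKEND, so later calls skip the probe entirely.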
@staticmethod
def driver_version() -> str | NaType:
"""The version of the installed NVIDIA display driver. This is an alphanumeric string.
@ -355,7 +411,18 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
driver without reloading the kernel module.
"""
return libnvml.nvmlQuery('nvmlSystemGetDriverVersion')
if _should_use_mxsmi_backend():
return libmxsmi.driver_version()
try:
with _nvml_probe():
driver_version = libnvml.nvmlQuery('nvmlSystemGetDriverVersion')
except libnvml.NVMLError:
if libmxsmi.is_available():
_set_active_backend('mx-smi')
return libmxsmi.driver_version()
raise
_set_active_backend('nvml')
return driver_version
@staticmethod
def cuda_driver_version() -> str | NaType:
@ -375,7 +442,17 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
driver without reloading the kernel module.
"""
cuda_driver_version = libnvml.nvmlQuery('nvmlSystemGetCudaDriverVersion')
if _should_use_mxsmi_backend():
return libmxsmi.maca_version()
try:
with _nvml_probe():
cuda_driver_version = libnvml.nvmlQuery('nvmlSystemGetCudaDriverVersion')
except libnvml.NVMLError:
if libmxsmi.is_available():
_set_active_backend('mx-smi')
return libmxsmi.maca_version()
raise
_set_active_backend('nvml')
if libnvml.nvmlCheckReturn(cuda_driver_version, int):
major = cuda_driver_version // 1000
minor = (cuda_driver_version % 1000) // 10
@ -423,7 +500,22 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
driver without reloading the kernel module.
"""
return libnvml.nvmlQuery('nvmlDeviceGetCount', default=0)
if _should_use_mxsmi_backend():
return libmxsmi.device_count()
try:
with _nvml_probe():
count = libnvml.nvmlQuery('nvmlDeviceGetCount', default=0)
except libnvml.NVMLError:
if libmxsmi.is_available():
_set_active_backend('mx-smi')
return libmxsmi.device_count()
raise
if count == 0 and libmxsmi.is_available():
_set_active_backend('mx-smi')
return libmxsmi.device_count()
if count > 0:
_set_active_backend('nvml')
return count
@classmethod
def all(cls) -> list[PhysicalDevice]:
@ -700,36 +792,50 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
self._is_mig_device: bool | None = None
self._cuda_index: int | None = None
self._cuda_compute_capability: tuple[int, int] | NaType | None = None
self._backend: str = 'nvml'
self._handle: libnvml.c_nvmlDevice_t | None
if index is not None:
if _should_use_mxsmi_backend():
self._init_mxsmi(index=index, uuid=uuid, bus_id=bus_id)
elif index is not None:
self._nvml_index = index # type: ignore[assignment]
try:
self._handle = libnvml.nvmlQuery(
'nvmlDeviceGetHandleByIndex',
index,
ignore_errors=False,
)
with _nvml_probe():
self._handle = libnvml.nvmlQuery(
'nvmlDeviceGetHandleByIndex',
index,
ignore_errors=False,
)
except libnvml.NVMLError_GpuIsLost:
self._handle = None
self._name = 'ERROR: GPU is Lost'
except libnvml.NVMLError_Unknown:
self._handle = None
self._name = 'ERROR: Unknown'
except libnvml.NVMLError:
if libmxsmi.is_available():
_set_active_backend('mx-smi')
self._init_mxsmi(index=index)
else:
raise
else:
_set_active_backend('nvml')
else:
try:
if uuid is not None:
self._handle = libnvml.nvmlQuery(
'nvmlDeviceGetHandleByUUID',
uuid,
ignore_errors=False,
)
with _nvml_probe():
self._handle = libnvml.nvmlQuery(
'nvmlDeviceGetHandleByUUID',
uuid,
ignore_errors=False,
)
else:
self._handle = libnvml.nvmlQuery(
'nvmlDeviceGetHandleByPciBusId',
bus_id,
ignore_errors=False,
)
with _nvml_probe():
self._handle = libnvml.nvmlQuery(
'nvmlDeviceGetHandleByPciBusId',
bus_id,
ignore_errors=False,
)
except libnvml.NVMLError_GpuIsLost:
self._handle = None
self._nvml_index = NA # type: ignore[assignment]
@ -738,7 +844,14 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
self._handle = None
self._nvml_index = NA # type: ignore[assignment]
self._name = 'ERROR: Unknown'
except libnvml.NVMLError:
if libmxsmi.is_available():
_set_active_backend('mx-smi')
self._init_mxsmi(uuid=uuid, bus_id=bus_id)
else:
raise
else:
_set_active_backend('nvml')
self._nvml_index = libnvml.nvmlQuery('nvmlDeviceGetIndex', self._handle)
self._max_clock_infos: ClockInfos = ClockInfos(graphics=NA, sm=NA, memory=NA, video=NA)
@ -747,6 +860,36 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
self._ident: tuple[Hashable, str] = (self.index, self.uuid())
self._hash: int | None = None
def _init_mxsmi(
self,
*,
index: int | tuple[int, int] | bytes | None = None,
uuid: bytes | None = None,
bus_id: bytes | None = None,
) -> None:
"""Initialize this device from the MetaX ``mx-smi`` backend."""
if isinstance(index, tuple):
raise libnvml.NVMLError_NotSupported
try:
info = libmxsmi.get_device(index=index, uuid=uuid, bus_id=bus_id)
except libmxsmi.MxSmiDeviceNotFound as ex:
raise libnvml.NVMLError_NotFound from ex
_set_active_backend('mx-smi')
self._backend = 'mx-smi'
self._handle = None
self._nvml_index = info.index
self._name = info.name
self._uuid = info.uuid
self._bus_id = info.bus_id
self._memory_total = info.memory_total
def _is_mxsmi_device(self) -> bool:
return self._backend == 'mx-smi'
def _mxsmi_info(self) -> libmxsmi.DeviceInfo:
return libmxsmi.get_device(index=self.physical_index)
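# The mx-smi-backed properties below call _mxsmi_info() on every access;
# the snapshot cache in libmxsmi (0.25 s TTL) keeps this to at most one
# refresh, i.e. one `mx-smi -L` run plus one summary run, per TTL window.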
def __repr__(self) -> str:
"""Return a string representation of the device."""
return '{}(index={}, name={!r}, total_memory={})'.format( # noqa: UP032
@ -904,6 +1047,9 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=name
"""
if self._is_mxsmi_device():
self._name = self._mxsmi_info().name
return self._name
if self._handle is not None and self._name is NA:
self._name = libnvml.nvmlQuery('nvmlDeviceGetName', self._handle)
return self._name
@ -922,6 +1068,9 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=uuid
"""
if self._is_mxsmi_device():
self._uuid = self._mxsmi_info().uuid
return self._uuid
if self._handle is not None and self._uuid is NA:
self._uuid = libnvml.nvmlQuery('nvmlDeviceGetUUID', self._handle)
return self._uuid
@ -938,6 +1087,9 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=pci.bus_id
"""
if self._is_mxsmi_device():
self._bus_id = self._mxsmi_info().bus_id
return self._bus_id
if self._handle is not None and self._bus_id is NA:
self._bus_id = libnvml.nvmlQuery(
lambda handle: libnvml.nvmlDeviceGetPciInfo(handle).busId,
@ -959,6 +1111,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=serial
"""
if self._is_mxsmi_device():
return NA
if self._handle is not None:
return libnvml.nvmlQuery('nvmlDeviceGetSerial', self._handle)
return NA
@ -970,6 +1124,14 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
Returns: MemoryInfo(total, free, used, reserved)
A named tuple with memory information, the item could be :const:`nvitop.NA` when not applicable.
"""
if self._is_mxsmi_device():
info = self._mxsmi_info()
return MemoryInfo(
total=info.memory_total,
free=info.memory_free,
used=info.memory_used,
reserved=NA,
)
if self._handle is not None:
has_unified_memory = False
try:
@ -1179,6 +1341,15 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
Returns: UtilizationRates(gpu, memory, encoder, decoder)
A named tuple with GPU utilization rates (in percentage) for the device, the item could be :const:`nvitop.NA` when not applicable.
""" # pylint: disable=line-too-long
if self._is_mxsmi_device():
info = self._mxsmi_info()
return UtilizationRates(
gpu=info.gpu_utilization,
memory=info.memory_utilization,
encoder=NA,
decoder=NA,
)
gpu, memory, encoder, decoder = NA, NA, NA, NA
if self._handle is not None:
@ -1449,6 +1620,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=fan.speed
""" # pylint: disable=line-too-long
if self._is_mxsmi_device():
return self._mxsmi_info().fan_speed
if self._handle is not None:
return libnvml.nvmlQuery('nvmlDeviceGetFanSpeed', self._handle)
return NA
@ -1465,6 +1638,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=temperature.gpu
"""
if self._is_mxsmi_device():
return self._mxsmi_info().temperature
if self._handle is not None:
return libnvml.nvmlQuery(
'nvmlDeviceGetTemperature',
@ -1486,6 +1661,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
$(( "$(nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=power.draw)" * 1000 ))
"""
if self._is_mxsmi_device():
return self._mxsmi_info().power_usage
if self._handle is not None:
return libnvml.nvmlQuery('nvmlDeviceGetPowerUsage', self._handle)
return NA
@ -1507,6 +1684,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
$(( "$(nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=power.limit)" * 1000 ))
"""
if self._is_mxsmi_device():
return self._mxsmi_info().power_limit
if self._handle is not None:
return libnvml.nvmlQuery('nvmlDeviceGetPowerManagementLimit', self._handle)
return NA
@ -1547,6 +1726,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
Returns: Union[int, NaType]
The current PCIe transmit throughput in KiB/s, or :const:`nvitop.NA` when not applicable.
"""
if self._is_mxsmi_device():
return NA
if self._handle is not None:
return libnvml.nvmlQuery(
'nvmlDeviceGetPcieThroughput',
@ -1565,6 +1746,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
Returns: Union[int, NaType]
The current PCIe receive throughput in KiB/s, or :const:`nvitop.NA` when not applicable.
"""
if self._is_mxsmi_device():
return NA
if self._handle is not None:
return libnvml.nvmlQuery(
'nvmlDeviceGetPcieThroughput',
@ -2131,6 +2314,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=persistence_mode
""" # pylint: disable=line-too-long
if self._is_mxsmi_device():
return self._mxsmi_info().persistence_mode
if self._handle is not None:
return {
0: 'Disabled',
@ -2150,6 +2335,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=pstate
""" # pylint: disable=line-too-long
if self._is_mxsmi_device():
return self._mxsmi_info().performance_state
if self._handle is not None:
performance_state = libnvml.nvmlQuery('nvmlDeviceGetPerformanceState', self._handle)
if libnvml.nvmlCheckReturn(performance_state, int):
@ -2194,6 +2381,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=compute_mode
""" # pylint: disable=line-too-long
if self._is_mxsmi_device():
return 'Default'
if self._handle is not None:
return {
libnvml.NVML_COMPUTEMODE_DEFAULT: 'Default',
@ -2215,6 +2404,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=compute_cap
"""
if self._is_mxsmi_device():
return NA
if self._handle is not None:
if self._cuda_compute_capability is None:
self._cuda_compute_capability = libnvml.nvmlQuery(
@ -2226,6 +2417,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
def is_mig_device(self) -> bool:
"""Return whether or not the device is a MIG device."""
if self._is_mxsmi_device():
return False
if self._handle is not None:
if self._is_mig_device is None:
is_mig_device = libnvml.nvmlQuery(
@ -2253,6 +2446,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=mig.mode.current
"""
if self._is_mxsmi_device():
return NA
if self._handle is None:
return NA
if self.is_mig_device():
@ -2313,6 +2508,17 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
Returns: Dict[int, GpuProcess]
A dictionary mapping PID to GPU process instance.
"""
if self._is_mxsmi_device():
processes = {}
for process in libmxsmi.processes(self.physical_index):
processes[process.pid] = self.GPU_PROCESS_CLASS(
pid=process.pid,
device=self,
gpu_memory=process.used_memory,
type='C',
)
return processes
if self._handle is None:
return {}

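The probe-then-fallback shape above recurs in `backend()`, `driver_version()`, `cuda_driver_version()`, and `count()`. A condensed sketch of that shared control flow, using only names from this diff (`query` and `mxsmi_fallback` are hypothetical stand-ins for the per-method calls):

def _query_with_fallback(query, mxsmi_fallback):
    # Illustration only, not part of the diff.
    if _should_use_mxsmi_backend():  # forced via env var, or already active
        return mxsmi_fallback()
    try:
        with _nvml_probe():  # keep NVML quiet when mx-smi could take over
            result = query()
    except libnvml.NVMLError:
        if libmxsmi.is_available():  # NVML failed: switch to the MetaX backend
            _set_active_backend('mx-smi')
            return mxsmi_fallback()
        raise
    _set_active_backend('nvml')
    return result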
nvitop/api/libmxsmi.py Normal file
View file

@ -0,0 +1,467 @@
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
#
# Copyright 2021-2025 Xuehai Pan. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for querying MetaX GPUs through ``mx-smi``."""
from __future__ import annotations
import os
import re
import shutil
import subprocess
import threading
import time
from dataclasses import dataclass, replace
from nvitop.api.utils import MiB, NA, NaType
__all__ = [
'DeviceInfo',
'MxSmiError',
'MxSmiDeviceNotFound',
'MxSmiNotFound',
'MxSmiSnapshot',
'ProcessInfo',
'clear_cache',
'device_count',
'driver_version',
'get_device',
'is_available',
'is_forced',
'maca_version',
'processes',
'snapshot',
]
@dataclass(frozen=True)
class DeviceInfo:
"""MetaX GPU device information collected from ``mx-smi``."""
index: int
name: str | NaType = NA
uuid: str | NaType = NA
bus_id: str | NaType = NA
state: str | NaType = NA
persistence_mode: str | NaType = NA
performance_state: str | NaType = NA
memory_total: int | NaType = NA
memory_used: int | NaType = NA
memory_free: int | NaType = NA
gpu_utilization: int | NaType = NA
memory_utilization: int | NaType = NA
temperature: int | NaType = NA
power_usage: int | NaType = NA
power_limit: int | NaType = NA
fan_speed: int | NaType = NA
@dataclass(frozen=True)
class ProcessInfo:
"""MetaX GPU process information collected from ``mx-smi``."""
gpu_index: int
pid: int
name: str | NaType = NA
used_memory: int | NaType = NA
@dataclass(frozen=True)
class MxSmiSnapshot:
"""A single ``mx-smi`` sample."""
devices: dict[int, DeviceInfo]
processes: list[ProcessInfo]
driver_version: str | NaType = NA
maca_version: str | NaType = NA
mxsmi_version: str | NaType = NA
class MxSmiError(RuntimeError):
"""Base exception for ``mx-smi`` query errors."""
class MxSmiNotFound(MxSmiError):
"""Raised when the ``mx-smi`` executable is not available."""
class MxSmiDeviceNotFound(MxSmiError):
"""Raised when a MetaX GPU device cannot be found."""
_BACKEND_ENVVAR = 'NVITOP_GPU_BACKEND'
_CACHE_TTL = 0.25
_CACHE_LOCK = threading.RLock()
_CACHE: MxSmiSnapshot | None = None
_CACHE_EXPIRES_AT = 0.0
_LIST_RE = re.compile(
r'^GPU#(?P<index>\d+)\s+'
r'(?P<name>.+?)\s+'
r'(?P<bus_id>[0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.[0-9a-fA-F])\s+'
r'(?P<state>.*?)\s+'
r'\(UUID:\s*(?P<uuid>[^)]+)\)\s*$',
)
_MXSMI_VERSION_RE = re.compile(r'\bmx-smi\s+version:\s*(?P<version>\S+)', flags=re.IGNORECASE)
_DRIVER_VERSION_RE = re.compile(r'Kernel Mode Driver Version:\s*(?P<version>[^\s|]+)')
_MACA_VERSION_RE = re.compile(r'MACA Version:\s*(?P<version>[^\s|]+)')
_SUMMARY_FIRST_RE = re.compile(
r'^(?P<index>\d+)\s+(?P<name>.+?)\s+(?P<persistence>On|Off|Enable|Disable|Enabled|Disabled)\s*$',
)
_GPU_UTIL_RE = re.compile(r'(?P<util>\d+(?:\.\d+)?)\s*%')
_SUMMARY_SECOND_RE = re.compile(
r'^(?P<temperature>\d+(?:\.\d+)?)C\s+'
r'(?P<power_usage>\d+(?:\.\d+)?)W\s*/\s*'
r'(?P<power_limit>\d+(?:\.\d+)?)W\s+'
r'(?P<performance_state>\S+)',
)
_MEMORY_RE = re.compile(
r'(?P<used>\d+(?:\.\d+)?)\s*/\s*(?P<total>\d+(?:\.\d+)?)\s*MiB',
flags=re.IGNORECASE,
)
_PROCESS_RE = re.compile(
r'^\|\s*(?P<gpu_index>\d+)\s+'
r'(?P<pid>\d+)\s+'
r'(?P<name>.*?)\s+'
r'(?P<used_memory>\d+(?:\.\d+)?|N/A)\s*\|\s*$',
)
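# The patterns above target lines shaped like the following (fabricated
# samples that merely satisfy the regexes; real `mx-smi` output may differ):
#   `mx-smi -L` listing:
#     GPU#0  <name>  0000:3b:00.0  Active  (UUID: <uuid>)
#   summary table, two consecutive rows per device, three `|` columns each:
#     | 0  <name>  On            | 0000:3b:00.0      | 42%    |
#     | 31C  55.0W / 350.0W  P0  | 1024 / 65536 MiB  | Active |
#   process table:
#     | 0  12345  <name>  1024 |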
def is_forced() -> bool:
"""Return whether the MetaX backend was explicitly requested."""
backend = os.getenv(_BACKEND_ENVVAR, default='').strip().lower().replace('_', '-')
return backend in {'mx-smi', 'mxsmi', 'metax'}
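# e.g. `NVITOP_GPU_BACKEND=metax nvitop` forces the MetaX backend and skips
# the NVML probe entirely; 'mx-smi', 'mxsmi', and 'MX_SMI' are all
# equivalent after the lower-casing and underscore normalization above.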
def is_available() -> bool:
"""Return whether ``mx-smi`` can see at least one MetaX GPU."""
if shutil.which('mx-smi') is None:
return False
try:
return device_count() > 0
except MxSmiError:
return False
def device_count() -> int:
"""Return the number of MetaX GPUs visible to ``mx-smi``."""
return len(snapshot().devices)
def driver_version() -> str | NaType:
"""Return the MetaX kernel mode driver version."""
return snapshot().driver_version
def maca_version() -> str | NaType:
"""Return the MACA runtime version reported by ``mx-smi``."""
return snapshot().maca_version
def get_device(
*,
index: int | bytes | None = None,
uuid: str | bytes | None = None,
bus_id: str | bytes | None = None,
) -> DeviceInfo:
"""Return a MetaX device by index, UUID, or PCI bus ID."""
if sum(arg is not None for arg in (index, uuid, bus_id)) != 1:
raise TypeError('get_device() expects exactly one identifier.')
devices = snapshot().devices
if index is not None:
try:
return devices[int(index)]
except (KeyError, TypeError, ValueError) as ex:
raise MxSmiDeviceNotFound(f'MetaX GPU index {index!r} was not found.') from ex
identifier = _normalize_identifier(uuid if uuid is not None else bus_id)
for device in devices.values():
if identifier in {_normalize_identifier(device.uuid), _normalize_identifier(device.bus_id)}:
return device
raise MxSmiDeviceNotFound(f'MetaX GPU {identifier!r} was not found.')
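# Example lookups (illustrative; placeholder identifiers):
#     get_device(index=0)
#     get_device(uuid='gpu-...')           # matching is case-insensitive
#     get_device(bus_id=b'0000:3b:00.0')   # bytes identifiers are decoded first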
def processes(index: int) -> list[ProcessInfo]:
"""Return processes reported by ``mx-smi`` for the given GPU index."""
return [process for process in snapshot().processes if process.gpu_index == index]
def snapshot(*, ttl: float = _CACHE_TTL) -> MxSmiSnapshot:
"""Take or return a cached ``mx-smi`` snapshot."""
global _CACHE, _CACHE_EXPIRES_AT # pylint: disable=global-statement
now = time.monotonic()
with _CACHE_LOCK:
if _CACHE is not None and now < _CACHE_EXPIRES_AT:
return _CACHE
current = _take_snapshot()
with _CACHE_LOCK:
_CACHE = current
_CACHE_EXPIRES_AT = time.monotonic() + ttl
return _CACHE
def clear_cache() -> None:
"""Clear the cached ``mx-smi`` snapshot."""
global _CACHE, _CACHE_EXPIRES_AT # pylint: disable=global-statement
with _CACHE_LOCK:
_CACHE = None
_CACHE_EXPIRES_AT = 0.0
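# Illustrative cache behaviour (not part of the diff):
#     snapshot() is snapshot()   # True within the 0.25 s TTL window
#     clear_cache()              # next snapshot() runs `mx-smi` afresh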
def _take_snapshot() -> MxSmiSnapshot:
listed_devices, listed_mxsmi_version = _parse_list_output(_run_mxsmi('-L'))
summary = _parse_summary_output(_run_mxsmi())
devices = listed_devices.copy()
for index, device in summary.devices.items():
base = devices.get(index, DeviceInfo(index=index))
devices[index] = replace(
base,
name=device.name if device.name is not NA else base.name,
bus_id=device.bus_id if device.bus_id is not NA else base.bus_id,
state=device.state if device.state is not NA else base.state,
persistence_mode=(
device.persistence_mode
if device.persistence_mode is not NA
else base.persistence_mode
),
performance_state=(
device.performance_state
if device.performance_state is not NA
else base.performance_state
),
memory_total=device.memory_total,
memory_used=device.memory_used,
memory_free=device.memory_free,
gpu_utilization=device.gpu_utilization,
memory_utilization=device.memory_utilization,
temperature=device.temperature,
power_usage=device.power_usage,
power_limit=device.power_limit,
fan_speed=device.fan_speed,
)
return MxSmiSnapshot(
devices=devices,
processes=summary.processes,
driver_version=summary.driver_version,
maca_version=summary.maca_version,
mxsmi_version=summary.mxsmi_version if summary.mxsmi_version is not NA else listed_mxsmi_version,
)
def _run_mxsmi(*args: str) -> str:
executable = shutil.which('mx-smi')
if executable is None:
raise MxSmiNotFound('The `mx-smi` executable was not found.')
command = [executable, *args]
try:
completed = subprocess.run( # noqa: S603
command,
check=False,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
encoding='utf-8',
errors='replace',
timeout=10.0,
)
except (OSError, subprocess.SubprocessError) as ex:
raise MxSmiError(f'Failed to run `{_command_to_string(command)}`.') from ex
if completed.returncode != 0:
output = completed.stdout.strip()
message = f'`{_command_to_string(command)}` exited with status {completed.returncode}.'
if output:
message = f'{message}\n{output}'
raise MxSmiError(message)
return completed.stdout
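# Note that stderr is merged into stdout and a 10 s timeout is enforced, so a
# hung or failing `mx-smi` surfaces as MxSmiError instead of blocking callers
# indefinitely.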
def _parse_list_output(output: str) -> tuple[dict[int, DeviceInfo], str | NaType]:
devices: dict[int, DeviceInfo] = {}
mxsmi_version: str | NaType = NA
for line in output.splitlines():
version_match = _MXSMI_VERSION_RE.search(line)
if version_match is not None:
mxsmi_version = version_match.group('version')
continue
match = _LIST_RE.match(line.strip())
if match is None:
continue
index = int(match.group('index'))
devices[index] = DeviceInfo(
index=index,
name=match.group('name').strip(),
uuid=match.group('uuid').strip(),
bus_id=match.group('bus_id').strip(),
state=match.group('state').strip() or NA,
)
return devices, mxsmi_version
def _parse_summary_output(output: str) -> MxSmiSnapshot:
devices: dict[int, DeviceInfo] = {}
processes: list[ProcessInfo] = []
driver_version: str | NaType = NA
maca_version: str | NaType = NA
mxsmi_version: str | NaType = NA
lines = output.splitlines()
for lineno, line in enumerate(lines):
version_match = _MXSMI_VERSION_RE.search(line)
if version_match is not None:
mxsmi_version = version_match.group('version')
driver_match = _DRIVER_VERSION_RE.search(line)
if driver_match is not None:
driver_version = driver_match.group('version')
maca_match = _MACA_VERSION_RE.search(line)
if maca_match is not None:
maca_version = maca_match.group('version')
parts = _split_table_line(line)
if len(parts) != 3:
continue
first_match = _SUMMARY_FIRST_RE.match(parts[0])
if first_match is None:
continue
try:
next_parts = _split_table_line(lines[lineno + 1])
except IndexError:
continue
if len(next_parts) != 3:
continue
second_match = _SUMMARY_SECOND_RE.match(next_parts[0])
memory_match = _MEMORY_RE.search(next_parts[1])
if second_match is None or memory_match is None:
continue
index = int(first_match.group('index'))
memory_used = _mib_to_bytes(memory_match.group('used'))
memory_total = _mib_to_bytes(memory_match.group('total'))
memory_free = (
memory_total - memory_used
if isinstance(memory_total, int) and isinstance(memory_used, int)
else NA
)
devices[index] = DeviceInfo(
index=index,
name=first_match.group('name').strip(),
bus_id=parts[1].strip(),
state=next_parts[2].strip() or NA,
persistence_mode=_normalize_mode(first_match.group('persistence')),
performance_state=second_match.group('performance_state'),
memory_total=memory_total,
memory_used=memory_used,
memory_free=memory_free,
gpu_utilization=_percent_to_int(parts[2]),
temperature=round(float(second_match.group('temperature'))),
power_usage=_watts_to_milliwatts(second_match.group('power_usage')),
power_limit=_watts_to_milliwatts(second_match.group('power_limit')),
)
in_process_table = False
for line in lines:
if '| Process:' in line:
in_process_table = True
continue
if not in_process_table:
continue
if 'no process found' in line.lower():
continue
process_match = _PROCESS_RE.match(line)
if process_match is None:
continue
processes.append(
ProcessInfo(
gpu_index=int(process_match.group('gpu_index')),
pid=int(process_match.group('pid')),
name=process_match.group('name').strip() or NA,
used_memory=_mib_to_bytes(process_match.group('used_memory')),
),
)
return MxSmiSnapshot(
devices=devices,
processes=processes,
driver_version=driver_version,
maca_version=maca_version,
mxsmi_version=mxsmi_version,
)
def _split_table_line(line: str) -> list[str]:
if not line.startswith('|'):
return []
return [part.strip() for part in line.strip().strip('|').split('|')]
def _mib_to_bytes(value: str) -> int | NaType:
if value.upper() == 'N/A':
return NA
return round(float(value) * MiB)
def _watts_to_milliwatts(value: str) -> int:
return round(float(value) * 1000)
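# e.g. _mib_to_bytes('1024') == 1024 * MiB == 1073741824
#      _mib_to_bytes('N/A') is NA
#      _watts_to_milliwatts('55.0') == 55000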
def _percent_to_int(value: str) -> int | NaType:
match = _GPU_UTIL_RE.search(value)
if match is None:
return NA
return round(float(match.group('util')))
def _normalize_mode(value: str) -> str | NaType:
normalized = value.strip().lower()
if normalized in {'on', 'enable', 'enabled'}:
return 'Enabled'
if normalized in {'off', 'disable', 'disabled'}:
return 'Disabled'
return NA
def _normalize_identifier(value: str | bytes | NaType | None) -> str:
if isinstance(value, bytes):
value = value.decode('utf-8', errors='replace')
if value is None or value is NA:
return ''
return str(value).strip().lower()
def _command_to_string(command: list[str]) -> str:
return ' '.join(command)
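A quick smoke test of the new module, assuming `mx-smi` is on the `PATH` (a sketch, not part of the diff):

from nvitop.api import libmxsmi

if libmxsmi.is_available():  # runs `mx-smi` under the hood, briefly cached
    print('KMD driver:', libmxsmi.driver_version())
    print('MACA:', libmxsmi.maca_version())
    for info in libmxsmi.snapshot().devices.values():
        print(info.index, info.name, info.gpu_utilization, info.memory_used)
else:
    print('no MetaX GPU visible')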

View file

@ -79,8 +79,15 @@ class DevicePanel(BasePanel): # pylint: disable=too-many-instance-attributes
if self.device_count == 0:
self.height = self.full_height = self.compact_height = 6
self.backend: str = Device.backend()
self.driver_version: str = Device.driver_version()
self.cuda_driver_version: str = Device.cuda_driver_version()
self.driver_version_label: str = (
'KMD Version' if self.backend == 'mx-smi' else 'Driver Version'
)
self.cuda_driver_version_label: str = (
'MACA Version' if self.backend == 'mx-smi' else 'CUDA Driver Version'
)
self._snapshot_buffer: list[Snapshot] = []
self._snapshots: list[Snapshot] = []
@ -226,8 +233,8 @@ class DevicePanel(BasePanel): # pylint: disable=too-many-instance-attributes
version_infos = [
'NVITOP {}'.format(__version__.partition('+')[0]),
f'Driver Version: {self.driver_version}',
f'CUDA Driver Version: {self.cuda_driver_version}',
f'{self.driver_version_label}: {self.driver_version}',
f'{self.cuda_driver_version_label}: {self.cuda_driver_version}',
]
if sum(len(v) for v in version_infos) % 2 == 0:
version_infos[0] += ' '
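# With the MetaX backend active the header labels swap accordingly, e.g.:
#     backend == 'nvml'   -> 'Driver Version: ...', 'CUDA Driver Version: ...'
#     backend == 'mx-smi' -> 'KMD Version: ...',    'MACA Version: ...'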