mirror of
https://github.com/XuehaiPan/nvitop.git
synced 2026-05-15 14:15:55 -06:00
Add mx-smi MetaX GPU backend
This commit is contained in:
parent
a6761eb5c4
commit
a306d69a36
5 changed files with 707 additions and 23 deletions
|
|
@ -27,6 +27,7 @@ from nvitop.api import (
|
|||
host,
|
||||
libcuda,
|
||||
libcudart,
|
||||
libmxsmi,
|
||||
libnvml,
|
||||
process,
|
||||
termcolor,
|
||||
|
|
@ -46,6 +47,7 @@ for submodule in (
|
|||
host,
|
||||
libcuda,
|
||||
libcudart,
|
||||
libmxsmi,
|
||||
libnvml,
|
||||
process,
|
||||
termcolor,
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ from nvitop.api import (
|
|||
host,
|
||||
libcuda,
|
||||
libcudart,
|
||||
libmxsmi,
|
||||
libnvml,
|
||||
process,
|
||||
termcolor,
|
||||
|
|
@ -69,6 +70,7 @@ __all__ = [ # noqa: RUF022
|
|||
'NVMLError',
|
||||
'nvmlCheckReturn',
|
||||
'libnvml',
|
||||
'libmxsmi',
|
||||
'libcuda',
|
||||
'libcudart',
|
||||
# nvitop.api.device
|
||||
|
|
|
|||
|
|
@ -117,7 +117,7 @@ import time
|
|||
from collections import OrderedDict
|
||||
from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple, overload
|
||||
|
||||
from nvitop.api import host, libcuda, libcudart, libnvml
|
||||
from nvitop.api import host, libcuda, libcudart, libmxsmi, libnvml
|
||||
from nvitop.api.process import GpuProcess
|
||||
from nvitop.api.utils import (
|
||||
NA,
|
||||
|
|
@ -240,6 +240,38 @@ _VALUE_OMITTED: str = ValueOmitted() # type: ignore[assignment]
|
|||
del ValueOmitted
|
||||
|
||||
|
||||
# Process-wide record of which GPU query backend ('nvml' or 'mx-smi') is in
# use; None until the first successful probe decides.  Guarded by an RLock
# because backend detection may be triggered from multiple threads.
_ACTIVE_BACKEND: str | None = None
_ACTIVE_BACKEND_LOCK: threading.RLock = threading.RLock()
|
||||
|
||||
|
||||
def _set_active_backend(backend: str) -> None:
    """Record *backend* as the process-wide active GPU query backend."""
    global _ACTIVE_BACKEND  # pylint: disable=global-statement

    _ACTIVE_BACKEND_LOCK.acquire()
    try:
        _ACTIVE_BACKEND = backend
    finally:
        _ACTIVE_BACKEND_LOCK.release()
|
||||
|
||||
|
||||
def _get_active_backend() -> str | None:
    """Return the recorded GPU query backend, or ``None`` if still undecided."""
    _ACTIVE_BACKEND_LOCK.acquire()
    try:
        return _ACTIVE_BACKEND
    finally:
        _ACTIVE_BACKEND_LOCK.release()
|
||||
|
||||
|
||||
def _should_use_mxsmi_backend() -> bool:
    """Whether queries should be routed to the MetaX ``mx-smi`` backend."""
    if libmxsmi.is_forced():
        return True
    return _get_active_backend() == 'mx-smi'
|
||||
|
||||
|
||||
@contextlib.contextmanager
def _nvml_probe() -> Generator[None]:
    """Context manager for speculative NVML calls during backend detection.

    On machines where the MetaX ``mx-smi`` tool is available, NVML failures
    are expected and are recovered by falling back to the mx-smi backend, so
    the NVML logger is silenced for the duration of the probe to avoid
    spurious warnings.  The previous logger state is always restored.
    """
    # Only suppress logs when a fallback backend actually exists; otherwise
    # the NVML error messages remain useful to the user.
    suppress_logs = libmxsmi.is_available()
    logger_disabled = libnvml.LOGGER.disabled
    if suppress_logs:
        libnvml.LOGGER.disabled = True
    try:
        yield
    finally:
        libnvml.LOGGER.disabled = logger_disabled
|
||||
|
||||
|
||||
class Device: # pylint: disable=too-many-instance-attributes,too-many-public-methods
|
||||
"""Live class of the GPU devices, different from the device snapshots.
|
||||
|
||||
|
|
@ -333,9 +365,33 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
"""Test whether there are any devices and the NVML library is successfully loaded."""
|
||||
try:
|
||||
return cls.count() > 0
|
||||
except libnvml.NVMLError:
|
||||
except (libnvml.NVMLError, libmxsmi.MxSmiError):
|
||||
return False
|
||||
|
||||
    @staticmethod
    def backend() -> str:
        """Return the active GPU query backend (``'nvml'`` or ``'mx-smi'``).

        Resolution order:
          1. ``'mx-smi'`` when the MetaX backend is forced via the environment;
          2. the previously cached decision, if any;
          3. probe NVML — at least one device selects and caches ``'nvml'``;
          4. on NVML failure or zero devices, select and cache ``'mx-smi'``
             when available (re-raising the NVML error otherwise);
          5. fall back to ``'nvml'`` without caching a decision.
        """
        active_backend = _get_active_backend()
        if libmxsmi.is_forced():
            return 'mx-smi'
        if active_backend is not None:
            return active_backend
        try:
            # Probe NVML with its logger silenced (failures may be expected).
            with _nvml_probe():
                device_count = libnvml.nvmlQuery('nvmlDeviceGetCount', default=0)
            if device_count > 0:
                _set_active_backend('nvml')
                return 'nvml'
        except libnvml.NVMLError:
            if libmxsmi.is_available():
                _set_active_backend('mx-smi')
                return 'mx-smi'
            raise
        if libmxsmi.is_available():
            _set_active_backend('mx-smi')
            return 'mx-smi'
        # No NVML devices and no mx-smi: report the default backend but leave
        # the cached decision unset so a later probe may still succeed.
        return 'nvml'
|
||||
|
||||
@staticmethod
|
||||
def driver_version() -> str | NaType:
|
||||
"""The version of the installed NVIDIA display driver. This is an alphanumeric string.
|
||||
|
|
@ -355,7 +411,18 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
|
||||
driver without reloading the kernel module.
|
||||
"""
|
||||
return libnvml.nvmlQuery('nvmlSystemGetDriverVersion')
|
||||
if _should_use_mxsmi_backend():
|
||||
return libmxsmi.driver_version()
|
||||
try:
|
||||
with _nvml_probe():
|
||||
driver_version = libnvml.nvmlQuery('nvmlSystemGetDriverVersion')
|
||||
except libnvml.NVMLError:
|
||||
if libmxsmi.is_available():
|
||||
_set_active_backend('mx-smi')
|
||||
return libmxsmi.driver_version()
|
||||
raise
|
||||
_set_active_backend('nvml')
|
||||
return driver_version
|
||||
|
||||
@staticmethod
|
||||
def cuda_driver_version() -> str | NaType:
|
||||
|
|
@ -375,7 +442,17 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
|
||||
driver without reloading the kernel module.
|
||||
"""
|
||||
if _should_use_mxsmi_backend():
|
||||
return libmxsmi.maca_version()
|
||||
try:
|
||||
with _nvml_probe():
|
||||
cuda_driver_version = libnvml.nvmlQuery('nvmlSystemGetCudaDriverVersion')
|
||||
except libnvml.NVMLError:
|
||||
if libmxsmi.is_available():
|
||||
_set_active_backend('mx-smi')
|
||||
return libmxsmi.maca_version()
|
||||
raise
|
||||
_set_active_backend('nvml')
|
||||
if libnvml.nvmlCheckReturn(cuda_driver_version, int):
|
||||
major = cuda_driver_version // 1000
|
||||
minor = (cuda_driver_version % 1000) // 10
|
||||
|
|
@ -423,7 +500,22 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
|
||||
driver without reloading the kernel module.
|
||||
"""
|
||||
return libnvml.nvmlQuery('nvmlDeviceGetCount', default=0)
|
||||
if _should_use_mxsmi_backend():
|
||||
return libmxsmi.device_count()
|
||||
try:
|
||||
with _nvml_probe():
|
||||
count = libnvml.nvmlQuery('nvmlDeviceGetCount', default=0)
|
||||
except libnvml.NVMLError:
|
||||
if libmxsmi.is_available():
|
||||
_set_active_backend('mx-smi')
|
||||
return libmxsmi.device_count()
|
||||
raise
|
||||
if count == 0 and libmxsmi.is_available():
|
||||
_set_active_backend('mx-smi')
|
||||
return libmxsmi.device_count()
|
||||
if count > 0:
|
||||
_set_active_backend('nvml')
|
||||
return count
|
||||
|
||||
@classmethod
|
||||
def all(cls) -> list[PhysicalDevice]:
|
||||
|
|
@ -700,11 +792,15 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
self._is_mig_device: bool | None = None
|
||||
self._cuda_index: int | None = None
|
||||
self._cuda_compute_capability: tuple[int, int] | NaType | None = None
|
||||
self._backend: str = 'nvml'
|
||||
|
||||
self._handle: libnvml.c_nvmlDevice_t | None
|
||||
if index is not None:
|
||||
if _should_use_mxsmi_backend():
|
||||
self._init_mxsmi(index=index, uuid=uuid, bus_id=bus_id)
|
||||
elif index is not None:
|
||||
self._nvml_index = index # type: ignore[assignment]
|
||||
try:
|
||||
with _nvml_probe():
|
||||
self._handle = libnvml.nvmlQuery(
|
||||
'nvmlDeviceGetHandleByIndex',
|
||||
index,
|
||||
|
|
@ -716,15 +812,25 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
except libnvml.NVMLError_Unknown:
|
||||
self._handle = None
|
||||
self._name = 'ERROR: Unknown'
|
||||
except libnvml.NVMLError:
|
||||
if libmxsmi.is_available():
|
||||
_set_active_backend('mx-smi')
|
||||
self._init_mxsmi(index=index)
|
||||
else:
|
||||
raise
|
||||
else:
|
||||
_set_active_backend('nvml')
|
||||
else:
|
||||
try:
|
||||
if uuid is not None:
|
||||
with _nvml_probe():
|
||||
self._handle = libnvml.nvmlQuery(
|
||||
'nvmlDeviceGetHandleByUUID',
|
||||
uuid,
|
||||
ignore_errors=False,
|
||||
)
|
||||
else:
|
||||
with _nvml_probe():
|
||||
self._handle = libnvml.nvmlQuery(
|
||||
'nvmlDeviceGetHandleByPciBusId',
|
||||
bus_id,
|
||||
|
|
@ -738,7 +844,14 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
self._handle = None
|
||||
self._nvml_index = NA # type: ignore[assignment]
|
||||
self._name = 'ERROR: Unknown'
|
||||
except libnvml.NVMLError:
|
||||
if libmxsmi.is_available():
|
||||
_set_active_backend('mx-smi')
|
||||
self._init_mxsmi(uuid=uuid, bus_id=bus_id)
|
||||
else:
|
||||
raise
|
||||
else:
|
||||
_set_active_backend('nvml')
|
||||
self._nvml_index = libnvml.nvmlQuery('nvmlDeviceGetIndex', self._handle)
|
||||
|
||||
self._max_clock_infos: ClockInfos = ClockInfos(graphics=NA, sm=NA, memory=NA, video=NA)
|
||||
|
|
@ -747,6 +860,36 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
self._ident: tuple[Hashable, str] = (self.index, self.uuid())
|
||||
self._hash: int | None = None
|
||||
|
||||
    def _init_mxsmi(
        self,
        *,
        index: int | tuple[int, int] | bytes | None = None,
        uuid: bytes | None = None,
        bus_id: bytes | None = None,
    ) -> None:
        """Initialize this device from the MetaX ``mx-smi`` backend.

        Raises:
            libnvml.NVMLError_NotSupported: If a ``(gpu, mig)`` tuple index is
                given — MIG-style sub-devices have no MetaX equivalent.
            libnvml.NVMLError_NotFound: If the identifier does not match any
                device reported by ``mx-smi``.
        """
        if isinstance(index, tuple):
            raise libnvml.NVMLError_NotSupported
        try:
            info = libmxsmi.get_device(index=index, uuid=uuid, bus_id=bus_id)
        except libmxsmi.MxSmiDeviceNotFound as ex:
            # Translate to the NVML exception type that callers already handle.
            raise libnvml.NVMLError_NotFound from ex

        _set_active_backend('mx-smi')
        self._backend = 'mx-smi'
        self._handle = None  # no NVML handle exists for MetaX devices
        self._nvml_index = info.index
        self._name = info.name
        self._uuid = info.uuid
        self._bus_id = info.bus_id
        self._memory_total = info.memory_total
|
||||
|
||||
def _is_mxsmi_device(self) -> bool:
|
||||
return self._backend == 'mx-smi'
|
||||
|
||||
def _mxsmi_info(self) -> libmxsmi.DeviceInfo:
|
||||
return libmxsmi.get_device(index=self.physical_index)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
"""Return a string representation of the device."""
|
||||
return '{}(index={}, name={!r}, total_memory={})'.format( # noqa: UP032
|
||||
|
|
@ -904,6 +1047,9 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
|
||||
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=name
|
||||
"""
|
||||
if self._is_mxsmi_device():
|
||||
self._name = self._mxsmi_info().name
|
||||
return self._name
|
||||
if self._handle is not None and self._name is NA:
|
||||
self._name = libnvml.nvmlQuery('nvmlDeviceGetName', self._handle)
|
||||
return self._name
|
||||
|
|
@ -922,6 +1068,9 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
|
||||
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=name
|
||||
"""
|
||||
if self._is_mxsmi_device():
|
||||
self._uuid = self._mxsmi_info().uuid
|
||||
return self._uuid
|
||||
if self._handle is not None and self._uuid is NA:
|
||||
self._uuid = libnvml.nvmlQuery('nvmlDeviceGetUUID', self._handle)
|
||||
return self._uuid
|
||||
|
|
@ -938,6 +1087,9 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
|
||||
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=pci.bus_id
|
||||
"""
|
||||
if self._is_mxsmi_device():
|
||||
self._bus_id = self._mxsmi_info().bus_id
|
||||
return self._bus_id
|
||||
if self._handle is not None and self._bus_id is NA:
|
||||
self._bus_id = libnvml.nvmlQuery(
|
||||
lambda handle: libnvml.nvmlDeviceGetPciInfo(handle).busId,
|
||||
|
|
@ -959,6 +1111,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
|
||||
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=serial
|
||||
"""
|
||||
if self._is_mxsmi_device():
|
||||
return NA
|
||||
if self._handle is not None:
|
||||
return libnvml.nvmlQuery('nvmlDeviceGetSerial', self._handle)
|
||||
return NA
|
||||
|
|
@ -970,6 +1124,14 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
Returns: MemoryInfo(total, free, used, reserved)
|
||||
A named tuple with memory information, the item could be :const:`nvitop.NA` when not applicable.
|
||||
"""
|
||||
if self._is_mxsmi_device():
|
||||
info = self._mxsmi_info()
|
||||
return MemoryInfo(
|
||||
total=info.memory_total,
|
||||
free=info.memory_free,
|
||||
used=info.memory_used,
|
||||
reserved=NA,
|
||||
)
|
||||
if self._handle is not None:
|
||||
has_unified_memory = False
|
||||
try:
|
||||
|
|
@ -1179,6 +1341,15 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
Returns: UtilizationRates(gpu, memory, encoder, decoder)
|
||||
A named tuple with GPU utilization rates (in percentage) for the device, the item could be :const:`nvitop.NA` when not applicable.
|
||||
""" # pylint: disable=line-too-long
|
||||
if self._is_mxsmi_device():
|
||||
info = self._mxsmi_info()
|
||||
return UtilizationRates(
|
||||
gpu=info.gpu_utilization,
|
||||
memory=info.memory_utilization,
|
||||
encoder=NA,
|
||||
decoder=NA,
|
||||
)
|
||||
|
||||
gpu, memory, encoder, decoder = NA, NA, NA, NA
|
||||
|
||||
if self._handle is not None:
|
||||
|
|
@ -1449,6 +1620,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
|
||||
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=fan.speed
|
||||
""" # pylint: disable=line-too-long
|
||||
if self._is_mxsmi_device():
|
||||
return self._mxsmi_info().fan_speed
|
||||
if self._handle is not None:
|
||||
return libnvml.nvmlQuery('nvmlDeviceGetFanSpeed', self._handle)
|
||||
return NA
|
||||
|
|
@ -1465,6 +1638,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
|
||||
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=temperature.gpu
|
||||
"""
|
||||
if self._is_mxsmi_device():
|
||||
return self._mxsmi_info().temperature
|
||||
if self._handle is not None:
|
||||
return libnvml.nvmlQuery(
|
||||
'nvmlDeviceGetTemperature',
|
||||
|
|
@ -1486,6 +1661,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
|
||||
$(( "$(nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=power.draw)" * 1000 ))
|
||||
"""
|
||||
if self._is_mxsmi_device():
|
||||
return self._mxsmi_info().power_usage
|
||||
if self._handle is not None:
|
||||
return libnvml.nvmlQuery('nvmlDeviceGetPowerUsage', self._handle)
|
||||
return NA
|
||||
|
|
@ -1507,6 +1684,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
|
||||
$(( "$(nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=power.limit)" * 1000 ))
|
||||
"""
|
||||
if self._is_mxsmi_device():
|
||||
return self._mxsmi_info().power_limit
|
||||
if self._handle is not None:
|
||||
return libnvml.nvmlQuery('nvmlDeviceGetPowerManagementLimit', self._handle)
|
||||
return NA
|
||||
|
|
@ -1547,6 +1726,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
Returns: Union[int, NaType]
|
||||
The current PCIe transmit throughput in KiB/s, or :const:`nvitop.NA` when not applicable.
|
||||
"""
|
||||
if self._is_mxsmi_device():
|
||||
return NA
|
||||
if self._handle is not None:
|
||||
return libnvml.nvmlQuery(
|
||||
'nvmlDeviceGetPcieThroughput',
|
||||
|
|
@ -1565,6 +1746,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
Returns: Union[int, NaType]
|
||||
The current PCIe receive throughput in KiB/s, or :const:`nvitop.NA` when not applicable.
|
||||
"""
|
||||
if self._is_mxsmi_device():
|
||||
return NA
|
||||
if self._handle is not None:
|
||||
return libnvml.nvmlQuery(
|
||||
'nvmlDeviceGetPcieThroughput',
|
||||
|
|
@ -2131,6 +2314,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
|
||||
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=persistence_mode
|
||||
""" # pylint: disable=line-too-long
|
||||
if self._is_mxsmi_device():
|
||||
return self._mxsmi_info().persistence_mode
|
||||
if self._handle is not None:
|
||||
return {
|
||||
0: 'Disabled',
|
||||
|
|
@ -2150,6 +2335,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
|
||||
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=pstate
|
||||
""" # pylint: disable=line-too-long
|
||||
if self._is_mxsmi_device():
|
||||
return self._mxsmi_info().performance_state
|
||||
if self._handle is not None:
|
||||
performance_state = libnvml.nvmlQuery('nvmlDeviceGetPerformanceState', self._handle)
|
||||
if libnvml.nvmlCheckReturn(performance_state, int):
|
||||
|
|
@ -2194,6 +2381,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
|
||||
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=compute_mode
|
||||
""" # pylint: disable=line-too-long
|
||||
if self._is_mxsmi_device():
|
||||
return 'Default'
|
||||
if self._handle is not None:
|
||||
return {
|
||||
libnvml.NVML_COMPUTEMODE_DEFAULT: 'Default',
|
||||
|
|
@ -2215,6 +2404,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
|
||||
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=compute_cap
|
||||
"""
|
||||
if self._is_mxsmi_device():
|
||||
return NA
|
||||
if self._handle is not None:
|
||||
if self._cuda_compute_capability is None:
|
||||
self._cuda_compute_capability = libnvml.nvmlQuery(
|
||||
|
|
@ -2226,6 +2417,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
|
||||
def is_mig_device(self) -> bool:
|
||||
"""Return whether or not the device is a MIG device."""
|
||||
if self._is_mxsmi_device():
|
||||
return False
|
||||
if self._handle is not None:
|
||||
if self._is_mig_device is None:
|
||||
is_mig_device = libnvml.nvmlQuery(
|
||||
|
|
@ -2253,6 +2446,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
|
||||
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=mig.mode.current
|
||||
"""
|
||||
if self._is_mxsmi_device():
|
||||
return NA
|
||||
if self._handle is None:
|
||||
return NA
|
||||
if self.is_mig_device():
|
||||
|
|
@ -2313,6 +2508,17 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
Returns: Dict[int, GpuProcess]
|
||||
A dictionary mapping PID to GPU process instance.
|
||||
"""
|
||||
if self._is_mxsmi_device():
|
||||
processes = {}
|
||||
for process in libmxsmi.processes(self.physical_index):
|
||||
processes[process.pid] = self.GPU_PROCESS_CLASS(
|
||||
pid=process.pid,
|
||||
device=self,
|
||||
gpu_memory=process.used_memory,
|
||||
type='C',
|
||||
)
|
||||
return processes
|
||||
|
||||
if self._handle is None:
|
||||
return {}
|
||||
|
||||
|
|
|
|||
467
nvitop/api/libmxsmi.py
Normal file
467
nvitop/api/libmxsmi.py
Normal file
|
|
@ -0,0 +1,467 @@
|
|||
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
|
||||
#
|
||||
# Copyright 2021-2025 Xuehai Pan. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Utilities for querying MetaX GPUs through ``mx-smi``."""
|
||||
|
||||
from __future__ import annotations

import os
import re
import shlex
import shutil
import subprocess
import threading
import time
from dataclasses import dataclass, replace

from nvitop.api.utils import MiB, NA, NaType
|
||||
|
||||
|
||||
# Public API of this module, kept in strict alphabetical order (the original
# list was sorted except for a swapped `MxSmiError` / `MxSmiDeviceNotFound`).
__all__ = [
    'DeviceInfo',
    'MxSmiDeviceNotFound',
    'MxSmiError',
    'MxSmiNotFound',
    'MxSmiSnapshot',
    'ProcessInfo',
    'clear_cache',
    'device_count',
    'driver_version',
    'get_device',
    'is_available',
    'is_forced',
    'maca_version',
    'processes',
    'snapshot',
]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class DeviceInfo:
    """MetaX GPU device information collected from ``mx-smi``.

    All fields other than :attr:`index` default to :const:`NA` because the two
    ``mx-smi`` outputs (the ``-L`` listing and the summary table) each populate
    only a subset of them; ``_take_snapshot()`` merges the two sources.
    """

    index: int
    name: str | NaType = NA
    uuid: str | NaType = NA  # only provided by the `-L` listing
    bus_id: str | NaType = NA  # PCI bus ID, e.g. '0000:3f:00.0'
    state: str | NaType = NA
    persistence_mode: str | NaType = NA  # normalized to 'Enabled'/'Disabled'
    performance_state: str | NaType = NA
    memory_total: int | NaType = NA  # bytes (converted from MiB)
    memory_used: int | NaType = NA  # bytes
    memory_free: int | NaType = NA  # bytes; derived as total - used
    gpu_utilization: int | NaType = NA  # percent
    memory_utilization: int | NaType = NA  # not populated by the current parsers
    temperature: int | NaType = NA  # degrees Celsius
    power_usage: int | NaType = NA  # milliwatts
    power_limit: int | NaType = NA  # milliwatts
    fan_speed: int | NaType = NA  # not populated by the current parsers
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ProcessInfo:
    """MetaX GPU process information collected from ``mx-smi``."""

    gpu_index: int  # index of the GPU the process runs on
    pid: int
    name: str | NaType = NA  # process name as printed by mx-smi
    used_memory: int | NaType = NA  # GPU memory in bytes (converted from MiB)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MxSmiSnapshot:
    """A single ``mx-smi`` sample: all devices, processes, and versions."""

    devices: dict[int, DeviceInfo]  # keyed by GPU index
    processes: list[ProcessInfo]
    driver_version: str | NaType = NA  # kernel mode driver version
    maca_version: str | NaType = NA  # MACA runtime version
    mxsmi_version: str | NaType = NA  # version of the mx-smi tool itself
|
||||
|
||||
|
||||
# Exception hierarchy: MxSmiError is the root; callers may catch it to handle
# any mx-smi failure, or the subclasses for specific conditions.
class MxSmiError(RuntimeError):
    """Base exception for ``mx-smi`` query errors."""


class MxSmiNotFound(MxSmiError):
    """Raised when the ``mx-smi`` executable is not available."""


class MxSmiDeviceNotFound(MxSmiError):
    """Raised when a MetaX GPU device cannot be found."""
|
||||
|
||||
|
||||
# Environment variable that lets users force a particular GPU backend.
_BACKEND_ENVVAR = 'NVITOP_GPU_BACKEND'
# Snapshot cache: running `mx-smi` spawns a subprocess, so samples are reused
# for a short time-to-live (in seconds).
_CACHE_TTL = 0.25
_CACHE_LOCK = threading.RLock()
_CACHE: MxSmiSnapshot | None = None
_CACHE_EXPIRES_AT = 0.0  # time.monotonic() deadline of the cached snapshot

# A device line of `mx-smi -L`, e.g.:
#   GPU#0 <name> 0000:3f:00.0 <state> (UUID: ...)
_LIST_RE = re.compile(
    r'^GPU#(?P<index>\d+)\s+'
    r'(?P<name>.+?)\s+'
    r'(?P<bus_id>[0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.[0-9a-fA-F])\s+'
    r'(?P<state>.*?)\s+'
    r'\(UUID:\s*(?P<uuid>[^)]+)\)\s*$',
)
# Version banners that may appear in either output.
_MXSMI_VERSION_RE = re.compile(r'\bmx-smi\s+version:\s*(?P<version>\S+)', flags=re.IGNORECASE)
_DRIVER_VERSION_RE = re.compile(r'Kernel Mode Driver Version:\s*(?P<version>[^\s|]+)')
_MACA_VERSION_RE = re.compile(r'MACA Version:\s*(?P<version>[^\s|]+)')
# First cell of the first row of a summary-table device entry:
#   <index> <name> <persistence mode>
_SUMMARY_FIRST_RE = re.compile(
    r'^(?P<index>\d+)\s+(?P<name>.+?)\s+(?P<persistence>On|Off|Enable|Disable|Enabled|Disabled)\s*$',
)
# A percentage like `42 %` anywhere in a cell (GPU utilization column).
_GPU_UTIL_RE = re.compile(r'(?P<util>\d+(?:\.\d+)?)\s*%')
# First cell of the second row of a device entry:
#   <temp>C <power>W / <limit>W <pstate>
_SUMMARY_SECOND_RE = re.compile(
    r'^(?P<temperature>\d+(?:\.\d+)?)C\s+'
    r'(?P<power_usage>\d+(?:\.\d+)?)W\s*/\s*'
    r'(?P<power_limit>\d+(?:\.\d+)?)W\s+'
    r'(?P<performance_state>\S+)',
)
# Memory cell: `<used> / <total> MiB`.
_MEMORY_RE = re.compile(
    r'(?P<used>\d+(?:\.\d+)?)\s*/\s*(?P<total>\d+(?:\.\d+)?)\s*MiB',
    flags=re.IGNORECASE,
)
# A row of the trailing process table:
#   | <gpu> <pid> <name> <used MiB or N/A> |
_PROCESS_RE = re.compile(
    r'^\|\s*(?P<gpu_index>\d+)\s+'
    r'(?P<pid>\d+)\s+'
    r'(?P<name>.*?)\s+'
    r'(?P<used_memory>\d+(?:\.\d+)?|N/A)\s*\|\s*$',
)
|
||||
|
||||
|
||||
def is_forced() -> bool:
    """Return whether the MetaX backend was explicitly requested."""
    raw = os.getenv(_BACKEND_ENVVAR, default='')
    normalized = raw.strip().lower().replace('_', '-')
    return normalized in {'metax', 'mx-smi', 'mxsmi'}
|
||||
|
||||
|
||||
def is_available() -> bool:
    """Return whether ``mx-smi`` can see at least one MetaX GPU."""
    executable = shutil.which('mx-smi')
    if executable is None:
        return False
    try:
        count = device_count()
    except MxSmiError:
        return False
    return count > 0
|
||||
|
||||
|
||||
def device_count() -> int:
    """Return the number of MetaX GPUs visible to ``mx-smi``."""
    current = snapshot()
    return len(current.devices)
|
||||
|
||||
|
||||
def driver_version() -> str | NaType:
    """Return the MetaX kernel mode driver version."""
    current = snapshot()
    return current.driver_version
|
||||
|
||||
|
||||
def maca_version() -> str | NaType:
    """Return the MACA runtime version reported by ``mx-smi``."""
    current = snapshot()
    return current.maca_version
|
||||
|
||||
|
||||
def get_device(
    *,
    index: int | bytes | None = None,
    uuid: str | bytes | None = None,
    bus_id: str | bytes | None = None,
) -> DeviceInfo:
    """Return a MetaX device by index, UUID, or PCI bus ID.

    Exactly one of the keyword identifiers must be given.
    """
    provided = [arg for arg in (index, uuid, bus_id) if arg is not None]
    if len(provided) != 1:
        raise TypeError('get_device() expects exactly one identifier.')

    devices = snapshot().devices
    if index is not None:
        try:
            return devices[int(index)]
        except (KeyError, TypeError, ValueError) as ex:
            raise MxSmiDeviceNotFound(f'MetaX GPU index {index!r} was not found.') from ex

    identifier = _normalize_identifier(uuid if uuid is not None else bus_id)
    for device in devices.values():
        candidates = {_normalize_identifier(device.uuid), _normalize_identifier(device.bus_id)}
        if identifier in candidates:
            return device

    raise MxSmiDeviceNotFound(f'MetaX GPU {identifier!r} was not found.')
|
||||
|
||||
|
||||
def processes(index: int) -> list[ProcessInfo]:
    """Return processes reported by ``mx-smi`` for the given GPU index."""
    matching = []
    for process in snapshot().processes:
        if process.gpu_index == index:
            matching.append(process)
    return matching
|
||||
|
||||
|
||||
def snapshot(*, ttl: float = _CACHE_TTL) -> MxSmiSnapshot:
    """Take or return a cached ``mx-smi`` snapshot.

    Args:
        ttl: How long (in seconds) a freshly taken snapshot remains valid for
            subsequent callers.

    The subprocess work in ``_take_snapshot()`` runs *outside* the cache lock
    so a slow ``mx-smi`` invocation does not block readers; concurrent cache
    misses may therefore each run ``mx-smi``, which is harmless.
    """
    global _CACHE, _CACHE_EXPIRES_AT  # pylint: disable=global-statement

    now = time.monotonic()
    with _CACHE_LOCK:
        # Fast path: reuse the previous sample while it is still fresh.
        if _CACHE is not None and now < _CACHE_EXPIRES_AT:
            return _CACHE

    current = _take_snapshot()

    with _CACHE_LOCK:
        _CACHE = current
        _CACHE_EXPIRES_AT = time.monotonic() + ttl
        return _CACHE
|
||||
|
||||
|
||||
def clear_cache() -> None:
    """Clear the cached ``mx-smi`` snapshot so the next query re-samples."""
    global _CACHE, _CACHE_EXPIRES_AT  # pylint: disable=global-statement

    _CACHE_LOCK.acquire()
    try:
        _CACHE = None
        _CACHE_EXPIRES_AT = 0.0
    finally:
        _CACHE_LOCK.release()
|
||||
|
||||
|
||||
def _take_snapshot() -> MxSmiSnapshot:
    """Run ``mx-smi`` twice (listing + summary) and merge the results.

    ``mx-smi -L`` is the only source of UUIDs, while the plain summary output
    carries the live metrics.  For the descriptive fields present in both,
    the summary value wins unless it parsed as :const:`NA`; the metric fields
    are taken from the summary unconditionally.
    """
    listed_devices, listed_mxsmi_version = _parse_list_output(_run_mxsmi('-L'))
    summary = _parse_summary_output(_run_mxsmi())

    devices = listed_devices.copy()
    for index, device in summary.devices.items():
        # A device only present in the summary starts from an empty record.
        base = devices.get(index, DeviceInfo(index=index))
        devices[index] = replace(
            base,
            name=device.name if device.name is not NA else base.name,
            bus_id=device.bus_id if device.bus_id is not NA else base.bus_id,
            state=device.state if device.state is not NA else base.state,
            persistence_mode=(
                device.persistence_mode
                if device.persistence_mode is not NA
                else base.persistence_mode
            ),
            performance_state=(
                device.performance_state
                if device.performance_state is not NA
                else base.performance_state
            ),
            # Live metrics always come from the summary sample.
            memory_total=device.memory_total,
            memory_used=device.memory_used,
            memory_free=device.memory_free,
            gpu_utilization=device.gpu_utilization,
            memory_utilization=device.memory_utilization,
            temperature=device.temperature,
            power_usage=device.power_usage,
            power_limit=device.power_limit,
            fan_speed=device.fan_speed,
        )

    return MxSmiSnapshot(
        devices=devices,
        processes=summary.processes,
        driver_version=summary.driver_version,
        maca_version=summary.maca_version,
        mxsmi_version=summary.mxsmi_version if summary.mxsmi_version is not NA else listed_mxsmi_version,
    )
|
||||
|
||||
|
||||
def _run_mxsmi(*args: str, timeout: float = 10.0) -> str:
    """Run ``mx-smi`` with the given arguments and return its output.

    Args:
        args: Command-line arguments appended after the executable.
        timeout: Maximum number of seconds to wait for the command (was a
            hard-coded constant; now overridable, defaulting to the previous
            value for backward compatibility).

    Returns:
        The combined stdout/stderr text of the command.

    Raises:
        MxSmiNotFound: If the ``mx-smi`` executable cannot be located.
        MxSmiError: If the command cannot be started, times out, or exits
            with a non-zero status.
    """
    executable = shutil.which('mx-smi')
    if executable is None:
        raise MxSmiNotFound('The `mx-smi` executable was not found.')

    command = [executable, *args]
    try:
        completed = subprocess.run(  # noqa: S603
            command,
            check=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # interleave stderr so errors are captured
            text=True,
            encoding='utf-8',
            errors='replace',  # never fail on undecodable bytes in tool output
            timeout=timeout,
        )
    except (OSError, subprocess.SubprocessError) as ex:
        # Covers launch failures and TimeoutExpired alike.
        raise MxSmiError(f'Failed to run `{_command_to_string(command)}`.') from ex

    if completed.returncode != 0:
        output = completed.stdout.strip()
        message = f'`{_command_to_string(command)}` exited with status {completed.returncode}.'
        if output:
            message = f'{message}\n{output}'
        raise MxSmiError(message)

    return completed.stdout
|
||||
|
||||
|
||||
def _parse_list_output(output: str) -> tuple[dict[int, DeviceInfo], str | NaType]:
    """Parse ``mx-smi -L`` output into device records and the tool version.

    Returns:
        A pair ``(devices, mxsmi_version)`` where ``devices`` maps GPU index
        to a partially-filled :class:`DeviceInfo` (name/uuid/bus_id/state
        only) and ``mxsmi_version`` is the banner version or :const:`NA`.
    """
    devices: dict[int, DeviceInfo] = {}
    mxsmi_version: str | NaType = NA

    for line in output.splitlines():
        version_match = _MXSMI_VERSION_RE.search(line)
        if version_match is not None:
            mxsmi_version = version_match.group('version')
            continue  # a banner line cannot also be a device line

        match = _LIST_RE.match(line.strip())
        if match is None:
            continue

        index = int(match.group('index'))
        devices[index] = DeviceInfo(
            index=index,
            name=match.group('name').strip(),
            uuid=match.group('uuid').strip(),
            bus_id=match.group('bus_id').strip(),
            state=match.group('state').strip() or NA,  # empty state column -> NA
        )

    return devices, mxsmi_version
|
||||
|
||||
|
||||
def _parse_summary_output(output: str) -> MxSmiSnapshot:
    """Parse the default (no-argument) ``mx-smi`` summary output.

    The output consists of version/driver banner lines, a bordered device
    table where each device occupies two consecutive three-cell rows, and a
    trailing process table introduced by a ``| Process:`` header line.
    """
    devices: dict[int, DeviceInfo] = {}
    processes: list[ProcessInfo] = []
    driver_version: str | NaType = NA
    maca_version: str | NaType = NA
    mxsmi_version: str | NaType = NA
    lines = output.splitlines()

    # First pass: banner versions and the two-row device entries.
    for lineno, line in enumerate(lines):
        version_match = _MXSMI_VERSION_RE.search(line)
        if version_match is not None:
            mxsmi_version = version_match.group('version')

        driver_match = _DRIVER_VERSION_RE.search(line)
        if driver_match is not None:
            driver_version = driver_match.group('version')

        maca_match = _MACA_VERSION_RE.search(line)
        if maca_match is not None:
            maca_version = maca_match.group('version')

        parts = _split_table_line(line)
        if len(parts) != 3:
            continue

        first_match = _SUMMARY_FIRST_RE.match(parts[0])
        if first_match is None:
            continue

        # The second row of the entry holds thermal/power, memory, and state.
        try:
            next_parts = _split_table_line(lines[lineno + 1])
        except IndexError:
            continue
        if len(next_parts) != 3:
            continue

        second_match = _SUMMARY_SECOND_RE.match(next_parts[0])
        memory_match = _MEMORY_RE.search(next_parts[1])
        if second_match is None or memory_match is None:
            continue

        index = int(first_match.group('index'))
        memory_used = _mib_to_bytes(memory_match.group('used'))
        memory_total = _mib_to_bytes(memory_match.group('total'))
        # Free memory is derived; NA propagates if either figure is NA.
        memory_free = (
            memory_total - memory_used
            if isinstance(memory_total, int) and isinstance(memory_used, int)
            else NA
        )
        devices[index] = DeviceInfo(
            index=index,
            name=first_match.group('name').strip(),
            bus_id=parts[1].strip(),
            state=next_parts[2].strip() or NA,
            persistence_mode=_normalize_mode(first_match.group('persistence')),
            performance_state=second_match.group('performance_state'),
            memory_total=memory_total,
            memory_used=memory_used,
            memory_free=memory_free,
            gpu_utilization=_percent_to_int(parts[2]),
            temperature=round(float(second_match.group('temperature'))),
            power_usage=_watts_to_milliwatts(second_match.group('power_usage')),
            power_limit=_watts_to_milliwatts(second_match.group('power_limit')),
        )

    # Second pass: rows after the `| Process:` header are process entries.
    in_process_table = False
    for line in lines:
        if '| Process:' in line:
            in_process_table = True
            continue
        if not in_process_table:
            continue
        if 'no process found' in line.lower():
            continue

        process_match = _PROCESS_RE.match(line)
        if process_match is None:
            continue

        processes.append(
            ProcessInfo(
                gpu_index=int(process_match.group('gpu_index')),
                pid=int(process_match.group('pid')),
                name=process_match.group('name').strip() or NA,
                used_memory=_mib_to_bytes(process_match.group('used_memory')),
            ),
        )

    return MxSmiSnapshot(
        devices=devices,
        processes=processes,
        driver_version=driver_version,
        maca_version=maca_version,
        mxsmi_version=mxsmi_version,
    )
|
||||
|
||||
|
||||
def _split_table_line(line: str) -> list[str]:
|
||||
if not line.startswith('|'):
|
||||
return []
|
||||
return [part.strip() for part in line.strip().strip('|').split('|')]
|
||||
|
||||
|
||||
def _mib_to_bytes(value: str) -> int | NaType:
    """Convert a MiB figure (as printed by ``mx-smi``) to bytes; ``N/A`` -> NA."""
    if value.upper() == 'N/A':
        return NA
    mib = float(value)
    return round(mib * MiB)
|
||||
|
||||
|
||||
def _watts_to_milliwatts(value: str) -> int:
|
||||
return round(float(value) * 1000)
|
||||
|
||||
|
||||
def _percent_to_int(value: str) -> int | NaType:
|
||||
match = _GPU_UTIL_RE.search(value)
|
||||
if match is None:
|
||||
return NA
|
||||
return round(float(match.group('util')))
|
||||
|
||||
|
||||
def _normalize_mode(value: str) -> str | NaType:
|
||||
normalized = value.strip().lower()
|
||||
if normalized in {'on', 'enable', 'enabled'}:
|
||||
return 'Enabled'
|
||||
if normalized in {'off', 'disable', 'disabled'}:
|
||||
return 'Disabled'
|
||||
return NA
|
||||
|
||||
|
||||
def _normalize_identifier(value: str | bytes | NaType | None) -> str:
|
||||
if isinstance(value, bytes):
|
||||
value = value.decode('utf-8', errors='replace')
|
||||
if value is None or value is NA:
|
||||
return ''
|
||||
return str(value).strip().lower()
|
||||
|
||||
|
||||
def _command_to_string(command: list[str]) -> str:
|
||||
return ' '.join(command)
|
||||
|
|
@ -79,8 +79,15 @@ class DevicePanel(BasePanel): # pylint: disable=too-many-instance-attributes
|
|||
if self.device_count == 0:
|
||||
self.height = self.full_height = self.compact_height = 6
|
||||
|
||||
self.backend: str = Device.backend()
|
||||
self.driver_version: str = Device.driver_version()
|
||||
self.cuda_driver_version: str = Device.cuda_driver_version()
|
||||
self.driver_version_label: str = (
|
||||
'KMD Version' if self.backend == 'mx-smi' else 'Driver Version'
|
||||
)
|
||||
self.cuda_driver_version_label: str = (
|
||||
'MACA Version' if self.backend == 'mx-smi' else 'CUDA Driver Version'
|
||||
)
|
||||
|
||||
self._snapshot_buffer: list[Snapshot] = []
|
||||
self._snapshots: list[Snapshot] = []
|
||||
|
|
@ -226,8 +233,8 @@ class DevicePanel(BasePanel): # pylint: disable=too-many-instance-attributes
|
|||
|
||||
version_infos = [
|
||||
'NVITOP {}'.format(__version__.partition('+')[0]),
|
||||
f'Driver Version: {self.driver_version}',
|
||||
f'CUDA Driver Version: {self.cuda_driver_version}',
|
||||
f'{self.driver_version_label}: {self.driver_version}',
|
||||
f'{self.cuda_driver_version_label}: {self.cuda_driver_version}',
|
||||
]
|
||||
if sum(len(v) for v in version_infos) % 2 == 0:
|
||||
version_infos[0] += ' '
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue