fix(api/libnvml): fix process info support for NVIDIA R535 driver

This commit is contained in:
Xuehai Pan 2023-07-14 11:24:12 +08:00
parent 04ac6a0efe
commit ecb23a66c3
7 changed files with 352 additions and 271 deletions

View file

@ -25,7 +25,7 @@ repos:
- id: debug-statements
- id: double-quote-string-fixer
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.275
rev: v0.0.278
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
@ -34,11 +34,11 @@ repos:
hooks:
- id: isort
- repo: https://github.com/psf/black
rev: 23.3.0
rev: 23.7.0
hooks:
- id: black
- repo: https://github.com/asottile/pyupgrade
rev: v3.7.0
rev: v3.9.0
hooks:
- id: pyupgrade
args: [--py37-plus] # sync with requires-python

View file

@ -141,3 +141,5 @@ uptime
ot
oT
mypy
struct
MPS

View file

@ -128,7 +128,15 @@ from typing import (
from nvitop.api import libcuda, libcudart, libnvml
from nvitop.api.process import GpuProcess
from nvitop.api.utils import NA, NaType, Snapshot, boolify, bytes2human, memoize_when_activated
from nvitop.api.utils import (
NA,
UINT_MAX,
NaType,
Snapshot,
boolify,
bytes2human,
memoize_when_activated,
)
if TYPE_CHECKING:
@ -1682,8 +1690,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
pid=p.pid,
device=self,
gpu_memory=gpu_memory,
gpu_instance_id=getattr(p, 'gpuInstanceId', 0xFFFFFFFF),
compute_instance_id=getattr(p, 'computeInstanceId', 0xFFFFFFFF),
gpu_instance_id=getattr(p, 'gpuInstanceId', UINT_MAX),
compute_instance_id=getattr(p, 'computeInstanceId', UINT_MAX),
)
proc.type = proc.type + type
@ -2046,9 +2054,9 @@ class MigDevice(Device): # pylint: disable=too-many-instance-attributes
self._gpu_instance_id = libnvml.nvmlQuery(
'nvmlDeviceGetGpuInstanceId',
self.handle,
default=0xFFFFFFFF,
default=UINT_MAX,
)
if self._gpu_instance_id == 0xFFFFFFFF:
if self._gpu_instance_id == UINT_MAX:
self._gpu_instance_id = NA
return self._gpu_instance_id
@ -2062,9 +2070,9 @@ class MigDevice(Device): # pylint: disable=too-many-instance-attributes
self._compute_instance_id = libnvml.nvmlQuery(
'nvmlDeviceGetComputeInstanceId',
self.handle,
default=0xFFFFFFFF,
default=UINT_MAX,
)
if self._compute_instance_id == 0xFFFFFFFF:
if self._compute_instance_id == UINT_MAX:
self._compute_instance_id = NA
return self._compute_instance_id

View file

@ -22,7 +22,6 @@ from __future__ import annotations
import atexit as _atexit
import ctypes as _ctypes
import functools as _functools
import inspect as _inspect
import logging as _logging
import os as _os
@ -42,7 +41,7 @@ import pynvml as _pynvml
from pynvml import * # noqa: F403 # pylint: disable=wildcard-import,unused-wildcard-import
from pynvml import nvmlDeviceGetPciInfo # appease mypy # noqa: F401 # pylint: disable=unused-import
from nvitop.api.utils import NA
from nvitop.api.utils import NA, UINT_MAX, ULONGLONG_MAX
from nvitop.api.utils import colored as __colored
@ -52,6 +51,8 @@ if _TYPE_CHECKING:
__all__ = [ # will be updated in below
'NA',
'UINT_MAX',
'ULONGLONG_MAX',
'nvmlCheckReturn',
'nvmlQuery',
'nvmlInit',
@ -172,6 +173,8 @@ del (
# 5. Add explicit references to appease linters
# pylint: disable=no-member
c_nvmlDevice_t: _TypeAlias = _pynvml.c_nvmlDevice_t
NVML_SUCCESS: int = _pynvml.NVML_SUCCESS
NVML_ERROR_INSUFFICIENT_SIZE: int = _pynvml.NVML_ERROR_INSUFFICIENT_SIZE
NVMLError_FunctionNotFound: _TypeAlias = _pynvml.NVMLError_FunctionNotFound
NVMLError_GpuIsLost: _TypeAlias = _pynvml.NVMLError_GpuIsLost
NVMLError_InvalidArgument: _TypeAlias = _pynvml.NVMLError_InvalidArgument
@ -456,271 +459,343 @@ def nvmlCheckReturn(
# Patch layers for backward compatibility ##########################################################
__patched_backward_compatibility_layers: bool = False
def __patch_backward_compatibility_layers() -> None:
global __patched_backward_compatibility_layers # pylint: disable=global-statement
if __patched_backward_compatibility_layers:
return
function_name_mapping_lock = _threading.Lock()
function_name_mapping: dict[str, str] = {}
def function_mapping_update(mapping: dict[str, str]) -> dict[str, str]:
with function_name_mapping_lock:
mapping = dict(mapping)
for name, mapped_name in function_name_mapping.items():
if mapped_name in mapping:
mapping[name] = mapping[mapped_name]
function_name_mapping.update(mapping)
return mapping
def with_mapped_function_name() -> None:
def wrapper(
nvmlGetFunctionPointer: _Callable[[str], _ctypes._CFuncPtr], # type: ignore[name-defined]
) -> _Callable[[str], _ctypes._CFuncPtr]: # type: ignore[name-defined]
@_functools.wraps(nvmlGetFunctionPointer)
def wrapped(name: str) -> _ctypes._CFuncPtr: # type: ignore[name-defined]
mapped_name = function_name_mapping.get(name, name)
return nvmlGetFunctionPointer(mapped_name)
return wrapped
_pynvml.__dict__.update( # need to use module.__dict__.__setitem__ because module.__setattr__ will not work
_nvmlGetFunctionPointer=wrapper(
_pynvml._nvmlGetFunctionPointer, # pylint: disable=protected-access,no-member
),
)
def patch_function_pointers_when_fail(
names: set[str],
callback: _Callable[[str, set[str], Exception, _ModuleType, _ModuleType], str],
) -> _Callable[ # type: ignore[name-defined]
[_Callable[[str], _ctypes._CFuncPtr]],
_Callable[[str], _ctypes._CFuncPtr],
]:
def wrapper(
nvmlGetFunctionPointer: _Callable[[str], _ctypes._CFuncPtr], # type: ignore[name-defined]
) -> _Callable[[str], _ctypes._CFuncPtr]: # type: ignore[name-defined]
@_functools.wraps(nvmlGetFunctionPointer)
def wrapped(name: str) -> _ctypes._CFuncPtr: # type: ignore[name-defined]
try:
return nvmlGetFunctionPointer(name)
except NVMLError_FunctionNotFound as ex:
if name in names:
new_name = callback(name, names, ex, _pynvml, __modself)
return nvmlGetFunctionPointer(new_name)
raise
return wrapped
return wrapper
def patch_process_info() -> None:
# pylint: disable-next=protected-access,no-member
PrintableStructure = _pynvml._PrintableStructure
# pylint: disable-next=missing-class-docstring,too-few-public-methods
class c_nvmlProcessInfo_v1_t(PrintableStructure): # type: ignore[misc,valid-type]
_fields_: _ClassVar[list[tuple[str, type]]] = [
('pid', _ctypes.c_uint),
('usedGpuMemory', _ctypes.c_ulonglong),
]
_fmt_: _ClassVar[dict[str, str]] = {
'usedGpuMemory': '%d B',
}
# pylint: disable-next=missing-class-docstring,too-few-public-methods
class c_nvmlProcessInfo_v2_t(PrintableStructure): # type: ignore[misc,valid-type]
_fields_: _ClassVar[list[tuple[str, type]]] = [
('pid', _ctypes.c_uint),
('usedGpuMemory', _ctypes.c_ulonglong),
('gpuInstanceId', _ctypes.c_uint),
('computeInstanceId', _ctypes.c_uint),
]
_fmt_: _ClassVar[dict[str, str]] = {
'usedGpuMemory': '%d B',
}
nvmlDeviceGetRunningProcesses_v3_v2 = {
'nvmlDeviceGetComputeRunningProcesses_v3': 'nvmlDeviceGetComputeRunningProcesses_v2',
'nvmlDeviceGetGraphicsRunningProcesses_v3': 'nvmlDeviceGetGraphicsRunningProcesses_v2',
'nvmlDeviceGetMPSComputeRunningProcesses_v3': 'nvmlDeviceGetMPSComputeRunningProcesses_v2',
}
nvmlDeviceGetRunningProcesses_v2_v1 = {
'nvmlDeviceGetComputeRunningProcesses_v2': 'nvmlDeviceGetComputeRunningProcesses',
'nvmlDeviceGetGraphicsRunningProcesses_v2': 'nvmlDeviceGetGraphicsRunningProcesses',
'nvmlDeviceGetMPSComputeRunningProcesses_v2': 'nvmlDeviceGetMPSComputeRunningProcesses',
}
def patch_process_info_callback(
name: str,
names: set[str], # pylint: disable=unused-argument
exception: Exception,
pynvml: _ModuleType,
modself: _ModuleType,
) -> str:
if name in nvmlDeviceGetRunningProcesses_v3_v2:
mapping = nvmlDeviceGetRunningProcesses_v3_v2
struct_type = c_nvmlProcessInfo_v2_t
elif name in nvmlDeviceGetRunningProcesses_v2_v1:
mapping = nvmlDeviceGetRunningProcesses_v2_v1
struct_type = c_nvmlProcessInfo_v1_t
else:
raise exception # no fallbacks for v1 APIs
LOGGER.debug('Patching NVML function pointer `%s`', name)
mapping = function_mapping_update(mapping)
pynvml.__dict__.update(c_nvmlProcessInfo_t=struct_type)
modself.__dict__.update(c_nvmlProcessInfo_t=struct_type)
for old_name, mapped_name in mapping.items():
LOGGER.debug(' Map NVML function `%s` to `%s`', old_name, mapped_name)
LOGGER.debug(
' Patch NVML struct `c_nvmlProcessInfo_t` to `%s`',
struct_type.__name__,
)
return mapping[name]
_pynvml.__dict__.update( # need to use module.__dict__.__setitem__ because module.__setattr__ will not work
# The patching ordering is important: v3 -> v2 -> v1
_nvmlGetFunctionPointer=patch_function_pointers_when_fail(
names=set(nvmlDeviceGetRunningProcesses_v3_v2),
callback=patch_process_info_callback,
)(
patch_function_pointers_when_fail(
names=set(nvmlDeviceGetRunningProcesses_v2_v1),
callback=patch_process_info_callback,
)(
_pynvml._nvmlGetFunctionPointer, # pylint: disable=protected-access,no-member
),
),
)
with_mapped_function_name() # patch first and only for once
patch_process_info()
__patched_backward_compatibility_layers = True
_pynvml_installation_corrupted: bool = not callable(
getattr(_pynvml, '_nvmlGetFunctionPointer', None),
)
# Patch function `nvmlDeviceGet{Compute,Graphics,MPSCompute}RunningProcesses`
if not _pynvml_installation_corrupted:
__patch_backward_compatibility_layers()
del __patch_backward_compatibility_layers
# pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
class c_nvmlProcessInfo_v1_t(_pynvml._PrintableStructure): # pylint: disable=protected-access
_fields_: _ClassVar[list[tuple[str, type]]] = [
('pid', _ctypes.c_uint),
('usedGpuMemory', _ctypes.c_ulonglong),
]
_fmt_: _ClassVar[dict[str, str]] = {
'usedGpuMemory': '%d B',
}
# pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
class c_nvmlProcessInfo_v2_t(_pynvml._PrintableStructure): # pylint: disable=protected-access
_fields_: _ClassVar[list[tuple[str, type]]] = [
('pid', _ctypes.c_uint),
('usedGpuMemory', _ctypes.c_ulonglong),
('gpuInstanceId', _ctypes.c_uint),
('computeInstanceId', _ctypes.c_uint),
]
_fmt_: _ClassVar[dict[str, str]] = {
'usedGpuMemory': '%d B',
}
_pynvml_memory_v2_available: bool = hasattr(_pynvml, 'nvmlMemory_v2')
_pynvml_get_memory_info_v2_available: bool = _pynvml_memory_v2_available
_driver_get_memory_info_v2_available: bool | None = (
None if not _pynvml_installation_corrupted else False
)
# pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
class c_nvmlProcessInfo_v3_t(_pynvml._PrintableStructure): # pylint: disable=protected-access
_fields_: _ClassVar[list[tuple[str, type]]] = [
('pid', _ctypes.c_uint),
('usedGpuMemory', _ctypes.c_ulonglong),
('gpuInstanceId', _ctypes.c_uint),
('computeInstanceId', _ctypes.c_uint),
('usedGpuCcProtectedMemory', _ctypes.c_ulonglong),
]
_fmt_: _ClassVar[dict[str, str]] = {
'usedGpuMemory': '%d B',
'usedGpuCcProtectedMemory': '%d B',
}
__get_running_process_version_suffix = None
c_nvmlProcessInfo_t = c_nvmlProcessInfo_v3_t
# pylint: disable-next=function-redefined,too-many-branches
def nvmlDeviceGetMemoryInfo(handle: c_nvmlDevice_t) -> _pynvml.c_nvmlMemory_t:
"""Retrieve the amount of used, free, reserved and total memory available on the device, in bytes.
def __determine_get_running_process_version_suffix() -> str:
global __get_running_process_version_suffix, c_nvmlProcessInfo_t # pylint: disable=global-statement
Note:
- The version 2 API adds additional memory information. The reserved amount is supported on
version 2 only.
- In MIG mode, if device handle is provided, the API returns aggregate information, only if
the caller has appropriate privileges. Per-instance information can be queried by using
specific MIG device handles.
Raises:
NVMLError_InvalidArgument:
If the library has not been successfully initialized.
NVMLError_NoPermission:
If the user doesn't have permission to perform this operation.
NVMLError_InvalidArgument:
If device is invalid or memory is NULL.
NVMLError_GpuIsLost:
If the target GPU has fallen off the bus or is otherwise inaccessible.
NVMLError_Unknown:
On any unexpected error.
"""
global _pynvml_get_memory_info_v2_available, _driver_get_memory_info_v2_available # pylint: disable=global-statement
_lazy_init()
if _driver_get_memory_info_v2_available is None:
try:
if __get_running_process_version_suffix is None:
# pylint: disable-next=protected-access,no-member
_pynvml._nvmlGetFunctionPointer('nvmlDeviceGetMemoryInfo_v2')
except NVMLError_FunctionNotFound:
with __lock:
_driver_get_memory_info_v2_available = False
_pynvml_get_memory_info_v2_available = False
else:
with __lock:
_driver_get_memory_info_v2_available = True
if _driver_get_memory_info_v2_available:
if _pynvml_memory_v2_available:
# driver ✔ pynvml ?
_nvmlGetFunctionPointer = _pynvml._nvmlGetFunctionPointer
__get_running_process_version_suffix = '_v3'
try:
_nvmlGetFunctionPointer('nvmlDeviceGetConfComputeMemSizeInfo')
except NVMLError_FunctionNotFound:
c_nvmlProcessInfo_t = c_nvmlProcessInfo_v2_t
LOGGER.debug(
'NVML get running process version 3 API with v3 type struct is not available '
'due to incompatible NVIDIA driver. Fallback to use get running process '
'version 3 API with v2 type struct.',
)
try:
# pylint: disable-next=unexpected-keyword-arg,no-member
retval = _pynvml.nvmlDeviceGetMemoryInfo(handle, version=_pynvml.nvmlMemory_v2)
except TypeError as ex:
if 'unexpected keyword argument' in str(ex).lower():
# driver ✔ pynvml ✘
with __lock:
_pynvml_get_memory_info_v2_available = False
_nvmlGetFunctionPointer('nvmlDeviceGetComputeRunningProcesses_v3')
except NVMLError_FunctionNotFound:
__get_running_process_version_suffix = '_v2'
LOGGER.debug(
'NVML get running process version 3 API with v2 type struct is not '
'available due to incompatible NVIDIA driver. Fallback to use get running '
'process version 2 API with v2 type struct.',
)
try:
_nvmlGetFunctionPointer('nvmlDeviceGetComputeRunningProcesses_v2')
except NVMLError_FunctionNotFound:
c_nvmlProcessInfo_t = c_nvmlProcessInfo_v1_t
__get_running_process_version_suffix = ''
LOGGER.debug(
'NVML memory info version 2 is not available '
'due to incompatible `nvidia-ml-py` package.',
'NVML get running process version 2 API with v2 type struct is not '
'available due to incompatible NVIDIA driver. Fallback to use get '
'running process version 1 API with v1 type struct.',
)
else:
# driver ✔ pynvml ? user ✘
with __lock:
_driver_get_memory_info_v2_available = (
None # unset the flag for user exceptions
)
raise
except (NVMLError_FunctionNotFound, NVMLError_Unknown):
# driver ✔ pynvml ✘
with __lock:
_pynvml_get_memory_info_v2_available = False
LOGGER.debug(
'NVML memory info version 2 is not available '
'due to incompatible NVIDIA driver.',
)
LOGGER.debug(
'NVML get running process version 2 API with v2 type struct is '
'available.',
)
else:
# driver ✔ pynvml ✔
LOGGER.debug('NVML memory info version 2 is available.')
return retval
LOGGER.debug(
'NVML get running process version 3 API with v2 type struct is available.',
)
else:
# driver ✔ pynvml ✘
LOGGER.debug(
'NVML constant `nvmlMemory_v2` not found in package `nvidia-ml-py`, but '
'your NVIDIA driver does support the NVML memory info version 2 APIs. NVML '
'memory info version 2 is not available due to the legacy dependencies. '
'Please consider upgrading your `nvidia-ml-py` package by running '
'`pip3 install --upgrade nvitop nvidia-ml-py`.',
'NVML get running process version 3 API with v3 type struct is available.',
)
elif _pynvml_memory_v2_available:
# driver ✘ pynvml ?
LOGGER.debug(
'NVML memory info version 2 is not available due to incompatible NVIDIA driver.',
)
return __get_running_process_version_suffix
def __nvml_device_get_running_processes(
func: str,
handle: c_nvmlDevice_t,
) -> list[c_nvmlProcessInfo_t]:
"""Helper function for :func:`nvmlDeviceGet{Compute,Graphics,MPSCompute}RunningProcesses`.
Modified from function :func:`pynvml.nvmlDeviceGetComputeRunningProcesses` in package
`nvidia-ml-py <https://pypi.org/project/nvidia-ml-py>`_.
"""
version_suffix = __determine_get_running_process_version_suffix()
# First call to get the size
c_count = _ctypes.c_uint(0)
# pylint: disable-next=protected-access
fn = _pynvml._nvmlGetFunctionPointer(f'{func}{version_suffix}')
ret = fn(handle, _ctypes.byref(c_count), None)
if ret == NVML_SUCCESS:
# Special case, no running processes
return []
if ret == NVML_ERROR_INSUFFICIENT_SIZE:
# Typical case
# Oversize the array in case more processes are created
c_count.value = c_count.value * 2 + 5
process_array = c_nvmlProcessInfo_t * c_count.value # type: ignore[operator]
c_processes = process_array() # type: ignore[operator]
# Make the call again
ret = fn(handle, _ctypes.byref(c_count), c_processes)
_pynvml._nvmlCheckReturn(ret) # pylint: disable=protected-access
processes = []
for i in range(c_count.value):
# Use an alternative struct for this object
obj = _pynvml.nvmlStructToFriendlyObject(c_processes[i])
if obj.usedGpuMemory == ULONGLONG_MAX:
# Special case for WDDM on Windows, see comment above
obj.usedGpuMemory = None
if getattr(obj, 'usedGpuCcProtectedMemory', None) == ULONGLONG_MAX:
obj.usedGpuCcProtectedMemory = None
processes.append(obj)
return processes
# Error case
raise NVMLError(ret)
def nvmlDeviceGetComputeRunningProcesses( # pylint: disable=function-redefined
handle: c_nvmlDevice_t,
) -> list[c_nvmlProcessInfo_t]:
"""Get information about processes with a compute context on a device.
Note:
- In MIG mode, if device handle is provided, the API returns aggregate information, only
if the caller has appropriate privileges. Per-instance information can be queried by
using specific MIG device handles.
Raises:
NVMLError_InvalidArgument:
If the library has not been successfully initialized.
NVMLError_Uninitialized:
If NVML was not first initialized with :func:`nvmlInit`.
NVMLError_NoPermission:
If the user doesn't have permission to perform this operation.
NVMLError_InvalidArgument:
If device is invalid.
NVMLError_GpuIsLost:
If the target GPU has fallen off the bus or is otherwise inaccessible.
NVMLError_Unknown:
On any unexpected error.
"""
return __nvml_device_get_running_processes(
'nvmlDeviceGetComputeRunningProcesses',
handle,
)
def nvmlDeviceGetGraphicsRunningProcesses( # pylint: disable=function-redefined
handle: c_nvmlDevice_t,
) -> list[c_nvmlProcessInfo_t]:
"""Get information about processes with a graphics context on a device.
Note:
- In MIG mode, if device handle is provided, the API returns aggregate information, only
if the caller has appropriate privileges. Per-instance information can be queried by
using specific MIG device handles.
Raises:
NVMLError_InvalidArgument:
If the library has not been successfully initialized.
NVMLError_Uninitialized:
If NVML was not first initialized with :func:`nvmlInit`.
NVMLError_NoPermission:
If the user doesn't have permission to perform this operation.
NVMLError_InvalidArgument:
If device is invalid.
NVMLError_GpuIsLost:
If the target GPU has fallen off the bus or is otherwise inaccessible.
NVMLError_Unknown:
On any unexpected error.
"""
return __nvml_device_get_running_processes(
'nvmlDeviceGetGraphicsRunningProcesses',
handle,
)
def nvmlDeviceGetMPSComputeRunningProcesses( # pylint: disable=function-redefined
handle: c_nvmlDevice_t,
) -> list[c_nvmlProcessInfo_t]:
"""Get information about processes with a MPS compute context on a device.
Note:
- In MIG mode, if device handle is provided, the API returns aggregate information, only
if the caller has appropriate privileges. Per-instance information can be queried by
using specific MIG device handles.
Raises:
NVMLError_InvalidArgument:
If the library has not been successfully initialized.
NVMLError_Uninitialized:
If NVML was not first initialized with :func:`nvmlInit`.
NVMLError_NoPermission:
If the user doesn't have permission to perform this operation.
NVMLError_InvalidArgument:
If device is invalid.
NVMLError_GpuIsLost:
If the target GPU has fallen off the bus or is otherwise inaccessible.
NVMLError_Unknown:
On any unexpected error.
"""
return __nvml_device_get_running_processes(
'nvmlDeviceGetMPSComputeRunningProcesses',
handle,
)
else:
LOGGER.warning(
'Your installed package `nvidia-ml-py` is corrupted. '
'Skip patch functions `nvmlDeviceGet{Compute,Graphics,MPSCompute}RunningProcesses`. '
'You may get incorrect or incomplete results. Please consider reinstall package '
'`nvidia-ml-py` via `pip3 install --force-reinstall nvidia-ml-py nvitop`.',
)
# Patch function `nvmlDeviceGetMemoryInfo`
if not _pynvml_installation_corrupted:
# pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
class c_nvmlMemory_v1_t(_pynvml._PrintableStructure): # pylint: disable=protected-access
_fields_: _ClassVar[list[tuple[str, type]]] = [
('total', _pynvml.c_ulonglong),
('free', _pynvml.c_ulonglong),
('used', _pynvml.c_ulonglong),
]
_fmt_: _ClassVar[dict[str, str]] = {'<default>': '%d B'}
# pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
class c_nvmlMemory_v2_t(_pynvml._PrintableStructure): # pylint: disable=protected-access
_fields_: _ClassVar[list[tuple[str, type]]] = [
('version', _pynvml.c_uint),
('total', _pynvml.c_ulonglong),
('reserved', _pynvml.c_ulonglong),
('free', _pynvml.c_ulonglong),
('used', _pynvml.c_ulonglong),
]
_fmt_: _ClassVar[dict[str, str]] = {'<default>': '%d B'}
nvmlMemory_v2 = getattr(_pynvml, 'nvmlMemory_v2', _ctypes.sizeof(c_nvmlMemory_v2_t) | 2 << 24)
__get_memory_info_version_suffix = None
c_nvmlMemory_t = c_nvmlMemory_v2_t
def __determine_get_memory_info_version_suffix() -> str:
global __get_memory_info_version_suffix, c_nvmlMemory_t # pylint: disable=global-statement
if __get_memory_info_version_suffix is None:
# pylint: disable-next=protected-access,no-member
_nvmlGetFunctionPointer = _pynvml._nvmlGetFunctionPointer
__get_memory_info_version_suffix = '_v2'
try:
_nvmlGetFunctionPointer('nvmlDeviceGetMemoryInfo_v2')
except NVMLError_FunctionNotFound:
c_nvmlMemory_t = c_nvmlMemory_v1_t
__get_memory_info_version_suffix = ''
LOGGER.debug(
'NVML get memory info version 2 API is not available due to incompatible '
'NVIDIA driver. Fallback to use NVML get memory info version 1 API.',
)
else:
LOGGER.debug('NVML get memory info version 2 is available.')
return __get_memory_info_version_suffix
def nvmlDeviceGetMemoryInfo( # pylint: disable=function-redefined
handle: c_nvmlDevice_t,
) -> c_nvmlMemory_t:
"""Retrieve the amount of used, free, reserved and total memory available on the device, in bytes.
Note:
- The version 2 API adds additional memory information. The reserved amount is supported
on version 2 only.
- In MIG mode, if device handle is provided, the API returns aggregate information, only
if the caller has appropriate privileges. Per-instance information can be queried by
using specific MIG device handles.
Raises:
NVMLError_InvalidArgument:
If the library has not been successfully initialized.
NVMLError_Uninitialized:
If NVML was not first initialized with :func:`nvmlInit`.
NVMLError_NoPermission:
If the user doesn't have permission to perform this operation.
NVMLError_InvalidArgument:
If device is invalid.
NVMLError_GpuIsLost:
If the target GPU has fallen off the bus or is otherwise inaccessible.
NVMLError_Unknown:
On any unexpected error.
"""
version_suffix = __determine_get_memory_info_version_suffix()
if version_suffix == '_v2':
c_memory = c_nvmlMemory_v2_t()
c_memory.version = nvmlMemory_v2 # pylint: disable=attribute-defined-outside-init
# pylint: disable-next=protected-access
fn = _pynvml._nvmlGetFunctionPointer('nvmlDeviceGetMemoryInfo_v2')
elif version_suffix in {'_v1', ''}:
c_memory = c_nvmlMemory_v1_t()
# pylint: disable-next=protected-access
fn = _pynvml._nvmlGetFunctionPointer('nvmlDeviceGetMemoryInfo')
else:
# driver ✘ pynvml ✘
LOGGER.debug(
'NVML constant `nvmlMemory_v2` not found in package `nvidia-ml-py`, and '
'your NVIDIA driver does not support the NVML memory info version 2 APIs. '
'NVML memory info version 2 is not available.',
raise ValueError(
f'Unknown version suffix {version_suffix!r} for '
'function `nvmlDeviceGetMemoryInfo`.',
)
ret = fn(handle, _ctypes.byref(c_memory))
_pynvml._nvmlCheckReturn(ret) # pylint: disable=protected-access
return c_memory
elif _pynvml_get_memory_info_v2_available:
# pylint: disable-next=unexpected-keyword-arg
return _pynvml.nvmlDeviceGetMemoryInfo(handle, version=_pynvml.nvmlMemory_v2)
return _pynvml.nvmlDeviceGetMemoryInfo(handle)
else:
LOGGER.warning(
'Your installed package `nvidia-ml-py` is corrupted. '
'Skip patch functions `nvmlDeviceGetMemoryInfo`. '
'You may get incorrect or incomplete results. Please consider reinstall package '
'`nvidia-ml-py` via `pip3 install --force-reinstall nvidia-ml-py nvitop`.',
)
# Add support for lookup fallback and context manager ##############################################

View file

@ -33,6 +33,7 @@ from weakref import WeakValueDictionary
from nvitop.api import host, libnvml
from nvitop.api.utils import (
NA,
UINT_MAX,
NaType,
Snapshot,
bytes2human,
@ -517,9 +518,9 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
self.type = type
if gpu_instance_id is not None and compute_instance_id is not None:
self._gpu_instance_id = gpu_instance_id if gpu_instance_id != 0xFFFFFFFF else NA
self._gpu_instance_id = gpu_instance_id if gpu_instance_id != UINT_MAX else NA
self._compute_instance_id = (
compute_instance_id if compute_instance_id != 0xFFFFFFFF else NA
compute_instance_id if compute_instance_id != UINT_MAX else NA
)
elif device.is_mig_device():
self._gpu_instance_id = device.gpu_instance_id()

View file

@ -20,6 +20,7 @@
from __future__ import annotations
import ctypes
import datetime
import functools
import math
@ -38,6 +39,8 @@ __all__ = [
'NaType',
'NotApplicable',
'NotApplicableType',
'UINT_MAX',
'ULONGLONG_MAX',
'KiB',
'MiB',
'GiB',
@ -479,6 +482,11 @@ NA.__doc__ = """The singleton instance of :class:`NaType`. The actual value is :
NotApplicable = NA
UINT_MAX: int = ctypes.c_uint(-1).value # 0xFFFFFFFF
"""The maximum value of :class:`ctypes.c_uint`."""
ULONGLONG_MAX: int = ctypes.c_ulonglong(-1).value # 0XFFFFFFFFFFFFFFFF
"""The maximum value of :class:`ctypes.c_ulonglong`."""
KiB: int = 1 << 10
"""Kibibyte (1024)"""

View file

@ -320,7 +320,7 @@ def main() -> int:
if len(invalid_indices) > 1:
messages.append(f'ERROR: Invalid device indices: {sorted(invalid_indices)}.')
elif len(invalid_indices) == 1:
messages.append(f'ERROR: Invalid device index: {list(invalid_indices)[0]}.')
messages.append(f'ERROR: Invalid device index: {next(iter(invalid_indices))}.')
elif args.only_visible:
indices = {
index if isinstance(index, int) else index[0]
@ -436,19 +436,6 @@ def main() -> int:
)
messages.append(message)
# pylint: disable-next=protected-access
if libnvml._driver_get_memory_info_v2_available and not libnvml._pynvml_memory_v2_available:
message = '\n'.join(
(
'WARNING: The `nvidia-ml-py` package does not support the NVML memory info version 2 APIs, which would',
'get inaccurate results. Please upgrade it via:',
'',
' pip3 install --upgrade nvitop nvidia-ml-py',
'',
),
)
messages.append(message)
if len(messages) > 0:
for message in messages:
if message.startswith('ERROR:'):