mirror of
https://github.com/XuehaiPan/nvitop.git
synced 2026-05-21 06:45:24 -06:00
fix(api/libnvml): fix process info support for NVIDIA R535 driver
This commit is contained in:
parent
04ac6a0efe
commit
ecb23a66c3
7 changed files with 352 additions and 271 deletions
|
|
@ -25,7 +25,7 @@ repos:
|
|||
- id: debug-statements
|
||||
- id: double-quote-string-fixer
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.0.275
|
||||
rev: v0.0.278
|
||||
hooks:
|
||||
- id: ruff
|
||||
args: [--fix, --exit-non-zero-on-fix]
|
||||
|
|
@ -34,11 +34,11 @@ repos:
|
|||
hooks:
|
||||
- id: isort
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 23.3.0
|
||||
rev: 23.7.0
|
||||
hooks:
|
||||
- id: black
|
||||
- repo: https://github.com/asottile/pyupgrade
|
||||
rev: v3.7.0
|
||||
rev: v3.9.0
|
||||
hooks:
|
||||
- id: pyupgrade
|
||||
args: [--py37-plus] # sync with requires-python
|
||||
|
|
|
|||
|
|
@ -141,3 +141,5 @@ uptime
|
|||
ot
|
||||
oT
|
||||
mypy
|
||||
struct
|
||||
MPS
|
||||
|
|
|
|||
|
|
@ -128,7 +128,15 @@ from typing import (
|
|||
|
||||
from nvitop.api import libcuda, libcudart, libnvml
|
||||
from nvitop.api.process import GpuProcess
|
||||
from nvitop.api.utils import NA, NaType, Snapshot, boolify, bytes2human, memoize_when_activated
|
||||
from nvitop.api.utils import (
|
||||
NA,
|
||||
UINT_MAX,
|
||||
NaType,
|
||||
Snapshot,
|
||||
boolify,
|
||||
bytes2human,
|
||||
memoize_when_activated,
|
||||
)
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
|
|
@ -1682,8 +1690,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
pid=p.pid,
|
||||
device=self,
|
||||
gpu_memory=gpu_memory,
|
||||
gpu_instance_id=getattr(p, 'gpuInstanceId', 0xFFFFFFFF),
|
||||
compute_instance_id=getattr(p, 'computeInstanceId', 0xFFFFFFFF),
|
||||
gpu_instance_id=getattr(p, 'gpuInstanceId', UINT_MAX),
|
||||
compute_instance_id=getattr(p, 'computeInstanceId', UINT_MAX),
|
||||
)
|
||||
proc.type = proc.type + type
|
||||
|
||||
|
|
@ -2046,9 +2054,9 @@ class MigDevice(Device): # pylint: disable=too-many-instance-attributes
|
|||
self._gpu_instance_id = libnvml.nvmlQuery(
|
||||
'nvmlDeviceGetGpuInstanceId',
|
||||
self.handle,
|
||||
default=0xFFFFFFFF,
|
||||
default=UINT_MAX,
|
||||
)
|
||||
if self._gpu_instance_id == 0xFFFFFFFF:
|
||||
if self._gpu_instance_id == UINT_MAX:
|
||||
self._gpu_instance_id = NA
|
||||
return self._gpu_instance_id
|
||||
|
||||
|
|
@ -2062,9 +2070,9 @@ class MigDevice(Device): # pylint: disable=too-many-instance-attributes
|
|||
self._compute_instance_id = libnvml.nvmlQuery(
|
||||
'nvmlDeviceGetComputeInstanceId',
|
||||
self.handle,
|
||||
default=0xFFFFFFFF,
|
||||
default=UINT_MAX,
|
||||
)
|
||||
if self._compute_instance_id == 0xFFFFFFFF:
|
||||
if self._compute_instance_id == UINT_MAX:
|
||||
self._compute_instance_id = NA
|
||||
return self._compute_instance_id
|
||||
|
||||
|
|
|
|||
|
|
@ -22,7 +22,6 @@ from __future__ import annotations
|
|||
|
||||
import atexit as _atexit
|
||||
import ctypes as _ctypes
|
||||
import functools as _functools
|
||||
import inspect as _inspect
|
||||
import logging as _logging
|
||||
import os as _os
|
||||
|
|
@ -42,7 +41,7 @@ import pynvml as _pynvml
|
|||
from pynvml import * # noqa: F403 # pylint: disable=wildcard-import,unused-wildcard-import
|
||||
from pynvml import nvmlDeviceGetPciInfo # appease mypy # noqa: F401 # pylint: disable=unused-import
|
||||
|
||||
from nvitop.api.utils import NA
|
||||
from nvitop.api.utils import NA, UINT_MAX, ULONGLONG_MAX
|
||||
from nvitop.api.utils import colored as __colored
|
||||
|
||||
|
||||
|
|
@ -52,6 +51,8 @@ if _TYPE_CHECKING:
|
|||
|
||||
__all__ = [ # will be updated in below
|
||||
'NA',
|
||||
'UINT_MAX',
|
||||
'ULONGLONG_MAX',
|
||||
'nvmlCheckReturn',
|
||||
'nvmlQuery',
|
||||
'nvmlInit',
|
||||
|
|
@ -172,6 +173,8 @@ del (
|
|||
# 5. Add explicit references to appease linters
|
||||
# pylint: disable=no-member
|
||||
c_nvmlDevice_t: _TypeAlias = _pynvml.c_nvmlDevice_t
|
||||
NVML_SUCCESS: int = _pynvml.NVML_SUCCESS
|
||||
NVML_ERROR_INSUFFICIENT_SIZE: int = _pynvml.NVML_ERROR_INSUFFICIENT_SIZE
|
||||
NVMLError_FunctionNotFound: _TypeAlias = _pynvml.NVMLError_FunctionNotFound
|
||||
NVMLError_GpuIsLost: _TypeAlias = _pynvml.NVMLError_GpuIsLost
|
||||
NVMLError_InvalidArgument: _TypeAlias = _pynvml.NVMLError_InvalidArgument
|
||||
|
|
@ -456,271 +459,343 @@ def nvmlCheckReturn(
|
|||
|
||||
|
||||
# Patch layers for backward compatibility ##########################################################
|
||||
__patched_backward_compatibility_layers: bool = False
|
||||
|
||||
|
||||
def __patch_backward_compatibility_layers() -> None:
|
||||
global __patched_backward_compatibility_layers # pylint: disable=global-statement
|
||||
|
||||
if __patched_backward_compatibility_layers:
|
||||
return
|
||||
|
||||
function_name_mapping_lock = _threading.Lock()
|
||||
function_name_mapping: dict[str, str] = {}
|
||||
|
||||
def function_mapping_update(mapping: dict[str, str]) -> dict[str, str]:
|
||||
with function_name_mapping_lock:
|
||||
mapping = dict(mapping)
|
||||
for name, mapped_name in function_name_mapping.items():
|
||||
if mapped_name in mapping:
|
||||
mapping[name] = mapping[mapped_name]
|
||||
function_name_mapping.update(mapping)
|
||||
return mapping
|
||||
|
||||
def with_mapped_function_name() -> None:
|
||||
def wrapper(
|
||||
nvmlGetFunctionPointer: _Callable[[str], _ctypes._CFuncPtr], # type: ignore[name-defined]
|
||||
) -> _Callable[[str], _ctypes._CFuncPtr]: # type: ignore[name-defined]
|
||||
@_functools.wraps(nvmlGetFunctionPointer)
|
||||
def wrapped(name: str) -> _ctypes._CFuncPtr: # type: ignore[name-defined]
|
||||
mapped_name = function_name_mapping.get(name, name)
|
||||
return nvmlGetFunctionPointer(mapped_name)
|
||||
|
||||
return wrapped
|
||||
|
||||
_pynvml.__dict__.update( # need to use module.__dict__.__setitem__ because module.__setattr__ will not work
|
||||
_nvmlGetFunctionPointer=wrapper(
|
||||
_pynvml._nvmlGetFunctionPointer, # pylint: disable=protected-access,no-member
|
||||
),
|
||||
)
|
||||
|
||||
def patch_function_pointers_when_fail(
|
||||
names: set[str],
|
||||
callback: _Callable[[str, set[str], Exception, _ModuleType, _ModuleType], str],
|
||||
) -> _Callable[ # type: ignore[name-defined]
|
||||
[_Callable[[str], _ctypes._CFuncPtr]],
|
||||
_Callable[[str], _ctypes._CFuncPtr],
|
||||
]:
|
||||
def wrapper(
|
||||
nvmlGetFunctionPointer: _Callable[[str], _ctypes._CFuncPtr], # type: ignore[name-defined]
|
||||
) -> _Callable[[str], _ctypes._CFuncPtr]: # type: ignore[name-defined]
|
||||
@_functools.wraps(nvmlGetFunctionPointer)
|
||||
def wrapped(name: str) -> _ctypes._CFuncPtr: # type: ignore[name-defined]
|
||||
try:
|
||||
return nvmlGetFunctionPointer(name)
|
||||
except NVMLError_FunctionNotFound as ex:
|
||||
if name in names:
|
||||
new_name = callback(name, names, ex, _pynvml, __modself)
|
||||
return nvmlGetFunctionPointer(new_name)
|
||||
raise
|
||||
|
||||
return wrapped
|
||||
|
||||
return wrapper
|
||||
|
||||
def patch_process_info() -> None:
|
||||
# pylint: disable-next=protected-access,no-member
|
||||
PrintableStructure = _pynvml._PrintableStructure
|
||||
|
||||
# pylint: disable-next=missing-class-docstring,too-few-public-methods
|
||||
class c_nvmlProcessInfo_v1_t(PrintableStructure): # type: ignore[misc,valid-type]
|
||||
_fields_: _ClassVar[list[tuple[str, type]]] = [
|
||||
('pid', _ctypes.c_uint),
|
||||
('usedGpuMemory', _ctypes.c_ulonglong),
|
||||
]
|
||||
_fmt_: _ClassVar[dict[str, str]] = {
|
||||
'usedGpuMemory': '%d B',
|
||||
}
|
||||
|
||||
# pylint: disable-next=missing-class-docstring,too-few-public-methods
|
||||
class c_nvmlProcessInfo_v2_t(PrintableStructure): # type: ignore[misc,valid-type]
|
||||
_fields_: _ClassVar[list[tuple[str, type]]] = [
|
||||
('pid', _ctypes.c_uint),
|
||||
('usedGpuMemory', _ctypes.c_ulonglong),
|
||||
('gpuInstanceId', _ctypes.c_uint),
|
||||
('computeInstanceId', _ctypes.c_uint),
|
||||
]
|
||||
_fmt_: _ClassVar[dict[str, str]] = {
|
||||
'usedGpuMemory': '%d B',
|
||||
}
|
||||
|
||||
nvmlDeviceGetRunningProcesses_v3_v2 = {
|
||||
'nvmlDeviceGetComputeRunningProcesses_v3': 'nvmlDeviceGetComputeRunningProcesses_v2',
|
||||
'nvmlDeviceGetGraphicsRunningProcesses_v3': 'nvmlDeviceGetGraphicsRunningProcesses_v2',
|
||||
'nvmlDeviceGetMPSComputeRunningProcesses_v3': 'nvmlDeviceGetMPSComputeRunningProcesses_v2',
|
||||
}
|
||||
nvmlDeviceGetRunningProcesses_v2_v1 = {
|
||||
'nvmlDeviceGetComputeRunningProcesses_v2': 'nvmlDeviceGetComputeRunningProcesses',
|
||||
'nvmlDeviceGetGraphicsRunningProcesses_v2': 'nvmlDeviceGetGraphicsRunningProcesses',
|
||||
'nvmlDeviceGetMPSComputeRunningProcesses_v2': 'nvmlDeviceGetMPSComputeRunningProcesses',
|
||||
}
|
||||
|
||||
def patch_process_info_callback(
|
||||
name: str,
|
||||
names: set[str], # pylint: disable=unused-argument
|
||||
exception: Exception,
|
||||
pynvml: _ModuleType,
|
||||
modself: _ModuleType,
|
||||
) -> str:
|
||||
if name in nvmlDeviceGetRunningProcesses_v3_v2:
|
||||
mapping = nvmlDeviceGetRunningProcesses_v3_v2
|
||||
struct_type = c_nvmlProcessInfo_v2_t
|
||||
elif name in nvmlDeviceGetRunningProcesses_v2_v1:
|
||||
mapping = nvmlDeviceGetRunningProcesses_v2_v1
|
||||
struct_type = c_nvmlProcessInfo_v1_t
|
||||
else:
|
||||
raise exception # no fallbacks for v1 APIs
|
||||
|
||||
LOGGER.debug('Patching NVML function pointer `%s`', name)
|
||||
mapping = function_mapping_update(mapping)
|
||||
pynvml.__dict__.update(c_nvmlProcessInfo_t=struct_type)
|
||||
modself.__dict__.update(c_nvmlProcessInfo_t=struct_type)
|
||||
|
||||
for old_name, mapped_name in mapping.items():
|
||||
LOGGER.debug(' Map NVML function `%s` to `%s`', old_name, mapped_name)
|
||||
LOGGER.debug(
|
||||
' Patch NVML struct `c_nvmlProcessInfo_t` to `%s`',
|
||||
struct_type.__name__,
|
||||
)
|
||||
return mapping[name]
|
||||
|
||||
_pynvml.__dict__.update( # need to use module.__dict__.__setitem__ because module.__setattr__ will not work
|
||||
# The patching ordering is important: v3 -> v2 -> v1
|
||||
_nvmlGetFunctionPointer=patch_function_pointers_when_fail(
|
||||
names=set(nvmlDeviceGetRunningProcesses_v3_v2),
|
||||
callback=patch_process_info_callback,
|
||||
)(
|
||||
patch_function_pointers_when_fail(
|
||||
names=set(nvmlDeviceGetRunningProcesses_v2_v1),
|
||||
callback=patch_process_info_callback,
|
||||
)(
|
||||
_pynvml._nvmlGetFunctionPointer, # pylint: disable=protected-access,no-member
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
with_mapped_function_name() # patch first and only for once
|
||||
patch_process_info()
|
||||
|
||||
__patched_backward_compatibility_layers = True
|
||||
|
||||
|
||||
_pynvml_installation_corrupted: bool = not callable(
|
||||
getattr(_pynvml, '_nvmlGetFunctionPointer', None),
|
||||
)
|
||||
|
||||
# Patch function `nvmlDeviceGet{Compute,Graphics,MPSCompute}RunningProcesses`
|
||||
if not _pynvml_installation_corrupted:
|
||||
__patch_backward_compatibility_layers()
|
||||
del __patch_backward_compatibility_layers
|
||||
# pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
|
||||
class c_nvmlProcessInfo_v1_t(_pynvml._PrintableStructure): # pylint: disable=protected-access
|
||||
_fields_: _ClassVar[list[tuple[str, type]]] = [
|
||||
('pid', _ctypes.c_uint),
|
||||
('usedGpuMemory', _ctypes.c_ulonglong),
|
||||
]
|
||||
_fmt_: _ClassVar[dict[str, str]] = {
|
||||
'usedGpuMemory': '%d B',
|
||||
}
|
||||
|
||||
# pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
|
||||
class c_nvmlProcessInfo_v2_t(_pynvml._PrintableStructure): # pylint: disable=protected-access
|
||||
_fields_: _ClassVar[list[tuple[str, type]]] = [
|
||||
('pid', _ctypes.c_uint),
|
||||
('usedGpuMemory', _ctypes.c_ulonglong),
|
||||
('gpuInstanceId', _ctypes.c_uint),
|
||||
('computeInstanceId', _ctypes.c_uint),
|
||||
]
|
||||
_fmt_: _ClassVar[dict[str, str]] = {
|
||||
'usedGpuMemory': '%d B',
|
||||
}
|
||||
|
||||
_pynvml_memory_v2_available: bool = hasattr(_pynvml, 'nvmlMemory_v2')
|
||||
_pynvml_get_memory_info_v2_available: bool = _pynvml_memory_v2_available
|
||||
_driver_get_memory_info_v2_available: bool | None = (
|
||||
None if not _pynvml_installation_corrupted else False
|
||||
)
|
||||
# pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
|
||||
class c_nvmlProcessInfo_v3_t(_pynvml._PrintableStructure): # pylint: disable=protected-access
|
||||
_fields_: _ClassVar[list[tuple[str, type]]] = [
|
||||
('pid', _ctypes.c_uint),
|
||||
('usedGpuMemory', _ctypes.c_ulonglong),
|
||||
('gpuInstanceId', _ctypes.c_uint),
|
||||
('computeInstanceId', _ctypes.c_uint),
|
||||
('usedGpuCcProtectedMemory', _ctypes.c_ulonglong),
|
||||
]
|
||||
_fmt_: _ClassVar[dict[str, str]] = {
|
||||
'usedGpuMemory': '%d B',
|
||||
'usedGpuCcProtectedMemory': '%d B',
|
||||
}
|
||||
|
||||
__get_running_process_version_suffix = None
|
||||
c_nvmlProcessInfo_t = c_nvmlProcessInfo_v3_t
|
||||
|
||||
# pylint: disable-next=function-redefined,too-many-branches
|
||||
def nvmlDeviceGetMemoryInfo(handle: c_nvmlDevice_t) -> _pynvml.c_nvmlMemory_t:
|
||||
"""Retrieve the amount of used, free, reserved and total memory available on the device, in bytes.
|
||||
def __determine_get_running_process_version_suffix() -> str:
|
||||
global __get_running_process_version_suffix, c_nvmlProcessInfo_t # pylint: disable=global-statement
|
||||
|
||||
Note:
|
||||
- The version 2 API adds additional memory information. The reserved amount is supported on
|
||||
version 2 only.
|
||||
- In MIG mode, if device handle is provided, the API returns aggregate information, only if
|
||||
the caller has appropriate privileges. Per-instance information can be queried by using
|
||||
specific MIG device handles.
|
||||
|
||||
Raises:
|
||||
NVMLError_InvalidArgument:
|
||||
If the library has not been successfully initialized.
|
||||
NVMLError_NoPermission:
|
||||
If the user doesn't have permission to perform this operation.
|
||||
NVMLError_InvalidArgument:
|
||||
If device is invalid or memory is NULL.
|
||||
NVMLError_GpuIsLost:
|
||||
If the target GPU has fallen off the bus or is otherwise inaccessible.
|
||||
NVMLError_Unknown:
|
||||
On any unexpected error.
|
||||
"""
|
||||
global _pynvml_get_memory_info_v2_available, _driver_get_memory_info_v2_available # pylint: disable=global-statement
|
||||
|
||||
_lazy_init()
|
||||
|
||||
if _driver_get_memory_info_v2_available is None:
|
||||
try:
|
||||
if __get_running_process_version_suffix is None:
|
||||
# pylint: disable-next=protected-access,no-member
|
||||
_pynvml._nvmlGetFunctionPointer('nvmlDeviceGetMemoryInfo_v2')
|
||||
except NVMLError_FunctionNotFound:
|
||||
with __lock:
|
||||
_driver_get_memory_info_v2_available = False
|
||||
_pynvml_get_memory_info_v2_available = False
|
||||
else:
|
||||
with __lock:
|
||||
_driver_get_memory_info_v2_available = True
|
||||
|
||||
if _driver_get_memory_info_v2_available:
|
||||
if _pynvml_memory_v2_available:
|
||||
# driver ✔ pynvml ?
|
||||
_nvmlGetFunctionPointer = _pynvml._nvmlGetFunctionPointer
|
||||
__get_running_process_version_suffix = '_v3'
|
||||
try:
|
||||
_nvmlGetFunctionPointer('nvmlDeviceGetConfComputeMemSizeInfo')
|
||||
except NVMLError_FunctionNotFound:
|
||||
c_nvmlProcessInfo_t = c_nvmlProcessInfo_v2_t
|
||||
LOGGER.debug(
|
||||
'NVML get running process version 3 API with v3 type struct is not available '
|
||||
'due to incompatible NVIDIA driver. Fallback to use get running process '
|
||||
'version 3 API with v2 type struct.',
|
||||
)
|
||||
try:
|
||||
# pylint: disable-next=unexpected-keyword-arg,no-member
|
||||
retval = _pynvml.nvmlDeviceGetMemoryInfo(handle, version=_pynvml.nvmlMemory_v2)
|
||||
except TypeError as ex:
|
||||
if 'unexpected keyword argument' in str(ex).lower():
|
||||
# driver ✔ pynvml ✘
|
||||
with __lock:
|
||||
_pynvml_get_memory_info_v2_available = False
|
||||
_nvmlGetFunctionPointer('nvmlDeviceGetComputeRunningProcesses_v3')
|
||||
except NVMLError_FunctionNotFound:
|
||||
__get_running_process_version_suffix = '_v2'
|
||||
LOGGER.debug(
|
||||
'NVML get running process version 3 API with v2 type struct is not '
|
||||
'available due to incompatible NVIDIA driver. Fallback to use get running '
|
||||
'process version 2 API with v2 type struct.',
|
||||
)
|
||||
try:
|
||||
_nvmlGetFunctionPointer('nvmlDeviceGetComputeRunningProcesses_v2')
|
||||
except NVMLError_FunctionNotFound:
|
||||
c_nvmlProcessInfo_t = c_nvmlProcessInfo_v1_t
|
||||
__get_running_process_version_suffix = ''
|
||||
LOGGER.debug(
|
||||
'NVML memory info version 2 is not available '
|
||||
'due to incompatible `nvidia-ml-py` package.',
|
||||
'NVML get running process version 2 API with v2 type struct is not '
|
||||
'available due to incompatible NVIDIA driver. Fallback to use get '
|
||||
'running process version 1 API with v1 type struct.',
|
||||
)
|
||||
else:
|
||||
# driver ✔ pynvml ? user ✘
|
||||
with __lock:
|
||||
_driver_get_memory_info_v2_available = (
|
||||
None # unset the flag for user exceptions
|
||||
)
|
||||
raise
|
||||
except (NVMLError_FunctionNotFound, NVMLError_Unknown):
|
||||
# driver ✔ pynvml ✘
|
||||
with __lock:
|
||||
_pynvml_get_memory_info_v2_available = False
|
||||
LOGGER.debug(
|
||||
'NVML memory info version 2 is not available '
|
||||
'due to incompatible NVIDIA driver.',
|
||||
)
|
||||
LOGGER.debug(
|
||||
'NVML get running process version 2 API with v2 type struct is '
|
||||
'available.',
|
||||
)
|
||||
else:
|
||||
# driver ✔ pynvml ✔
|
||||
LOGGER.debug('NVML memory info version 2 is available.')
|
||||
return retval
|
||||
LOGGER.debug(
|
||||
'NVML get running process version 3 API with v2 type struct is available.',
|
||||
)
|
||||
else:
|
||||
# driver ✔ pynvml ✘
|
||||
LOGGER.debug(
|
||||
'NVML constant `nvmlMemory_v2` not found in package `nvidia-ml-py`, but '
|
||||
'your NVIDIA driver does support the NVML memory info version 2 APIs. NVML '
|
||||
'memory info version 2 is not available due to the legacy dependencies. '
|
||||
'Please consider upgrading your `nvidia-ml-py` package by running '
|
||||
'`pip3 install --upgrade nvitop nvidia-ml-py`.',
|
||||
'NVML get running process version 3 API with v3 type struct is available.',
|
||||
)
|
||||
elif _pynvml_memory_v2_available:
|
||||
# driver ✘ pynvml ?
|
||||
LOGGER.debug(
|
||||
'NVML memory info version 2 is not available due to incompatible NVIDIA driver.',
|
||||
)
|
||||
|
||||
return __get_running_process_version_suffix
|
||||
|
||||
def __nvml_device_get_running_processes(
|
||||
func: str,
|
||||
handle: c_nvmlDevice_t,
|
||||
) -> list[c_nvmlProcessInfo_t]:
|
||||
"""Helper function for :func:`nvmlDeviceGet{Compute,Graphics,MPSCompute}RunningProcesses`.
|
||||
|
||||
Modified from function :func:`pynvml.nvmlDeviceGetComputeRunningProcesses` in package
|
||||
`nvidia-ml-py <https://pypi.org/project/nvidia-ml-py>`_.
|
||||
"""
|
||||
version_suffix = __determine_get_running_process_version_suffix()
|
||||
|
||||
# First call to get the size
|
||||
c_count = _ctypes.c_uint(0)
|
||||
# pylint: disable-next=protected-access
|
||||
fn = _pynvml._nvmlGetFunctionPointer(f'{func}{version_suffix}')
|
||||
ret = fn(handle, _ctypes.byref(c_count), None)
|
||||
|
||||
if ret == NVML_SUCCESS:
|
||||
# Special case, no running processes
|
||||
return []
|
||||
if ret == NVML_ERROR_INSUFFICIENT_SIZE:
|
||||
# Typical case
|
||||
# Oversize the array in case more processes are created
|
||||
c_count.value = c_count.value * 2 + 5
|
||||
process_array = c_nvmlProcessInfo_t * c_count.value # type: ignore[operator]
|
||||
c_processes = process_array() # type: ignore[operator]
|
||||
|
||||
# Make the call again
|
||||
ret = fn(handle, _ctypes.byref(c_count), c_processes)
|
||||
_pynvml._nvmlCheckReturn(ret) # pylint: disable=protected-access
|
||||
|
||||
processes = []
|
||||
for i in range(c_count.value):
|
||||
# Use an alternative struct for this object
|
||||
obj = _pynvml.nvmlStructToFriendlyObject(c_processes[i])
|
||||
if obj.usedGpuMemory == ULONGLONG_MAX:
|
||||
# Special case for WDDM on Windows, see comment above
|
||||
obj.usedGpuMemory = None
|
||||
if getattr(obj, 'usedGpuCcProtectedMemory', None) == ULONGLONG_MAX:
|
||||
obj.usedGpuCcProtectedMemory = None
|
||||
processes.append(obj)
|
||||
|
||||
return processes
|
||||
|
||||
# Error case
|
||||
raise NVMLError(ret)
|
||||
|
||||
def nvmlDeviceGetComputeRunningProcesses( # pylint: disable=function-redefined
|
||||
handle: c_nvmlDevice_t,
|
||||
) -> list[c_nvmlProcessInfo_t]:
|
||||
"""Get information about processes with a compute context on a device.
|
||||
|
||||
Note:
|
||||
- In MIG mode, if device handle is provided, the API returns aggregate information, only
|
||||
if the caller has appropriate privileges. Per-instance information can be queried by
|
||||
using specific MIG device handles.
|
||||
|
||||
Raises:
|
||||
NVMLError_InvalidArgument:
|
||||
If the library has not been successfully initialized.
|
||||
NVMLError_Uninitialized:
|
||||
If NVML was not first initialized with :func:`nvmlInit`.
|
||||
NVMLError_NoPermission:
|
||||
If the user doesn't have permission to perform this operation.
|
||||
NVMLError_InvalidArgument:
|
||||
If device is invalid.
|
||||
NVMLError_GpuIsLost:
|
||||
If the target GPU has fallen off the bus or is otherwise inaccessible.
|
||||
NVMLError_Unknown:
|
||||
On any unexpected error.
|
||||
"""
|
||||
return __nvml_device_get_running_processes(
|
||||
'nvmlDeviceGetComputeRunningProcesses',
|
||||
handle,
|
||||
)
|
||||
|
||||
def nvmlDeviceGetGraphicsRunningProcesses( # pylint: disable=function-redefined
|
||||
handle: c_nvmlDevice_t,
|
||||
) -> list[c_nvmlProcessInfo_t]:
|
||||
"""Get information about processes with a graphics context on a device.
|
||||
|
||||
Note:
|
||||
- In MIG mode, if device handle is provided, the API returns aggregate information, only
|
||||
if the caller has appropriate privileges. Per-instance information can be queried by
|
||||
using specific MIG device handles.
|
||||
|
||||
Raises:
|
||||
NVMLError_InvalidArgument:
|
||||
If the library has not been successfully initialized.
|
||||
NVMLError_Uninitialized:
|
||||
If NVML was not first initialized with :func:`nvmlInit`.
|
||||
NVMLError_NoPermission:
|
||||
If the user doesn't have permission to perform this operation.
|
||||
NVMLError_InvalidArgument:
|
||||
If device is invalid.
|
||||
NVMLError_GpuIsLost:
|
||||
If the target GPU has fallen off the bus or is otherwise inaccessible.
|
||||
NVMLError_Unknown:
|
||||
On any unexpected error.
|
||||
"""
|
||||
return __nvml_device_get_running_processes(
|
||||
'nvmlDeviceGetGraphicsRunningProcesses',
|
||||
handle,
|
||||
)
|
||||
|
||||
def nvmlDeviceGetMPSComputeRunningProcesses( # pylint: disable=function-redefined
|
||||
handle: c_nvmlDevice_t,
|
||||
) -> list[c_nvmlProcessInfo_t]:
|
||||
"""Get information about processes with a MPS compute context on a device.
|
||||
|
||||
Note:
|
||||
- In MIG mode, if device handle is provided, the API returns aggregate information, only
|
||||
if the caller has appropriate privileges. Per-instance information can be queried by
|
||||
using specific MIG device handles.
|
||||
|
||||
Raises:
|
||||
NVMLError_InvalidArgument:
|
||||
If the library has not been successfully initialized.
|
||||
NVMLError_Uninitialized:
|
||||
If NVML was not first initialized with :func:`nvmlInit`.
|
||||
NVMLError_NoPermission:
|
||||
If the user doesn't have permission to perform this operation.
|
||||
NVMLError_InvalidArgument:
|
||||
If device is invalid.
|
||||
NVMLError_GpuIsLost:
|
||||
If the target GPU has fallen off the bus or is otherwise inaccessible.
|
||||
NVMLError_Unknown:
|
||||
On any unexpected error.
|
||||
"""
|
||||
return __nvml_device_get_running_processes(
|
||||
'nvmlDeviceGetMPSComputeRunningProcesses',
|
||||
handle,
|
||||
)
|
||||
|
||||
else:
|
||||
LOGGER.warning(
|
||||
'Your installed package `nvidia-ml-py` is corrupted. '
|
||||
'Skip patch functions `nvmlDeviceGet{Compute,Graphics,MPSCompute}RunningProcesses`. '
|
||||
'You may get incorrect or incomplete results. Please consider reinstall package '
|
||||
'`nvidia-ml-py` via `pip3 install --force-reinstall nvidia-ml-py nvitop`.',
|
||||
)
|
||||
|
||||
# Patch function `nvmlDeviceGetMemoryInfo`
|
||||
if not _pynvml_installation_corrupted:
|
||||
# pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
|
||||
class c_nvmlMemory_v1_t(_pynvml._PrintableStructure): # pylint: disable=protected-access
|
||||
_fields_: _ClassVar[list[tuple[str, type]]] = [
|
||||
('total', _pynvml.c_ulonglong),
|
||||
('free', _pynvml.c_ulonglong),
|
||||
('used', _pynvml.c_ulonglong),
|
||||
]
|
||||
_fmt_: _ClassVar[dict[str, str]] = {'<default>': '%d B'}
|
||||
|
||||
# pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
|
||||
class c_nvmlMemory_v2_t(_pynvml._PrintableStructure): # pylint: disable=protected-access
|
||||
_fields_: _ClassVar[list[tuple[str, type]]] = [
|
||||
('version', _pynvml.c_uint),
|
||||
('total', _pynvml.c_ulonglong),
|
||||
('reserved', _pynvml.c_ulonglong),
|
||||
('free', _pynvml.c_ulonglong),
|
||||
('used', _pynvml.c_ulonglong),
|
||||
]
|
||||
_fmt_: _ClassVar[dict[str, str]] = {'<default>': '%d B'}
|
||||
|
||||
nvmlMemory_v2 = getattr(_pynvml, 'nvmlMemory_v2', _ctypes.sizeof(c_nvmlMemory_v2_t) | 2 << 24)
|
||||
__get_memory_info_version_suffix = None
|
||||
c_nvmlMemory_t = c_nvmlMemory_v2_t
|
||||
|
||||
def __determine_get_memory_info_version_suffix() -> str:
|
||||
global __get_memory_info_version_suffix, c_nvmlMemory_t # pylint: disable=global-statement
|
||||
|
||||
if __get_memory_info_version_suffix is None:
|
||||
# pylint: disable-next=protected-access,no-member
|
||||
_nvmlGetFunctionPointer = _pynvml._nvmlGetFunctionPointer
|
||||
__get_memory_info_version_suffix = '_v2'
|
||||
try:
|
||||
_nvmlGetFunctionPointer('nvmlDeviceGetMemoryInfo_v2')
|
||||
except NVMLError_FunctionNotFound:
|
||||
c_nvmlMemory_t = c_nvmlMemory_v1_t
|
||||
__get_memory_info_version_suffix = ''
|
||||
LOGGER.debug(
|
||||
'NVML get memory info version 2 API is not available due to incompatible '
|
||||
'NVIDIA driver. Fallback to use NVML get memory info version 1 API.',
|
||||
)
|
||||
else:
|
||||
LOGGER.debug('NVML get memory info version 2 is available.')
|
||||
|
||||
return __get_memory_info_version_suffix
|
||||
|
||||
def nvmlDeviceGetMemoryInfo( # pylint: disable=function-redefined
|
||||
handle: c_nvmlDevice_t,
|
||||
) -> c_nvmlMemory_t:
|
||||
"""Retrieve the amount of used, free, reserved and total memory available on the device, in bytes.
|
||||
|
||||
Note:
|
||||
- The version 2 API adds additional memory information. The reserved amount is supported
|
||||
on version 2 only.
|
||||
- In MIG mode, if device handle is provided, the API returns aggregate information, only
|
||||
if the caller has appropriate privileges. Per-instance information can be queried by
|
||||
using specific MIG device handles.
|
||||
|
||||
Raises:
|
||||
NVMLError_InvalidArgument:
|
||||
If the library has not been successfully initialized.
|
||||
NVMLError_Uninitialized:
|
||||
If NVML was not first initialized with :func:`nvmlInit`.
|
||||
NVMLError_NoPermission:
|
||||
If the user doesn't have permission to perform this operation.
|
||||
NVMLError_InvalidArgument:
|
||||
If device is invalid.
|
||||
NVMLError_GpuIsLost:
|
||||
If the target GPU has fallen off the bus or is otherwise inaccessible.
|
||||
NVMLError_Unknown:
|
||||
On any unexpected error.
|
||||
"""
|
||||
version_suffix = __determine_get_memory_info_version_suffix()
|
||||
if version_suffix == '_v2':
|
||||
c_memory = c_nvmlMemory_v2_t()
|
||||
c_memory.version = nvmlMemory_v2 # pylint: disable=attribute-defined-outside-init
|
||||
# pylint: disable-next=protected-access
|
||||
fn = _pynvml._nvmlGetFunctionPointer('nvmlDeviceGetMemoryInfo_v2')
|
||||
elif version_suffix in {'_v1', ''}:
|
||||
c_memory = c_nvmlMemory_v1_t()
|
||||
# pylint: disable-next=protected-access
|
||||
fn = _pynvml._nvmlGetFunctionPointer('nvmlDeviceGetMemoryInfo')
|
||||
else:
|
||||
# driver ✘ pynvml ✘
|
||||
LOGGER.debug(
|
||||
'NVML constant `nvmlMemory_v2` not found in package `nvidia-ml-py`, and '
|
||||
'your NVIDIA driver does not support the NVML memory info version 2 APIs. '
|
||||
'NVML memory info version 2 is not available.',
|
||||
raise ValueError(
|
||||
f'Unknown version suffix {version_suffix!r} for '
|
||||
'function `nvmlDeviceGetMemoryInfo`.',
|
||||
)
|
||||
ret = fn(handle, _ctypes.byref(c_memory))
|
||||
_pynvml._nvmlCheckReturn(ret) # pylint: disable=protected-access
|
||||
return c_memory
|
||||
|
||||
elif _pynvml_get_memory_info_v2_available:
|
||||
# pylint: disable-next=unexpected-keyword-arg
|
||||
return _pynvml.nvmlDeviceGetMemoryInfo(handle, version=_pynvml.nvmlMemory_v2)
|
||||
|
||||
return _pynvml.nvmlDeviceGetMemoryInfo(handle)
|
||||
else:
|
||||
LOGGER.warning(
|
||||
'Your installed package `nvidia-ml-py` is corrupted. '
|
||||
'Skip patch functions `nvmlDeviceGetMemoryInfo`. '
|
||||
'You may get incorrect or incomplete results. Please consider reinstall package '
|
||||
'`nvidia-ml-py` via `pip3 install --force-reinstall nvidia-ml-py nvitop`.',
|
||||
)
|
||||
|
||||
|
||||
# Add support for lookup fallback and context manager ##############################################
|
||||
|
|
|
|||
|
|
@ -33,6 +33,7 @@ from weakref import WeakValueDictionary
|
|||
from nvitop.api import host, libnvml
|
||||
from nvitop.api.utils import (
|
||||
NA,
|
||||
UINT_MAX,
|
||||
NaType,
|
||||
Snapshot,
|
||||
bytes2human,
|
||||
|
|
@ -517,9 +518,9 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
self.type = type
|
||||
|
||||
if gpu_instance_id is not None and compute_instance_id is not None:
|
||||
self._gpu_instance_id = gpu_instance_id if gpu_instance_id != 0xFFFFFFFF else NA
|
||||
self._gpu_instance_id = gpu_instance_id if gpu_instance_id != UINT_MAX else NA
|
||||
self._compute_instance_id = (
|
||||
compute_instance_id if compute_instance_id != 0xFFFFFFFF else NA
|
||||
compute_instance_id if compute_instance_id != UINT_MAX else NA
|
||||
)
|
||||
elif device.is_mig_device():
|
||||
self._gpu_instance_id = device.gpu_instance_id()
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@
|
|||
|
||||
from __future__ import annotations
|
||||
|
||||
import ctypes
|
||||
import datetime
|
||||
import functools
|
||||
import math
|
||||
|
|
@ -38,6 +39,8 @@ __all__ = [
|
|||
'NaType',
|
||||
'NotApplicable',
|
||||
'NotApplicableType',
|
||||
'UINT_MAX',
|
||||
'ULONGLONG_MAX',
|
||||
'KiB',
|
||||
'MiB',
|
||||
'GiB',
|
||||
|
|
@ -479,6 +482,11 @@ NA.__doc__ = """The singleton instance of :class:`NaType`. The actual value is :
|
|||
|
||||
NotApplicable = NA
|
||||
|
||||
UINT_MAX: int = ctypes.c_uint(-1).value # 0xFFFFFFFF
|
||||
"""The maximum value of :class:`ctypes.c_uint`."""
|
||||
ULONGLONG_MAX: int = ctypes.c_ulonglong(-1).value # 0XFFFFFFFFFFFFFFFF
|
||||
"""The maximum value of :class:`ctypes.c_ulonglong`."""
|
||||
|
||||
KiB: int = 1 << 10
|
||||
"""Kibibyte (1024)"""
|
||||
|
||||
|
|
|
|||
|
|
@ -320,7 +320,7 @@ def main() -> int:
|
|||
if len(invalid_indices) > 1:
|
||||
messages.append(f'ERROR: Invalid device indices: {sorted(invalid_indices)}.')
|
||||
elif len(invalid_indices) == 1:
|
||||
messages.append(f'ERROR: Invalid device index: {list(invalid_indices)[0]}.')
|
||||
messages.append(f'ERROR: Invalid device index: {next(iter(invalid_indices))}.')
|
||||
elif args.only_visible:
|
||||
indices = {
|
||||
index if isinstance(index, int) else index[0]
|
||||
|
|
@ -436,19 +436,6 @@ def main() -> int:
|
|||
)
|
||||
messages.append(message)
|
||||
|
||||
# pylint: disable-next=protected-access
|
||||
if libnvml._driver_get_memory_info_v2_available and not libnvml._pynvml_memory_v2_available:
|
||||
message = '\n'.join(
|
||||
(
|
||||
'WARNING: The `nvidia-ml-py` package does not support the NVML memory info version 2 APIs, which would',
|
||||
'get inaccurate results. Please upgrade it via:',
|
||||
'',
|
||||
' pip3 install --upgrade nvitop nvidia-ml-py',
|
||||
'',
|
||||
),
|
||||
)
|
||||
messages.append(message)
|
||||
|
||||
if len(messages) > 0:
|
||||
for message in messages:
|
||||
if message.startswith('ERROR:'):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue