refactor(core/libnvml): rename nvml to libnvml

Signed-off-by: Xuehai Pan <XuehaiPan@pku.edu.cn>
This commit is contained in:
Xuehai Pan 2022-07-16 17:07:16 +08:00 committed by Xuehai Pan
parent 6bf151081a
commit 1287159fb3
8 changed files with 128 additions and 126 deletions

View file

@ -10,7 +10,7 @@ from typing import Dict, List, Tuple, Union
from tensorflow.python.keras.callbacks import Callback # pylint: disable=import-error,no-name-in-module
from nvitop.core import nvml
from nvitop.core import libnvml
from nvitop.callbacks.utils import get_devices_by_logical_ids, get_gpu_stats
@ -89,8 +89,8 @@ class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
super().__init__()
try:
nvml.nvmlInit()
except nvml.NVMLError as ex:
libnvml.nvmlInit()
except libnvml.NVMLError as ex:
raise ValueError(
'Cannot use the GpuStatsLogger callback because the NVIDIA driver is not installed.'
) from ex
@ -106,7 +106,7 @@ class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
try:
self._devices = get_devices_by_logical_ids(gpu_ids, unique=True)
except (nvml.NVMLError, RuntimeError) as ex:
except (libnvml.NVMLError, RuntimeError) as ex:
raise ValueError(
'Cannot use GpuStatsLogger callback because devices unavailable. '
'Received: `gpus={}`'.format(gpu_ids)

View file

@ -11,7 +11,7 @@ from pytorch_lightning.callbacks import Callback #
from pytorch_lightning.utilities import rank_zero_only # pylint: disable=import-error
from pytorch_lightning.utilities.exceptions import MisconfigurationException # pylint: disable=import-error
from nvitop.core import nvml
from nvitop.core import libnvml
from nvitop.callbacks.utils import get_devices_by_logical_ids, get_gpu_stats
@ -77,8 +77,8 @@ class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
super().__init__()
try:
nvml.nvmlInit()
except nvml.NVMLError as ex:
libnvml.nvmlInit()
except libnvml.NVMLError as ex:
raise MisconfigurationException(
'Cannot use GpuStatsLogger callback because NVIDIA driver is not installed.'
) from ex
@ -103,7 +103,7 @@ class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
device_ids = trainer.data_parallel_device_ids
try:
self._devices = get_devices_by_logical_ids(device_ids, unique=True)
except (nvml.NVMLError, RuntimeError) as ex:
except (libnvml.NVMLError, RuntimeError) as ex:
raise ValueError(
'Cannot use GpuStatsLogger callback because devices unavailable. '
'Received: `gpus={}`'.format(device_ids)

View file

@ -8,7 +8,7 @@ import curses
import os
import sys
from nvitop.core import nvml, HostProcess, boolify
from nvitop.core import libnvml, HostProcess, boolify
from nvitop.gui import Top, Device, libcurses, setlocale_utf8, colored, set_color, USERNAME
from nvitop.version import __version__
@ -140,9 +140,9 @@ def main(): # pylint: disable=too-many-branches,too-many-statements,too-many-lo
try:
device_count = Device.count()
except nvml.NVMLError_LibraryNotFound: # pylint: disable=no-member
except libnvml.NVMLError_LibraryNotFound: # pylint: disable=no-member
return 1
except nvml.NVMLError as e: # pylint: disable=invalid-name
except libnvml.NVMLError as e: # pylint: disable=invalid-name
print('{} {}'.format(colored('NVML ERROR:', color='red', attrs=('bold',)), e), file=sys.stderr)
return 1
@ -207,14 +207,14 @@ def main(): # pylint: disable=too-many-branches,too-many-statements,too-many-lo
top.print()
top.destroy()
if len(nvml.UNKNOWN_FUNCTIONS) > 0:
if len(libnvml.UNKNOWN_FUNCTIONS) > 0:
unknown_function_messages = [
'ERROR: Some FunctionNotFound errors occurred while calling:'
if len(nvml.UNKNOWN_FUNCTIONS) > 1
if len(libnvml.UNKNOWN_FUNCTIONS) > 1
else 'ERROR: A FunctionNotFound error occurred while calling:'
]
unknown_function_messages.extend(' nvmlQuery({.__name__!r}, *args, **kwargs)'.format(func)
for func, _ in nvml.UNKNOWN_FUNCTIONS.values())
for func, _ in libnvml.UNKNOWN_FUNCTIONS.values())
unknown_function_messages.append('\n'.join((
'Please verify whether the `{0}` package is compatible with your NVIDIA driver version.',
'You can check the release history of `{0}` and install the compatible version manually.',

View file

@ -4,7 +4,7 @@
"""The core APIs of nvitop."""
from nvitop.core import host, utils
from nvitop.core.libnvml import nvml, nvmlCheckReturn, NVMLError
from nvitop.core.libnvml import libnvml, nvmlCheckReturn, NVMLError
from nvitop.core.device import Device, PhysicalDevice, MigDevice, CudaDevice, CudaMigDevice
from nvitop.core.process import HostProcess, GpuProcess, command_join
from nvitop.core.collector import take_snapshots, ResourceMetricCollector
@ -12,7 +12,7 @@ from nvitop.core.utils import *
__all__ = ['take_snapshots', 'ResourceMetricCollector',
'nvml', 'nvmlCheckReturn', 'NVMLError',
'libnvml', 'nvmlCheckReturn', 'NVMLError',
'Device', 'PhysicalDevice', 'MigDevice', 'CudaDevice', 'CudaMigDevice',
'host', 'HostProcess', 'GpuProcess', 'command_join']
__all__.extend(utils.__all__)

View file

@ -97,7 +97,7 @@ from typing import List, Tuple, Dict, Iterable, NamedTuple, Callable, Union, Opt
from cachetools.func import ttl_cache
from nvitop.core.libnvml import nvml
from nvitop.core.libnvml import libnvml
from nvitop.core.process import GpuProcess
from nvitop.core.utils import (NA, NaType, Snapshot, bytes2human,
boolify, memoize_when_activated)
@ -132,7 +132,7 @@ def _does_any_device_support_mig_mode() -> bool:
global _ANY_DEVICE_SUPPORTS_MIG_MODE # pylint: disable=global-statement
if _ANY_DEVICE_SUPPORTS_MIG_MODE is None:
_ANY_DEVICE_SUPPORTS_MIG_MODE = any(nvml.nvmlCheckReturn(device.mig_mode())
_ANY_DEVICE_SUPPORTS_MIG_MODE = any(libnvml.nvmlCheckReturn(device.mig_mode())
for device in PhysicalDevice.all())
return _ANY_DEVICE_SUPPORTS_MIG_MODE
@ -228,7 +228,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=0 --format=csv,noheader,nounits --query-gpu=driver_version
"""
return nvml.nvmlQuery('nvmlSystemGetDriverVersion')
return libnvml.nvmlQuery('nvmlSystemGetDriverVersion')
@staticmethod
def cuda_version() -> Union[str, NaType]:
@ -236,8 +236,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
This can be different from the version of the CUDA runtime.
"""
cuda_version = nvml.nvmlQuery('nvmlSystemGetCudaDriverVersion')
if nvml.nvmlCheckReturn(cuda_version, int):
cuda_version = libnvml.nvmlQuery('nvmlSystemGetCudaDriverVersion')
if libnvml.nvmlCheckReturn(cuda_version, int):
major = cuda_version // 1000
minor = (cuda_version % 1000) // 10
revision = cuda_version % 10
@ -257,7 +257,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=0 --format=csv,noheader,nounits --query-gpu=count
"""
return nvml.nvmlQuery('nvmlDeviceGetCount', default=0)
return libnvml.nvmlQuery('nvmlDeviceGetCount', default=0)
@classmethod
def all(cls) -> List['PhysicalDevice']:
@ -406,7 +406,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
if index_or_uuid.isdigit():
index_or_uuid = int(index_or_uuid)
elif Device.UUID_PATTERN.match(index_or_uuid) is None:
raise nvml.NVMLError_NotFound() # pylint: disable=no-member
raise libnvml.NVMLError_NotFound # pylint: disable=no-member
if use_integer_identifiers is None:
use_integer_identifiers = isinstance(index_or_uuid, int)
@ -426,7 +426,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
try:
device = from_index_or_uuid(identifier)
except (ValueError, nvml.NVMLError):
except (ValueError, libnvml.NVMLError):
break
devices.append(device)
@ -510,9 +510,9 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
"""Initializes the instance created by ``__new__()``.
Raises:
nvml.NVMLError_NotFound:
libnvml.NVMLError_NotFound:
If the device is not found for the given NVML identifier.
nvml.NVMLError_InvalidArgument:
libnvml.NVMLError_InvalidArgument:
If the device index is out of range.
"""
@ -533,20 +533,20 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
if index is not None:
self._nvml_index = index
try:
self._handle = nvml.nvmlQuery('nvmlDeviceGetHandleByIndex', index, ignore_errors=False)
except nvml.NVMLError_GpuIsLost: # pylint: disable=no-member
self._handle = libnvml.nvmlQuery('nvmlDeviceGetHandleByIndex', index, ignore_errors=False)
except libnvml.NVMLError_GpuIsLost: # pylint: disable=no-member
self._handle = None
else:
try:
if uuid is not None:
self._handle = nvml.nvmlQuery('nvmlDeviceGetHandleByUUID', uuid, ignore_errors=False)
self._handle = libnvml.nvmlQuery('nvmlDeviceGetHandleByUUID', uuid, ignore_errors=False)
else:
self._handle = nvml.nvmlQuery('nvmlDeviceGetHandleByPciBusId', bus_id, ignore_errors=False)
except nvml.NVMLError_GpuIsLost: # pylint: disable=no-member
self._handle = libnvml.nvmlQuery('nvmlDeviceGetHandleByPciBusId', bus_id, ignore_errors=False)
except libnvml.NVMLError_GpuIsLost: # pylint: disable=no-member
self._handle = None
self._nvml_index = NA
else:
self._nvml_index = nvml.nvmlQuery('nvmlDeviceGetIndex', self._handle)
self._nvml_index = libnvml.nvmlQuery('nvmlDeviceGetIndex', self._handle)
self._max_clock_infos = ClockInfos(graphics=NA, sm=NA, memory=NA, video=NA)
self._timestamp = 0
@ -608,7 +608,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
if self._handle is None:
return lambda: NA
match = nvml.VERSIONED_PATTERN.match(name)
match = libnvml.VERSIONED_PATTERN.match(name)
if match is not None:
name = match.group('name')
suffix = match.group('suffix')
@ -617,16 +617,16 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
try:
pascal_case = name.title().replace('_', '')
func = getattr(nvml, 'nvmlDeviceGet' + pascal_case + suffix)
func = getattr(libnvml, 'nvmlDeviceGet' + pascal_case + suffix)
except AttributeError:
pascal_case = ''.join(part[:1].upper() + part[1:] for part in filter(None, name.split('_')))
func = getattr(nvml, 'nvmlDeviceGet' + pascal_case + suffix)
func = getattr(libnvml, 'nvmlDeviceGet' + pascal_case + suffix)
@ttl_cache(ttl=1.0)
def attribute(*args, **kwargs):
try:
return nvml.nvmlQuery(func, self._handle, *args, **kwargs, ignore_errors=False)
except nvml.NVMLError_NotSupported: # pylint: disable=no-member
return libnvml.nvmlQuery(func, self._handle, *args, **kwargs, ignore_errors=False)
except libnvml.NVMLError_NotSupported: # pylint: disable=no-member
return NA
attribute.__name__ = name
@ -669,7 +669,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
return self._nvml_index # will be overridden in MigDevice
@property
def handle(self) -> nvml.c_nvmlDevice_t:
def handle(self) -> libnvml.c_nvmlDevice_t:
"""The NVML device handle."""
return self._handle
@ -712,7 +712,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
"""
if self._name is NA:
self._name = nvml.nvmlQuery('nvmlDeviceGetName', self.handle)
self._name = libnvml.nvmlQuery('nvmlDeviceGetName', self.handle)
return self._name
def uuid(self) -> Union[str, NaType]:
@ -730,7 +730,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
"""
if self._uuid is NA:
self._uuid = nvml.nvmlQuery('nvmlDeviceGetUUID', self.handle)
self._uuid = libnvml.nvmlQuery('nvmlDeviceGetUUID', self.handle)
return self._uuid
def bus_id(self) -> Union[str, NaType]:
@ -747,7 +747,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
"""
if self._bus_id is NA:
self._bus_id = nvml.nvmlQuery(lambda handle: nvml.nvmlDeviceGetPciInfo(handle).busId, self.handle)
self._bus_id = libnvml.nvmlQuery(lambda handle: libnvml.nvmlDeviceGetPciInfo(handle).busId, self.handle)
return self._bus_id
def serial(self) -> Union[str, NaType]:
@ -764,7 +764,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=serial
"""
return nvml.nvmlQuery('nvmlDeviceGetSerial', self.handle)
return libnvml.nvmlQuery('nvmlDeviceGetSerial', self.handle)
@memoize_when_activated
@ttl_cache(ttl=1.0)
@ -775,8 +775,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
A named tuple with memory information, the item could be :const:`nvitop.NA` when not applicable.
"""
memory_info = nvml.nvmlQuery('nvmlDeviceGetMemoryInfo', self.handle)
if nvml.nvmlCheckReturn(memory_info):
memory_info = libnvml.nvmlQuery('nvmlDeviceGetMemoryInfo', self.handle)
if libnvml.nvmlCheckReturn(memory_info):
return MemoryInfo(total=memory_info.total, free=memory_info.free, used=memory_info.used)
return MemoryInfo(total=NA, free=NA, used=NA)
@ -864,7 +864,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
"""
memory_info = self.memory_info()
if nvml.nvmlCheckReturn(memory_info.used, int) and nvml.nvmlCheckReturn(memory_info.total, int):
if libnvml.nvmlCheckReturn(memory_info.used, int) and libnvml.nvmlCheckReturn(memory_info.total, int):
return round(100.0 * memory_info.used / memory_info.total, 1)
return NA
@ -886,8 +886,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
A named tuple with BAR1 memory information, the item could be :const:`nvitop.NA` when not applicable.
""" # pylint: disable=line-too-long
memory_info = nvml.nvmlQuery('nvmlDeviceGetBAR1MemoryInfo', self.handle)
if nvml.nvmlCheckReturn(memory_info):
memory_info = libnvml.nvmlQuery('nvmlDeviceGetBAR1MemoryInfo', self.handle)
if libnvml.nvmlCheckReturn(memory_info):
return MemoryInfo(total=memory_info.bar1Total, free=memory_info.bar1Free, used=memory_info.bar1Used)
return MemoryInfo(total=NA, free=NA, used=NA)
@ -953,7 +953,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
""" # pylint: disable=line-too-long
memory_info = self.bar1_memory_info()
if nvml.nvmlCheckReturn(memory_info.used, int) and nvml.nvmlCheckReturn(memory_info.total, int):
if libnvml.nvmlCheckReturn(memory_info.used, int) and libnvml.nvmlCheckReturn(memory_info.total, int):
return round(100.0 * memory_info.used / memory_info.total, 1)
return NA
@ -977,16 +977,16 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
gpu, memory, encoder, decoder = NA, NA, NA, NA
utilization_rates = nvml.nvmlQuery('nvmlDeviceGetUtilizationRates', self.handle)
if nvml.nvmlCheckReturn(utilization_rates):
utilization_rates = libnvml.nvmlQuery('nvmlDeviceGetUtilizationRates', self.handle)
if libnvml.nvmlCheckReturn(utilization_rates):
gpu, memory = utilization_rates.gpu, utilization_rates.memory
encoder_utilization = nvml.nvmlQuery('nvmlDeviceGetEncoderUtilization', self.handle)
if nvml.nvmlCheckReturn(encoder_utilization, list) and len(encoder_utilization) > 0:
encoder_utilization = libnvml.nvmlQuery('nvmlDeviceGetEncoderUtilization', self.handle)
if libnvml.nvmlCheckReturn(encoder_utilization, list) and len(encoder_utilization) > 0:
encoder = encoder_utilization[0]
decoder_utilization = nvml.nvmlQuery('nvmlDeviceGetDecoderUtilization', self.handle)
if nvml.nvmlCheckReturn(decoder_utilization, list) and len(decoder_utilization) > 0:
decoder_utilization = libnvml.nvmlQuery('nvmlDeviceGetDecoderUtilization', self.handle)
if libnvml.nvmlCheckReturn(decoder_utilization, list) and len(decoder_utilization) > 0:
decoder = decoder_utilization[0]
return UtilizationRates(gpu=gpu, memory=memory, encoder=encoder, decoder=decoder)
@ -1053,10 +1053,10 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
""" # pylint: disable=line-too-long
return ClockInfos(
graphics=nvml.nvmlQuery('nvmlDeviceGetClockInfo', self.handle, nvml.NVML_CLOCK_GRAPHICS),
sm=nvml.nvmlQuery('nvmlDeviceGetClockInfo', self.handle, nvml.NVML_CLOCK_SM),
memory=nvml.nvmlQuery('nvmlDeviceGetClockInfo', self.handle, nvml.NVML_CLOCK_MEM),
video=nvml.nvmlQuery('nvmlDeviceGetClockInfo', self.handle, nvml.NVML_CLOCK_VIDEO)
graphics=libnvml.nvmlQuery('nvmlDeviceGetClockInfo', self.handle, libnvml.NVML_CLOCK_GRAPHICS),
sm=libnvml.nvmlQuery('nvmlDeviceGetClockInfo', self.handle, libnvml.NVML_CLOCK_SM),
memory=libnvml.nvmlQuery('nvmlDeviceGetClockInfo', self.handle, libnvml.NVML_CLOCK_MEM),
video=libnvml.nvmlQuery('nvmlDeviceGetClockInfo', self.handle, libnvml.NVML_CLOCK_VIDEO)
)
clocks = clock_infos
@ -1073,8 +1073,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
clock_infos = self._max_clock_infos._asdict()
for name, clock in clock_infos.items():
if clock is NA:
clock_type = getattr(nvml, 'NVML_CLOCK_{}'.format(name.replace('memory', 'mem').upper()))
clock = nvml.nvmlQuery('nvmlDeviceGetMaxClockInfo', self.handle, clock_type)
clock_type = getattr(libnvml, 'NVML_CLOCK_{}'.format(name.replace('memory', 'mem').upper()))
clock = libnvml.nvmlQuery('nvmlDeviceGetMaxClockInfo', self.handle, clock_type)
clock_infos[name] = clock
self._max_clock_infos = ClockInfos(**clock_infos)
return self._max_clock_infos
@ -1228,7 +1228,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=fan.speed
"""
return nvml.nvmlQuery('nvmlDeviceGetFanSpeed', self.handle)
return libnvml.nvmlQuery('nvmlDeviceGetFanSpeed', self.handle)
@ttl_cache(ttl=5.0)
def temperature(self) -> Union[int, NaType]: # in Celsius
@ -1244,7 +1244,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=temperature.gpu
"""
return nvml.nvmlQuery('nvmlDeviceGetTemperature', self.handle, nvml.NVML_TEMPERATURE_GPU)
return libnvml.nvmlQuery('nvmlDeviceGetTemperature', self.handle, libnvml.NVML_TEMPERATURE_GPU)
@memoize_when_activated
@ttl_cache(ttl=5.0)
@ -1261,7 +1261,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
$(( "$(nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=power.draw)" * 1000 ))
"""
return nvml.nvmlQuery('nvmlDeviceGetPowerUsage', self.handle)
return libnvml.nvmlQuery('nvmlDeviceGetPowerUsage', self.handle)
power_draw = power_usage # in milliwatts (mW)
@ -1280,7 +1280,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
$(( "$(nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=power.limit)" * 1000 ))
"""
return nvml.nvmlQuery('nvmlDeviceGetPowerManagementLimit', self.handle)
return libnvml.nvmlQuery('nvmlDeviceGetPowerManagementLimit', self.handle)
def power_status(self) -> str: # string of power usage over power limit in watts (W)
"""The string of power usage over power limit in watts.
@ -1291,9 +1291,9 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
power_usage = self.power_usage()
power_limit = self.power_limit()
if nvml.nvmlCheckReturn(power_usage, int):
if libnvml.nvmlCheckReturn(power_usage, int):
power_usage = '{}W'.format(round(power_usage / 1000.0))
if nvml.nvmlCheckReturn(power_limit, int):
if libnvml.nvmlCheckReturn(power_limit, int):
power_limit = '{}W'.format(round(power_limit / 1000.0))
return '{} / {}'.format(power_usage, power_limit)
@ -1315,7 +1315,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=display_active
"""
return {0: 'Disabled', 1: 'Enabled'}.get(nvml.nvmlQuery('nvmlDeviceGetDisplayActive', self.handle), NA)
return {0: 'Disabled', 1: 'Enabled'}.get(libnvml.nvmlQuery('nvmlDeviceGetDisplayActive', self.handle), NA)
@ttl_cache(ttl=60.0)
def display_mode(self) -> Union[str, NaType]:
@ -1335,7 +1335,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=display_mode
"""
return {0: 'Disabled', 1: 'Enabled'}.get(nvml.nvmlQuery('nvmlDeviceGetDisplayMode', self.handle), NA)
return {0: 'Disabled', 1: 'Enabled'}.get(libnvml.nvmlQuery('nvmlDeviceGetDisplayMode', self.handle), NA)
@ttl_cache(ttl=60.0)
def current_driver_model(self) -> Union[str, NaType]:
@ -1358,9 +1358,9 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
"""
return {
nvml.NVML_DRIVER_WDDM: 'WDDM',
nvml.NVML_DRIVER_WDM: 'WDM',
}.get(nvml.nvmlQuery('nvmlDeviceGetCurrentDriverModel', self.handle), NA)
libnvml.NVML_DRIVER_WDDM: 'WDDM',
libnvml.NVML_DRIVER_WDM: 'WDM',
}.get(libnvml.nvmlQuery('nvmlDeviceGetCurrentDriverModel', self.handle), NA)
driver_model = current_driver_model
@ -1383,7 +1383,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=persistence_mode
"""
return {0: 'Disabled', 1: 'Enabled'}.get(nvml.nvmlQuery('nvmlDeviceGetPersistenceMode', self.handle), NA)
return {0: 'Disabled', 1: 'Enabled'}.get(libnvml.nvmlQuery('nvmlDeviceGetPersistenceMode', self.handle), NA)
@ttl_cache(ttl=5.0)
def performance_state(self) -> Union[str, NaType]:
@ -1400,8 +1400,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=pstate
"""
performance_state = nvml.nvmlQuery('nvmlDeviceGetPerformanceState', self.handle)
if nvml.nvmlCheckReturn(performance_state, int):
performance_state = libnvml.nvmlQuery('nvmlDeviceGetPerformanceState', self.handle)
if libnvml.nvmlCheckReturn(performance_state, int):
performance_state = 'P' + str(performance_state)
return performance_state
@ -1419,9 +1419,9 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=ecc.errors.uncorrected.volatile.total
""" # pylint: disable=line-too-long
return nvml.nvmlQuery('nvmlDeviceGetTotalEccErrors', self.handle,
nvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
nvml.NVML_VOLATILE_ECC)
return libnvml.nvmlQuery('nvmlDeviceGetTotalEccErrors', self.handle,
libnvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
libnvml.NVML_VOLATILE_ECC)
@ttl_cache(ttl=60.0)
def compute_mode(self) -> Union[str, NaType]:
@ -1443,18 +1443,18 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
""" # pylint: disable=line-too-long
return {
nvml.NVML_COMPUTEMODE_DEFAULT: 'Default',
nvml.NVML_COMPUTEMODE_EXCLUSIVE_THREAD: 'Exclusive Thread',
nvml.NVML_COMPUTEMODE_PROHIBITED: 'Prohibited',
nvml.NVML_COMPUTEMODE_EXCLUSIVE_PROCESS: 'Exclusive Process',
}.get(nvml.nvmlQuery('nvmlDeviceGetComputeMode', self.handle), NA)
libnvml.NVML_COMPUTEMODE_DEFAULT: 'Default',
libnvml.NVML_COMPUTEMODE_EXCLUSIVE_THREAD: 'Exclusive Thread',
libnvml.NVML_COMPUTEMODE_PROHIBITED: 'Prohibited',
libnvml.NVML_COMPUTEMODE_EXCLUSIVE_PROCESS: 'Exclusive Process',
}.get(libnvml.nvmlQuery('nvmlDeviceGetComputeMode', self.handle), NA)
def is_mig_device(self) -> bool:
"""Returns whether or not the device is a MIG device."""
if self._is_mig_device is None:
is_mig_device = nvml.nvmlQuery('nvmlDeviceIsMigDeviceHandle', self.handle,
default=False, ignore_function_not_found=True)
is_mig_device = libnvml.nvmlQuery('nvmlDeviceIsMigDeviceHandle', self.handle,
default=False, ignore_function_not_found=True)
self._is_mig_device = bool(is_mig_device) # nvmlDeviceIsMigDeviceHandle returns c_uint
return self._is_mig_device
@ -1477,8 +1477,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
if self.is_mig_device():
return NA
mig_mode = nvml.nvmlQuery('nvmlDeviceGetMigMode', self.handle,
default=(NA, NA), ignore_function_not_found=True)[0]
mig_mode = libnvml.nvmlQuery('nvmlDeviceGetMigMode', self.handle,
default=(NA, NA), ignore_function_not_found=True)[0]
return {0: 'Disabled', 1: 'Enabled'}.get(mig_mode, NA)
def is_mig_mode_enabled(self) -> bool:
@ -1528,7 +1528,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
for type, func in [('C', 'nvmlDeviceGetComputeRunningProcesses'), # pylint: disable=redefined-builtin
('G', 'nvmlDeviceGetGraphicsRunningProcesses')]:
for p in nvml.nvmlQuery(func, self.handle, default=()): # pylint: disable=invalid-name
for p in libnvml.nvmlQuery(func, self.handle, default=()): # pylint: disable=invalid-name
proc = processes[p.pid] = self.GPU_PROCESS_CLASS(
pid=p.pid, device=self,
gpu_memory=(p.usedGpuMemory if isinstance(p.usedGpuMemory, int)
@ -1539,7 +1539,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
proc.type = proc.type + type
if len(processes) > 0:
samples = nvml.nvmlQuery('nvmlDeviceGetProcessUtilization', self.handle, self._timestamp, default=())
samples = libnvml.nvmlQuery('nvmlDeviceGetProcessUtilization', self.handle, self._timestamp, default=())
self._timestamp = max(min((s.timeStamp for s in samples), default=0) - 500000, 0)
for s in samples: # pylint: disable=invalid-name
try:
@ -1667,15 +1667,15 @@ class PhysicalDevice(Device):
does not support MIG mode.
"""
return nvml.nvmlQuery('nvmlDeviceGetMaxMigDeviceCount', self.handle,
default=0, ignore_function_not_found=True)
return libnvml.nvmlQuery('nvmlDeviceGetMaxMigDeviceCount', self.handle,
default=0, ignore_function_not_found=True)
@ttl_cache(ttl=60.0)
def mig_device(self, mig_index: int) -> 'MigDevice':
"""Returns a child MIG device of the given index.
Raises:
nvml.NVMLError:
libnvml.NVMLError:
If the device does not support MIG mode or the given MIG device does not exist.
"""
@ -1696,7 +1696,7 @@ class PhysicalDevice(Device):
for mig_index in range(max_mig_device_count):
try:
mig_device = MigDevice(index=(self.index, mig_index))
except nvml.NVMLError:
except libnvml.NVMLError:
break
else:
mig_devices.append(mig_device)
@ -1787,22 +1787,22 @@ class MigDevice(Device): # pylint: disable=too-many-instance-attributes
self._parent = parent
if self.parent.handle is not None:
try:
self._handle = nvml.nvmlQuery('nvmlDeviceGetMigDeviceHandleByIndex',
self.parent.handle, self.mig_index, ignore_errors=False)
except nvml.NVMLError_GpuIsLost: # pylint: disable=no-member
self._handle = libnvml.nvmlQuery('nvmlDeviceGetMigDeviceHandleByIndex',
self.parent.handle, self.mig_index, ignore_errors=False)
except libnvml.NVMLError_GpuIsLost: # pylint: disable=no-member
pass
else:
self._handle = nvml.nvmlQuery('nvmlDeviceGetHandleByUUID', uuid, ignore_errors=False)
parent_handle = nvml.nvmlQuery('nvmlDeviceGetDeviceHandleFromMigDeviceHandle',
self.handle, ignore_errors=False)
parent_index = nvml.nvmlQuery('nvmlDeviceGetIndex', parent_handle, ignore_errors=False)
self._handle = libnvml.nvmlQuery('nvmlDeviceGetHandleByUUID', uuid, ignore_errors=False)
parent_handle = libnvml.nvmlQuery('nvmlDeviceGetDeviceHandleFromMigDeviceHandle',
self.handle, ignore_errors=False)
parent_index = libnvml.nvmlQuery('nvmlDeviceGetIndex', parent_handle, ignore_errors=False)
self._parent = PhysicalDevice(index=parent_index)
for mig_device in self.parent.mig_devices():
if self.uuid() == mig_device.uuid():
self._nvml_index = mig_device.index
break
else:
raise nvml.NVMLError_NotFound() # pylint: disable=no-member
raise libnvml.NVMLError_NotFound() # pylint: disable=no-member
self._max_clock_infos = ClockInfos(graphics=NA, sm=NA, memory=NA, video=NA)
self._timestamp = 0
@ -1843,8 +1843,8 @@ class MigDevice(Device): # pylint: disable=too-many-instance-attributes
"""
if self._gpu_instance_id is NA:
self._gpu_instance_id = nvml.nvmlQuery('nvmlDeviceGetGpuInstanceId', self.handle,
default=0xFFFFFFFF)
self._gpu_instance_id = libnvml.nvmlQuery('nvmlDeviceGetGpuInstanceId', self.handle,
default=0xFFFFFFFF)
if self._gpu_instance_id == 0xFFFFFFFF:
self._gpu_instance_id = NA
return self._gpu_instance_id
@ -1857,8 +1857,8 @@ class MigDevice(Device): # pylint: disable=too-many-instance-attributes
"""
if self._compute_instance_id is NA:
self._compute_instance_id = nvml.nvmlQuery('nvmlDeviceGetComputeInstanceId', self.handle,
default=0xFFFFFFFF)
self._compute_instance_id = libnvml.nvmlQuery('nvmlDeviceGetComputeInstanceId', self.handle,
default=0xFFFFFFFF)
if self._compute_instance_id == 0xFFFFFFFF:
self._compute_instance_id = NA
return self._compute_instance_id
@ -2016,9 +2016,9 @@ class CudaDevice(Device):
"""Initializes the instance created by ``__new__()``.
Raises:
nvml.NVMLError_NotFound:
libnvml.NVMLError_NotFound:
If the device is not found for the given NVML identifier.
nvml.NVMLError_InvalidArgument:
libnvml.NVMLError_InvalidArgument:
If the NVML index is out of range.
RuntimeError:
The given device is not visible to CUDA applications.

View file

@ -19,11 +19,13 @@ import pynvml
from nvitop.core.utils import NA, colored
__all__ = ['libnvml', 'nvml', 'nvmlCheckReturn', 'NVMLError']
__all__ = ['LibnvmlSingleton', 'libnvml', 'nvmlCheckReturn', 'NVMLError']
class libnvml:
"""The helper singleton class that holds members from package ``nvidia-ml-py``."""
class LibnvmlSingleton:
"""The helper singleton class that holds members from
package `nvidia-ml-py <https://pypi.org/project/nvidia-ml-py>`_.
"""
NVMLError = pynvml.NVMLError
"""Base exception class for NVML query errors."""
@ -35,8 +37,8 @@ class libnvml:
c_nvmlDevice_t = pynvml.c_nvmlDevice_t
def __new__(cls) -> 'libnvml':
"""Gets the singleton instance of :class:`libnvml`."""
def __new__(cls) -> 'LibnvmlSingleton':
"""Gets the singleton instance of :class:`LibnvmlSingleton`."""
if not hasattr(cls, '_instance'):
instance = cls._instance = super().__new__(cls)
@ -60,10 +62,10 @@ class libnvml:
try:
self.nvmlShutdown()
except nvml.NVMLError:
except libnvml.NVMLError:
pass
def __enter__(self) -> 'libnvml':
def __enter__(self) -> 'LibnvmlSingleton':
"""Entry of the context manager for ``with`` statement."""
self._lazy_init()
@ -131,7 +133,7 @@ class libnvml:
try:
pynvml.nvmlInitWithFlags(flags)
except nvml.NVMLError_LibraryNotFound: # pylint: disable=no-member
except libnvml.NVMLError_LibraryNotFound: # pylint: disable=no-member
message = '\n'.join((
'FATAL ERROR: NVIDIA Management Library (NVML) not found.',
'HINT: The NVIDIA Management Library ships with the NVIDIA display driver (available at',
@ -225,10 +227,10 @@ class libnvml:
try:
func = getattr(self, func)
except AttributeError as e1:
raise nvml.NVMLError_FunctionNotFound from e1 # pylint: disable=no-member
raise libnvml.NVMLError_FunctionNotFound from e1 # pylint: disable=no-member
retval = func(*args, **kwargs)
except nvml.NVMLError_FunctionNotFound as e2: # pylint: disable=no-member
except libnvml.NVMLError_FunctionNotFound as e2: # pylint: disable=no-member
if not ignore_function_not_found:
if identifier.__name__ == '<lambda>':
identifier = inspect.getsource(func)
@ -249,7 +251,7 @@ class libnvml:
if ignore_errors or ignore_function_not_found:
return default
raise
except nvml.NVMLError:
except libnvml.NVMLError:
if ignore_errors:
return default
raise
@ -267,9 +269,9 @@ class libnvml:
return retval != NA and isinstance(retval, types)
nvml = libnvml()
"""The singleton instance of class :class:`libnvml`."""
libnvml = LibnvmlSingleton()
"""The singleton instance of class :class:`LibnvmlSingleton`."""
nvmlCheckReturn = nvml.nvmlCheckReturn
nvmlCheckReturn = libnvml.nvmlCheckReturn
NVMLError = nvml.NVMLError
NVMLError = libnvml.NVMLError

View file

@ -16,7 +16,7 @@ from typing import List, Tuple, Dict, Iterable, Callable, Union, Optional, Type,
from weakref import WeakValueDictionary
from nvitop.core import host
from nvitop.core.libnvml import nvml
from nvitop.core.libnvml import libnvml
from nvitop.core.utils import (NA, NaType, Snapshot,
bytes2human, timedelta2human,
memoize_when_activated)
@ -602,7 +602,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
self._gpu_memory_human = bytes2human(self.gpu_memory()) # pylint: disable=attribute-defined-outside-init
memory_total = self.device.memory_total()
gpu_memory_percent = NA
if nvml.nvmlCheckReturn(memory_used, int) and nvml.nvmlCheckReturn(memory_total, int):
if libnvml.nvmlCheckReturn(memory_used, int) and libnvml.nvmlCheckReturn(memory_total, int):
gpu_memory_percent = round(100.0 * memory_used / memory_total, 1)
self._gpu_memory_percent = gpu_memory_percent # pylint: disable=attribute-defined-outside-init

View file

@ -3,7 +3,7 @@
# pylint: disable=missing-module-docstring,missing-class-docstring,missing-function-docstring
from nvitop.core import (nvml, PhysicalDevice as DeviceBase, MigDevice as MigDeviceBase,
from nvitop.core import (libnvml, PhysicalDevice as DeviceBase, MigDevice as MigDeviceBase,
NA, Snapshot, utilization2string)
from nvitop.gui.library.process import GpuProcess
@ -70,7 +70,7 @@ class Device(DeviceBase):
for mig_index in range(self.max_mig_device_count()):
try:
mig_device = MigDevice(index=(self.index, mig_index))
except nvml.NVMLError:
except libnvml.NVMLError:
break
else:
mig_devices.append(mig_device)
@ -91,7 +91,7 @@ class Device(DeviceBase):
def temperature_string(self): # in Celsius
temperature = self.temperature()
if nvml.nvmlCheckReturn(temperature, int):
if libnvml.nvmlCheckReturn(temperature, int):
temperature = str(temperature) + 'C'
return temperature