mirror of
https://github.com/XuehaiPan/nvitop.git
synced 2026-05-21 06:45:24 -06:00
2663 lines
103 KiB
Python
2663 lines
103 KiB
Python
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
|
|
#
|
|
# Copyright 2021-2023 Xuehai Pan. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
# ==============================================================================
|
|
"""The live classes for GPU devices.
|
|
|
|
The core classes are :class:`Device` and :class:`CudaDevice` (also aliased as :attr:`Device.cuda`).
|
|
The type of the instance returned by ``Class(args)`` depends on the given arguments.
|
|
|
|
``Device()`` returns:
|
|
|
|
.. code-block:: python
|
|
|
|
- (index: int) -> PhysicalDevice
|
|
- (index: (int, int)) -> MigDevice
|
|
- (uuid: str) -> Union[PhysicalDevice, MigDevice] # depending on the UUID value
|
|
- (bus_id: str) -> PhysicalDevice
|
|
|
|
``CudaDevice()`` returns:
|
|
|
|
.. code-block:: python
|
|
|
|
- (cuda_index: int) -> Union[CudaDevice, CudaMigDevice] # depending on `CUDA_VISIBLE_DEVICES`
|
|
- (uuid: str) -> Union[CudaDevice, CudaMigDevice] # depending on `CUDA_VISIBLE_DEVICES`
|
|
- (nvml_index: int) -> CudaDevice
|
|
- (nvml_index: (int, int)) -> CudaMigDevice
|
|
|
|
Examples:
|
|
>>> from nvitop import Device, CudaDevice
|
|
>>> Device.driver_version() # version of the installed NVIDIA display driver
|
|
'470.129.06'
|
|
|
|
>>> Device.count() # number of NVIDIA GPUs in the system
|
|
10
|
|
|
|
>>> Device.all() # all physical devices in the system
|
|
[
|
|
PhysicalDevice(index=0, ...),
|
|
PhysicalDevice(index=1, ...),
|
|
...
|
|
]
|
|
|
|
>>> nvidia0 = Device(index=0) # -> PhysicalDevice
|
|
>>> mig10 = Device(index=(1, 0)) # -> MigDevice
|
|
>>> nvidia2 = Device(uuid='GPU-xxxxxx') # -> PhysicalDevice
|
|
>>> mig30 = Device(uuid='MIG-xxxxxx') # -> MigDevice
|
|
|
|
>>> nvidia0.memory_free() # total free memory in bytes
|
|
11550654464
|
|
>>> nvidia0.memory_free_human() # total free memory in human readable format
|
|
'11016MiB'
|
|
|
|
>>> nvidia2.as_snapshot() # takes an onetime snapshot of the device
|
|
PhysicalDeviceSnapshot(
|
|
real=PhysicalDevice(index=2, ...),
|
|
...
|
|
)
|
|
|
|
>>> import os
|
|
>>> os.environ['CUDA_VISIBLE_DEVICES'] = '3,2,1,0'
|
|
|
|
>>> CudaDevice.count() # number of NVIDIA GPUs visible to CUDA applications
|
|
4
|
|
>>> Device.cuda.count() # use alias in class `Device`
|
|
4
|
|
|
|
>>> CudaDevice.all() # all CUDA visible devices (or `Device.cuda.all()`)
|
|
[
|
|
CudaDevice(cuda_index=0, nvml_index=3, ...),
|
|
CudaDevice(cuda_index=1, nvml_index=2, ...),
|
|
...
|
|
]
|
|
|
|
>>> cuda0 = CudaDevice(cuda_index=0) # use CUDA ordinal (or `Device.cuda(0)`)
|
|
>>> cuda1 = CudaDevice(nvml_index=2) # use NVML ordinal
|
|
>>> cuda2 = CudaDevice(uuid='GPU-xxxxxx') # use UUID string
|
|
|
|
>>> cuda0.memory_free() # total free memory in bytes
|
|
11550654464
|
|
>>> cuda0.memory_free_human() # total free memory in human readable format
|
|
'11016MiB'
|
|
|
|
>>> cuda1.as_snapshot() # takes an onetime snapshot of the device
|
|
CudaDeviceSnapshot(
|
|
real=CudaDevice(cuda_index=1, nvml_index=2, ...),
|
|
...
|
|
)
|
|
"""
|
|
|
|
# pylint: disable=too-many-lines
|
|
|
|
from __future__ import annotations
|
|
|
|
import contextlib
|
|
import functools
|
|
import multiprocessing as mp
|
|
import os
|
|
import re
|
|
import threading
|
|
from collections import OrderedDict
|
|
from typing import Any, Callable, Iterable, NamedTuple
|
|
|
|
from nvitop.api import libcuda, libcudart, libnvml
|
|
from nvitop.api.process import GpuProcess
|
|
from nvitop.api.utils import NA, NaType, Snapshot, boolify, bytes2human, memoize_when_activated
|
|
|
|
|
|
# Names exported by ``from <this module> import *``: the device classes plus the two
# ``CUDA_VISIBLE_DEVICES`` helper functions.
__all__ = [
    'Device',
    'PhysicalDevice',
    'MigDevice',
    'CudaDevice',
    'CudaMigDevice',
    'parse_cuda_visible_devices',
    'normalize_cuda_visible_devices',
]
|
|
|
|
# Class definitions ################################################################################
|
|
|
|
|
|
class MemoryInfo(NamedTuple):  # in bytes
    """Memory information of a device; every field is expressed in bytes.

    Each field holds an int, or ``NA`` (of type ``NaType``) when the value is not applicable.
    """

    total: int | NaType  # total installed GPU memory
    free: int | NaType  # total free memory
    used: int | NaType  # total memory allocated by active contexts
|
|
|
|
|
|
class ClockInfos(NamedTuple):  # in MHz
    """Clock values of a device, one field per clock domain; every field is expressed in MHz.

    Each field holds an int, or ``NA`` (of type ``NaType``) when the value is not applicable.
    """

    graphics: int | NaType
    sm: int | NaType
    memory: int | NaType
    video: int | NaType
|
|
|
|
|
|
class ClockSpeedInfos(NamedTuple):
    """Pair of :class:`ClockInfos` tuples: the current clocks and the maximum clocks."""

    current: ClockInfos  # clocks at the time of the query
    max: ClockInfos  # maximum clocks
|
|
|
|
|
|
class UtilizationRates(NamedTuple):  # in percentage
    """Utilization rates of a device; every field is expressed as a percentage.

    Each field holds an int, or ``NA`` (of type ``NaType``) when the value is not applicable.
    """

    gpu: int | NaType
    memory: int | NaType
    encoder: int | NaType
    decoder: int | NaType
|
|
|
|
|
|
# Sentinel used as a parameter default so that "argument not passed" can be told apart from an
# explicitly passed ``None`` (see the ``cuda_visible_devices`` parameters below).
_VALUE_OMITTED = object()
|
|
|
|
|
|
class Device: # pylint: disable=too-many-instance-attributes,too-many-public-methods
|
|
"""Live class of the GPU devices, different from the device snapshots.
|
|
|
|
:meth:`Device.__new__()` returns different types depending on the given arguments.
|
|
|
|
.. code-block:: python
|
|
|
|
- (index: int) -> PhysicalDevice
|
|
- (index: (int, int)) -> MigDevice
|
|
- (uuid: str) -> Union[PhysicalDevice, MigDevice] # depending on the UUID value
|
|
- (bus_id: str) -> PhysicalDevice
|
|
|
|
Examples:
|
|
>>> Device.driver_version() # version of the installed NVIDIA display driver
|
|
'470.129.06'
|
|
|
|
>>> Device.count() # number of NVIDIA GPUs in the system
|
|
10
|
|
|
|
>>> Device.all() # all physical devices in the system
|
|
[
|
|
PhysicalDevice(index=0, ...),
|
|
PhysicalDevice(index=1, ...),
|
|
...
|
|
]
|
|
|
|
>>> nvidia0 = Device(index=0) # -> PhysicalDevice
|
|
>>> mig10 = Device(index=(1, 0)) # -> MigDevice
|
|
>>> nvidia2 = Device(uuid='GPU-xxxxxx') # -> PhysicalDevice
|
|
>>> mig30 = Device(uuid='MIG-xxxxxx') # -> MigDevice
|
|
|
|
>>> nvidia0.memory_free() # total free memory in bytes
|
|
11550654464
|
|
>>> nvidia0.memory_free_human() # total free memory in human readable format
|
|
'11016MiB'
|
|
|
|
>>> nvidia2.as_snapshot() # takes an onetime snapshot of the device
|
|
PhysicalDeviceSnapshot(
|
|
real=PhysicalDevice(index=2, ...),
|
|
...
|
|
)
|
|
|
|
Raises:
|
|
libnvml.NVMLError_LibraryNotFound:
|
|
If cannot find the NVML library, usually the NVIDIA driver is not installed.
|
|
libnvml.NVMLError_DriverNotLoaded:
|
|
If NVIDIA driver is not loaded.
|
|
libnvml.NVMLError_LibRmVersionMismatch:
|
|
If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
|
|
driver without reloading the kernel module.
|
|
libnvml.NVMLError_NotFound:
|
|
If the device is not found for the given NVML identifier.
|
|
libnvml.NVMLError_InvalidArgument:
|
|
If the device index is out of range.
|
|
TypeError:
|
|
If the number of non-None arguments is not exactly 1.
|
|
TypeError:
|
|
If the given index is a tuple but does not consist of two integers.
|
|
"""
|
|
|
|
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars
|
|
# https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices
|
|
# GPU UUID : `GPU-<GPU-UUID>`
|
|
# MIG UUID : `MIG-GPU-<GPU-UUID>/<GPU instance ID>/<compute instance ID>`
|
|
# MIG UUID (R470+): `MIG-<MIG-UUID>`
|
|
UUID_PATTERN = re.compile(
|
|
r"""^ # full match
|
|
(?:(?P<MigMode>MIG)-)? # prefix for MIG UUID
|
|
(?:(?P<GpuUuid>GPU)-)? # prefix for GPU UUID
|
|
(?(MigMode)|(?(GpuUuid)|GPU-)) # always have a prefix
|
|
(?P<UUID>[0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12}) # UUID for the GPU/MIG device in lower case
|
|
# Suffix for MIG device while using GPU UUID with GPU instance (GI) ID and compute instance (CI) ID
|
|
(?(MigMode) # match only when the MIG prefix matches
|
|
(?(GpuUuid) # match only when provide with GPU UUID
|
|
/(?P<GpuInstanceId>\d+) # GI ID of the MIG device
|
|
/(?P<ComputeInstanceId>\d+) # CI ID of the MIG device
|
|
|)
|
|
|)
|
|
$""", # full match
|
|
flags=re.VERBOSE,
|
|
)
|
|
|
|
GPU_PROCESS_CLASS = GpuProcess
|
|
cuda = None # defined in below
|
|
"""Shortcut for class :class:`CudaDevice`."""
|
|
|
|
@classmethod
def is_available(cls) -> bool:
    """Report whether the NVML library loads successfully and at least one device exists.

    Returns: bool
        :data:`True` when :meth:`count` reports one or more devices; :data:`False` when it
        reports none or when the query raises :class:`libnvml.NVMLError`.
    """
    try:
        device_count = cls.count()
    except libnvml.NVMLError:
        # Any NVML failure (missing library, driver not loaded, ...) means "not available".
        return False
    return device_count > 0
|
|
|
|
@staticmethod
def driver_version() -> str | NaType:
    """Query the version of the installed NVIDIA display driver (an alphanumeric string).

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=0 --format=csv,noheader,nounits --query-gpu=driver_version

    Returns: Union[str, NaType]
        The result of the ``nvmlSystemGetDriverVersion`` query.

    Raises:
        libnvml.NVMLError_LibraryNotFound:
            If cannot find the NVML library, usually the NVIDIA driver is not installed.
        libnvml.NVMLError_DriverNotLoaded:
            If NVIDIA driver is not loaded.
        libnvml.NVMLError_LibRmVersionMismatch:
            If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
            driver without reloading the kernel module.
    """
    version = libnvml.nvmlQuery('nvmlSystemGetDriverVersion')
    return version
|
|
|
|
@staticmethod
def cuda_driver_version() -> str | NaType:
    """Query the maximum CUDA version supported by the NVIDIA display driver.

    The value is an alphanumeric string. It can differ from the version of the CUDA Runtime,
    see also :meth:`cuda_runtime_version`.

    Returns: Union[str, NaType]
        The maximum CUDA version supported by the NVIDIA display driver, formatted as
        ``'<major>.<minor>'`` or, when the revision digit is non-zero,
        ``'<major>.<minor>.<revision>'``; :const:`nvitop.NA` if the query result is not an int.

    Raises:
        libnvml.NVMLError_LibraryNotFound:
            If cannot find the NVML library, usually the NVIDIA driver is not installed.
        libnvml.NVMLError_DriverNotLoaded:
            If NVIDIA driver is not loaded.
        libnvml.NVMLError_LibRmVersionMismatch:
            If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
            driver without reloading the kernel module.
    """
    encoded = libnvml.nvmlQuery('nvmlSystemGetCudaDriverVersion')
    if not libnvml.nvmlCheckReturn(encoded, int):
        return NA

    # Decode the integer: thousands -> major, tens -> minor, last digit -> revision.
    major, remainder = divmod(encoded, 1000)
    minor, revision = divmod(remainder, 10)
    if revision == 0:
        return f'{major}.{minor}'
    return f'{major}.{minor}.{revision}'
|
|
|
|
max_cuda_version = cuda_driver_version
|
|
|
|
@staticmethod
def cuda_runtime_version() -> str | NaType:
    """Query the CUDA Runtime version (an alphanumeric string).

    The value can differ from the CUDA driver version, see also :meth:`cuda_driver_version`.

    Returns: Union[str, NaType]
        The CUDA Runtime version, or :const:`nvitop.NA` when no CUDA Runtime is available or no
        CUDA-capable devices are present.
    """
    try:
        runtime_version = libcudart.cudaRuntimeGetVersion()
    except libcudart.cudaError:
        # Any CUDA Runtime error is reported as "not applicable" instead of propagating.
        return NA
    return runtime_version
|
|
|
|
cudart_version = cuda_runtime_version
|
|
|
|
@classmethod
def count(cls) -> int:
    """Query the number of NVIDIA GPUs in the system.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=0 --format=csv,noheader,nounits --query-gpu=count

    Returns: int
        The result of the ``nvmlDeviceGetCount`` query; the query is issued with ``default=0``.

    Raises:
        libnvml.NVMLError_LibraryNotFound:
            If cannot find the NVML library, usually the NVIDIA driver is not installed.
        libnvml.NVMLError_DriverNotLoaded:
            If NVIDIA driver is not loaded.
        libnvml.NVMLError_LibRmVersionMismatch:
            If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
            driver without reloading the kernel module.
    """
    device_count = libnvml.nvmlQuery('nvmlDeviceGetCount', default=0)
    return device_count
|
|
|
|
@classmethod
def all(cls) -> list[PhysicalDevice]:
    """Return a list of all physical devices in the system.

    This is :meth:`from_indices` called without indices.
    """
    devices = cls.from_indices()
    return devices
|
|
|
|
@classmethod
def from_indices(
    cls,
    indices: int | Iterable[int | tuple[int, int]] | None = None,
) -> list[PhysicalDevice | MigDevice]:
    """Build the devices that correspond to the given NVML indices.

    Args:
        indices (Iterable[Union[int, Tuple[int, int]]]):
            Indices of the devices. A single int is treated as a one-element list. For each
            index, get :class:`PhysicalDevice` for single int and :class:`MigDevice` for tuple
            (int, int). That is:
            - (int) -> PhysicalDevice
            - ((int, int)) -> MigDevice
            If omitted, every index in ``range(cls.count())`` is used; an empty list is returned
            when that count query raises :class:`libnvml.NVMLError`.

    Returns: List[Union[PhysicalDevice, MigDevice]]
        A list of :class:`PhysicalDevice` and/or :class:`MigDevice` instances of the given indices.

    Raises:
        libnvml.NVMLError_LibraryNotFound:
            If cannot find the NVML library, usually the NVIDIA driver is not installed.
        libnvml.NVMLError_DriverNotLoaded:
            If NVIDIA driver is not loaded.
        libnvml.NVMLError_LibRmVersionMismatch:
            If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
            driver without reloading the kernel module.
        libnvml.NVMLError_NotFound:
            If the device is not found for the given NVML identifier.
        libnvml.NVMLError_InvalidArgument:
            If the device index is out of range.
    """
    if indices is None:
        try:
            device_count = cls.count()
        except libnvml.NVMLError:
            return []
        indices = range(device_count)
    elif isinstance(indices, int):
        indices = [indices]

    return [cls(index) for index in indices]
|
|
|
|
@staticmethod
def from_cuda_visible_devices() -> list[CudaDevice]:
    """Build the list of all CUDA visible devices.

    The CUDA ordinal of each device is its position in the list parsed from the
    ``CUDA_VISIBLE_DEVICES`` environment variable.

    Note:
        The result could be empty if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid.

    See also for CUDA Device Enumeration:
        - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_
        - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_

    Returns: List[CudaDevice]
        A list of :class:`CudaDevice` instances.
    """  # pylint: disable=line-too-long
    nvml_indices = Device.parse_cuda_visible_devices()
    return [
        CudaDevice(cuda_index, nvml_index=nvml_index)
        for cuda_index, nvml_index in enumerate(nvml_indices)
    ]
|
|
|
|
@staticmethod
def from_cuda_indices(
    cuda_indices: int | Iterable[int] | None = None,
) -> list[CudaDevice]:
    """Select CUDA devices by their CUDA ordinals.

    The CUDA ordinals are enumerated from the ``CUDA_VISIBLE_DEVICES`` environment variable.

    See also for CUDA Device Enumeration:
        - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_
        - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_

    Args:
        cuda_indices (Iterable[int]):
            The indices of the GPU in CUDA ordinal, if not given, returns all visible CUDA devices.
            A single int is treated as a one-element list.

    Returns: List[CudaDevice]
        A list of :class:`CudaDevice` of the given CUDA indices.

    Raises:
        libnvml.NVMLError_LibraryNotFound:
            If cannot find the NVML library, usually the NVIDIA driver is not installed.
        libnvml.NVMLError_DriverNotLoaded:
            If NVIDIA driver is not loaded.
        libnvml.NVMLError_LibRmVersionMismatch:
            If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
            driver without reloading the kernel module.
        RuntimeError:
            If the index is out of range for the given ``CUDA_VISIBLE_DEVICES`` environment variable.
    """  # pylint: disable=line-too-long
    visible_devices = Device.from_cuda_visible_devices()
    if cuda_indices is None:
        return visible_devices

    requested = [cuda_indices] if isinstance(cuda_indices, int) else list(cuda_indices)
    num_visible = len(visible_devices)

    selected = []
    for ordinal in requested:
        # Negative ordinals are rejected too; no wrap-around indexing into the visible list.
        if not 0 <= ordinal < num_visible:
            raise RuntimeError(f'CUDA Error: invalid device ordinal: {ordinal!r}.')
        selected.append(visible_devices[ordinal])

    return selected
|
|
|
|
@staticmethod
def parse_cuda_visible_devices(
    cuda_visible_devices: str | None = _VALUE_OMITTED,
) -> list[int] | list[tuple[int, int]]:
    """Parse the given ``CUDA_VISIBLE_DEVICES`` value into a list of NVML device indices.

    This is an alias of the module-level function :func:`parse_cuda_visible_devices`, to which
    the argument is forwarded unchanged.

    Note:
        The result could be empty if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid.

    See also for CUDA Device Enumeration:
        - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_
        - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_

    Args:
        cuda_visible_devices (Optional[str]):
            The value of the ``CUDA_VISIBLE_DEVICES`` variable. If not given, the value from the
            environment will be used. If explicitly given by :data:`None`, the ``CUDA_VISIBLE_DEVICES``
            environment variable will be unset before parsing.

    Returns: Union[List[int], List[Tuple[int, int]]]
        A list of int (physical device) or a list of tuple of two integers (MIG device) for the
        corresponding real device indices.
    """  # pylint: disable=line-too-long
    # Inside the function body this name resolves to the module-level function, not this method.
    nvml_indices = parse_cuda_visible_devices(cuda_visible_devices)
    return nvml_indices
|
|
|
|
@staticmethod
def normalize_cuda_visible_devices(cuda_visible_devices: str | None = _VALUE_OMITTED) -> str:
    """Parse the given ``CUDA_VISIBLE_DEVICES`` value and convert it into a comma-separated string of UUIDs.

    This is an alias of the module-level function :func:`normalize_cuda_visible_devices`, to
    which the argument is forwarded unchanged.

    Note:
        The result could be empty string if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid.

    See also for CUDA Device Enumeration:
        - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_
        - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_

    Args:
        cuda_visible_devices (Optional[str]):
            The value of the ``CUDA_VISIBLE_DEVICES`` variable. If not given, the value from the
            environment will be used. If explicitly given by :data:`None`, the ``CUDA_VISIBLE_DEVICES``
            environment variable will be unset before parsing.

    Returns: str
        The comma-separated string (GPU UUIDs) of the ``CUDA_VISIBLE_DEVICES`` environment variable.
    """  # pylint: disable=line-too-long
    # Inside the function body this name resolves to the module-level function, not this method.
    normalized = normalize_cuda_visible_devices(cuda_visible_devices)
    return normalized
|
|
|
|
def __new__(
    cls,
    index: int | tuple[int, int] | str | None = None,
    *,
    uuid: str | None = None,
    bus_id: str | None = None,
) -> Device:
    """Create a new instance of Device.

    The concrete type of the result is chosen from the given argument.

    .. code-block:: python

        - (index: int)        -> PhysicalDevice
        - (index: (int, int)) -> MigDevice
        - (uuid: str)         -> Union[PhysicalDevice, MigDevice]  # depending on the UUID value
        - (bus_id: str)       -> PhysicalDevice

    A string passed as ``index`` that matches :attr:`UUID_PATTERN` is handled as if it had been
    passed as ``uuid``. When called on a subclass, an instance of that subclass is created
    without any dispatching.

    Note: This method takes exact 1 non-None argument.

    Returns: Union[PhysicalDevice, MigDevice]
        A :class:`PhysicalDevice` instance or a :class:`MigDevice` instance.

    Raises:
        TypeError:
            If the number of non-None arguments is not exactly 1.
        TypeError:
            If the given index is neither an integer, a tuple, nor a valid UUID string.
        TypeError:
            If the given index is a tuple but does not consist of two integers.
    """
    if (index, uuid, bus_id).count(None) != 2:
        raise TypeError(
            f'Device(index=None, uuid=None, bus_id=None) takes 1 non-None arguments '
            f'but (index, uuid, bus_id) = {(index, uuid, bus_id)!r} were given',
        )

    # Subclasses are instantiated directly; only the base class dispatches on the argument.
    if cls is not Device:
        return super().__new__(cls)

    uuid_match = None
    if isinstance(index, str):
        uuid_match = cls.UUID_PATTERN.match(index)
        if uuid_match is not None:  # passed by UUID
            index, uuid = None, index
    elif isinstance(uuid, str):
        uuid_match = cls.UUID_PATTERN.match(uuid)

    if index is None:
        # Identified by UUID or bus ID: only a UUID carrying the MIG prefix yields a MigDevice.
        is_mig_uuid = (
            uuid is not None
            and uuid_match is not None
            and uuid_match.group('MigMode') is not None
        )
        return super().__new__(MigDevice if is_mig_uuid else PhysicalDevice)

    if isinstance(index, int):
        return super().__new__(PhysicalDevice)

    if not isinstance(index, tuple):
        raise TypeError(
            f'index must be an integer, or a tuple of two integers, or a valid UUID string, '
            f'but index = {index!r} was given',
        )
    if not (len(index) == 2 and isinstance(index[0], int) and isinstance(index[1], int)):
        raise TypeError(
            f'index for MIG device must be a tuple of two integers '
            f'but index = {index!r} was given',
        )
    return super().__new__(MigDevice)
|
|
|
|
def __init__(
    self,
    index: int | str | None = None,
    *,
    uuid: str | None = None,
    bus_id: str | None = None,
) -> None:
    """Initialize the instance created by :meth:`__new__()`.

    The NVML handle is looked up by index, by UUID, or by PCI bus ID, depending on which
    argument is given. If the lookup raises ``NVMLError_GpuIsLost`` or ``NVMLError_Unknown``,
    the handle is set to :data:`None` and the device name is set to an ``'ERROR: ...'`` string
    instead of propagating the error.

    Raises:
        libnvml.NVMLError_LibraryNotFound:
            If cannot find the NVML library, usually the NVIDIA driver is not installed.
        libnvml.NVMLError_DriverNotLoaded:
            If NVIDIA driver is not loaded.
        libnvml.NVMLError_LibRmVersionMismatch:
            If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
            driver without reloading the kernel module.
        libnvml.NVMLError_NotFound:
            If the device is not found for the given NVML identifier.
        libnvml.NVMLError_InvalidArgument:
            If the device index is out of range.
    """

    def as_bytes(value: Any) -> Any:
        # String identifiers are encoded to bytes before being passed to the NVML queries.
        return value.encode() if isinstance(value, str) else value

    if isinstance(index, str) and self.UUID_PATTERN.match(index) is not None:  # passed by UUID
        index, uuid = None, index

    index, uuid, bus_id = as_bytes(index), as_bytes(uuid), as_bytes(bus_id)

    # Lazily filled caches; `NA` / `None` mean "not queried yet".
    self._name = NA
    self._uuid = NA
    self._bus_id = NA
    self._memory_total = NA
    self._memory_total_human = NA
    self._is_mig_device = None
    self._cuda_index = None
    self._cuda_compute_capability = None

    # Pick the NVML lookup function that matches the identifier we were given.
    if index is not None:
        self._nvml_index = index
        lookup, identifier = 'nvmlDeviceGetHandleByIndex', index
    elif uuid is not None:
        lookup, identifier = 'nvmlDeviceGetHandleByUUID', uuid
    else:
        lookup, identifier = 'nvmlDeviceGetHandleByPciBusId', bus_id

    error_name = None
    try:
        self._handle = libnvml.nvmlQuery(lookup, identifier, ignore_errors=False)
    except libnvml.NVMLError_GpuIsLost:
        error_name = 'ERROR: GPU is Lost'
    except libnvml.NVMLError_Unknown:
        error_name = 'ERROR: Unknown'

    if error_name is not None:
        # Keep a handle-less placeholder device whose name carries the error.
        self._handle = None
        if index is None:
            self._nvml_index = NA
        self._name = error_name
    elif index is None:
        # Looked up by UUID / bus ID: ask NVML which index the handle corresponds to.
        self._nvml_index = libnvml.nvmlQuery('nvmlDeviceGetIndex', self._handle)

    self._max_clock_infos = ClockInfos(graphics=NA, sm=NA, memory=NA, video=NA)
    self._timestamp = 0
    self._lock = threading.RLock()

    # Identity used by `__eq__` and `__hash__`; the hash itself is computed on first use.
    self._ident = (self.index, self.uuid())
    self._hash = None
|
|
|
|
def __repr__(self) -> str:
    """Return a string representation of the device.

    The representation shows the class name, the NVML index, the product name and the total
    memory in human readable format.
    """
    template = '{}(index={}, name="{}", total_memory={})'
    class_name = self.__class__.__name__
    return template.format(class_name, self.index, self.name(), self.memory_total_human())
|
|
|
|
def __eq__(self, other: object) -> bool:
    """Test equality to other object.

    Two devices are equal when their ``_ident`` tuples (set in ``__init__``) are equal. For a
    non-:class:`Device` operand, :data:`NotImplemented` is returned so that Python can try the
    reflected comparison.
    """
    if isinstance(other, Device):
        return self._ident == other._ident
    return NotImplemented
|
|
|
|
def __hash__(self) -> int:
    """Return a hash value of the device.

    The hash of ``_ident`` is computed on the first call and cached in ``_hash``.
    """
    cached = self._hash
    if cached is None:
        cached = self._hash = hash(self._ident)
    return cached
|
|
|
|
def __getattr__(self, name: str) -> Any | Callable[..., Any]:
    """Get the object attribute.

    If the attribute is not defined, make a method from ``pynvml.nvmlDeviceGet<AttributeName>(handle)``.
    The attribute name will be converted to PascalCase string. The generated method is stored on
    the instance with :func:`setattr`, so later lookups of the same name find it directly.

    If the device handle is :data:`None`, a callable returning :const:`nvitop.NA` is returned
    instead. The generated method also returns :const:`nvitop.NA` when the underlying NVML call
    raises ``NVMLError_NotSupported``.

    Raises:
        AttributeError:
            If the attribute is not defined in ``pynvml.py``, or if ``name`` is ``_cache`` or
            ``_handle`` and that attribute has not been set on the instance.

    Examples:
        >>> device = Device(0)

        >>> # Method `cuda_compute_capability` is not implemented in the class definition
        >>> PhysicalDevice.cuda_compute_capability
        AttributeError: type object 'Device' has no attribute 'cuda_compute_capability'

        >>> # Dynamically create a new method from `pynvml.nvmlDeviceGetCudaComputeCapability(device.handle, *args, **kwargs)`
        >>> device.cuda_compute_capability
        <function PhysicalDevice.cuda_compute_capability at 0x7fbfddf5d9d0>

        >>> device.cuda_compute_capability()
        (8, 6)
    """  # pylint: disable=line-too-long
    try:
        return super().__getattr__(name)
    except AttributeError:
        # Never synthesize `_cache` (presumably owned by `memoize_when_activated` -- confirm)
        # or `_handle`. This hook reads `self._handle` right below, so if `_handle` itself were
        # missing (e.g. on a partially initialized instance) the lookup would re-enter this
        # method and recurse without bound; re-raising keeps it a plain AttributeError.
        if name in ('_cache', '_handle'):
            raise
        if self._handle is None:
            return lambda: NA

        # Split off a version suffix, if any, so it can be re-attached after the case conversion.
        match = libnvml.VERSIONED_PATTERN.match(name)
        if match is not None:
            name = match.group('name')
            suffix = match.group('suffix')
        else:
            suffix = ''

        try:
            pascal_case = name.title().replace('_', '')
            func = getattr(libnvml, 'nvmlDeviceGet' + pascal_case + suffix)
        except AttributeError:
            # Second attempt: only upper-case the first letter of each part and keep the rest
            # of the part as written (`str.title` lower-cases the remaining letters).
            pascal_case = ''.join(
                part[:1].upper() + part[1:] for part in filter(None, name.split('_'))
            )
            func = getattr(libnvml, 'nvmlDeviceGet' + pascal_case + suffix)

        def attribute(*args: Any, **kwargs: Any) -> Any:
            try:
                return libnvml.nvmlQuery(
                    func,
                    self._handle,
                    *args,
                    **kwargs,
                    ignore_errors=False,
                )
            except libnvml.NVMLError_NotSupported:
                return NA

        attribute.__name__ = name
        attribute.__qualname__ = f'{self.__class__.__name__}.{name}'
        setattr(self, name, attribute)
        return attribute
|
|
|
|
def __reduce__(self) -> tuple[type[Device], tuple[int | tuple[int, int]]]:
    """Return state information for pickling.

    The device is rebuilt by calling its class with the NVML index as the only argument.
    """
    constructor = self.__class__
    constructor_args = (self._nvml_index,)
    return constructor, constructor_args
|
|
|
|
@property
def index(self) -> int | tuple[int, int]:
    """The NVML index of the device (the stored ``_nvml_index`` value).

    Returns: Union[int, Tuple[int, int]]
        Returns an int for physical device and tuple of two integers for MIG device.
    """
    nvml_index = self._nvml_index
    return nvml_index
|
|
|
|
@property
def nvml_index(self) -> int | tuple[int, int]:
    """The NVML index of the device; reads the same ``_nvml_index`` value as ``index``.

    Returns: Union[int, Tuple[int, int]]
        Returns an int for physical device and tuple of two integers for MIG device.
    """
    stored_index = self._nvml_index
    return stored_index
|
|
|
|
@property
def physical_index(self) -> int:
    """The index of the physical device.

    Returns: int
        An int for the physical device index. For MIG devices, returns the index of the parent
        physical device.
    """
    # This base implementation returns the NVML index as-is; it will be overridden in MigDevice.
    physical = self._nvml_index
    return physical
|
|
|
|
@property
def handle(self) -> libnvml.c_nvmlDevice_t:
    """The NVML device handle stored on the instance as ``_handle``."""
    device_handle = self._handle
    return device_handle
|
|
|
|
@property
def cuda_index(self) -> int:
    """The CUDA device index.

    The value is evaluated on the first access, as the position of this device's NVML index in
    the list returned by :meth:`parse_cuda_visible_devices`, and cached afterwards.

    Raises:
        RuntimeError:
            If the current device is not visible to CUDA applications (i.e. not listed in the
            ``CUDA_VISIBLE_DEVICES`` environment variable or the environment variable is invalid).
    """
    if self._cuda_index is not None:
        return self._cuda_index

    visible_nvml_indices = self.parse_cuda_visible_devices()
    try:
        ordinal = visible_nvml_indices.index(self.index)
    except ValueError as ex:
        raise RuntimeError(
            f'CUDA Error: Device(index={self.index}) is not visible to CUDA applications',
        ) from ex

    self._cuda_index = ordinal
    return ordinal
|
|
|
|
def name(self) -> str | NaType:
    """The official product name of the GPU. This is an alphanumeric string. For all products.

    The value is queried with ``nvmlDeviceGetName`` while the cached name is still
    :const:`nvitop.NA`, and is served from the cache afterwards.

    Returns: Union[str, NaType]
        The official product name, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=name
    """
    if self._name is not NA:
        return self._name

    self._name = libnvml.nvmlQuery('nvmlDeviceGetName', self.handle)
    return self._name
|
|
|
|
def uuid(self) -> str | NaType:
    """This value is the globally unique immutable alphanumeric identifier of the GPU.

    It does not correspond to any physical label on the board. The value is queried with
    ``nvmlDeviceGetUUID`` while the cached UUID is still :const:`nvitop.NA`, and is served from
    the cache afterwards.

    Returns: Union[str, NaType]
        The UUID of the device, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=uuid
    """
    if self._uuid is NA:
        self._uuid = libnvml.nvmlQuery('nvmlDeviceGetUUID', self.handle)
    return self._uuid
|
|
|
|
def bus_id(self) -> str | NaType:
    """PCI bus ID as "domain:bus:device.function", in hex.

    The value is the ``busId`` field of ``nvmlDeviceGetPciInfo``; it is queried while the cached
    bus ID is still :const:`nvitop.NA`, and is served from the cache afterwards.

    Returns: Union[str, NaType]
        The PCI bus ID of the device, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=pci.bus_id
    """
    if self._bus_id is not NA:
        return self._bus_id

    self._bus_id = libnvml.nvmlQuery(
        lambda handle: libnvml.nvmlDeviceGetPciInfo(handle).busId,
        self.handle,
    )
    return self._bus_id
|
|
|
|
def serial(self) -> str | NaType:
    """This number matches the serial number physically printed on each board.

    It is a globally unique immutable alphanumeric value. The value is queried with
    ``nvmlDeviceGetSerial`` on every call.

    Returns: Union[str, NaType]
        The serial number of the device, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=serial
    """
    serial_number = libnvml.nvmlQuery('nvmlDeviceGetSerial', self.handle)
    return serial_number
|
|
|
|
@memoize_when_activated
def memory_info(self) -> MemoryInfo:  # in bytes
    """Return a named tuple with memory information (in bytes) for the device.

    Returns: MemoryInfo(total, free, used)
        A named tuple with memory information, the item could be :const:`nvitop.NA` when not applicable.
    """
    raw_info = libnvml.nvmlQuery('nvmlDeviceGetMemoryInfo', self.handle)
    if not libnvml.nvmlCheckReturn(raw_info):
        # The query did not produce a usable result: report every field as not applicable.
        return MemoryInfo(total=NA, free=NA, used=NA)
    return MemoryInfo(total=raw_info.total, free=raw_info.free, used=raw_info.used)
|
|
|
|
def memory_total(self) -> int | NaType:  # in bytes
    """Total installed GPU memory in bytes.

    The value is taken from :meth:`memory_info` while the cached total is still
    :const:`nvitop.NA`, and is served from the cache afterwards.

    Returns: Union[int, NaType]
        Total installed GPU memory in bytes, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=memory.total
    """
    if self._memory_total is not NA:
        return self._memory_total

    self._memory_total = self.memory_info().total
    return self._memory_total
|
|
|
|
def memory_used(self) -> int | NaType:  # in bytes
    """Total memory allocated by active contexts in bytes.

    This is the ``used`` field of :meth:`memory_info`.

    Returns: Union[int, NaType]
        Total memory allocated by active contexts in bytes, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=memory.used
    """
    info = self.memory_info()
    return info.used
|
|
|
|
def memory_free(self) -> int | NaType:  # in bytes
    """Total free memory in bytes.

    This is the ``free`` field of :meth:`memory_info`.

    Returns: Union[int, NaType]
        Total free memory in bytes, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=memory.free
    """
    info = self.memory_info()
    return info.free
|
|
|
|
def memory_total_human(self) -> str | NaType:  # in human readable
    """Total installed GPU memory in human readable format.

    Returns: Union[str, NaType]
        Total installed GPU memory in human readable format, or :const:`nvitop.NA` when not applicable.
    """
    # Format again only while the stored string is still N/A; otherwise reuse it.
    if self._memory_total_human is NA:
        self._memory_total_human = bytes2human(self.memory_total())
    return self._memory_total_human
|
|
|
|
def memory_used_human(self) -> str | NaType:  # in human readable
    """Total memory allocated by active contexts in human readable format.

    Returns: Union[str, NaType]
        Total memory allocated by active contexts in human readable format, or :const:`nvitop.NA` when not applicable.
    """  # pylint: disable=line-too-long
    return bytes2human(self.memory_used())
|
|
|
|
def memory_free_human(self) -> str | NaType:  # in human readable
    """Total free memory in human readable format.

    Returns: Union[str, NaType]
        Total free memory in human readable format, or :const:`nvitop.NA` when not applicable.
    """
    return bytes2human(self.memory_free())
|
|
|
|
def memory_percent(self) -> float | NaType:  # in percentage
    """The percentage of used memory over total memory (``0 <= p <= 100``).

    Returns: Union[float, NaType]
        The percentage of used memory over total memory, or :const:`nvitop.NA` when not applicable
        (including when the reported total is zero).
    """
    memory_info = self.memory_info()
    if (
        libnvml.nvmlCheckReturn(memory_info.used, int)
        and libnvml.nvmlCheckReturn(memory_info.total, int)
        and memory_info.total > 0  # guard the division below against a zero total
    ):
        return round(100.0 * memory_info.used / memory_info.total, 1)
    return NA
|
|
|
|
def memory_usage(self) -> str:  # string of used memory over total memory (in human readable)
    """The used memory over total memory in human readable format.

    Returns: str
        The used memory over total memory in human readable format, or :const:`'N/A / N/A'` when not applicable.
    """  # pylint: disable=line-too-long
    return f'{self.memory_used_human()} / {self.memory_total_human()}'
|
|
|
|
@memoize_when_activated
def bar1_memory_info(self) -> MemoryInfo:  # in bytes
    """Return a named tuple with BAR1 memory information (in bytes) for the device.

    Returns: MemoryInfo(total, free, used)
        A named tuple with BAR1 memory information, the item could be :const:`nvitop.NA` when not applicable.
    """  # pylint: disable=line-too-long
    memory_info = libnvml.nvmlQuery('nvmlDeviceGetBAR1MemoryInfo', self.handle)
    if libnvml.nvmlCheckReturn(memory_info):
        # Map the `bar1*` fields of the query result onto the common MemoryInfo layout.
        return MemoryInfo(
            total=memory_info.bar1Total,
            free=memory_info.bar1Free,
            used=memory_info.bar1Used,
        )
    return MemoryInfo(total=NA, free=NA, used=NA)
|
|
|
|
def bar1_memory_total(self) -> int | NaType:  # in bytes
    """Total BAR1 memory in bytes.

    Returns: Union[int, NaType]
        Total BAR1 memory in bytes, or :const:`nvitop.NA` when not applicable.
    """
    return self.bar1_memory_info().total
|
|
|
|
def bar1_memory_used(self) -> int | NaType:  # in bytes
    """Total used BAR1 memory in bytes.

    Returns: Union[int, NaType]
        Total used BAR1 memory in bytes, or :const:`nvitop.NA` when not applicable.
    """
    return self.bar1_memory_info().used
|
|
|
|
def bar1_memory_free(self) -> int | NaType:  # in bytes
    """Total free BAR1 memory in bytes.

    Returns: Union[int, NaType]
        Total free BAR1 memory in bytes, or :const:`nvitop.NA` when not applicable.
    """
    return self.bar1_memory_info().free
|
|
|
|
def bar1_memory_total_human(self) -> str | NaType:  # in human readable
    """Total BAR1 memory in human readable format.

    Returns: Union[str, NaType]
        Total BAR1 memory in human readable format, or :const:`nvitop.NA` when not applicable.
    """
    return bytes2human(self.bar1_memory_total())
|
|
|
|
def bar1_memory_used_human(self) -> str | NaType:  # in human readable
    """Total used BAR1 memory in human readable format.

    Returns: Union[str, NaType]
        Total used BAR1 memory in human readable format, or :const:`nvitop.NA` when not applicable.
    """
    return bytes2human(self.bar1_memory_used())
|
|
|
|
def bar1_memory_free_human(self) -> str | NaType:  # in human readable
    """Total free BAR1 memory in human readable format.

    Returns: Union[str, NaType]
        Total free BAR1 memory in human readable format, or :const:`nvitop.NA` when not applicable.
    """
    return bytes2human(self.bar1_memory_free())
|
|
|
|
def bar1_memory_percent(self) -> float | NaType:  # in percentage
    """The percentage of used BAR1 memory over total BAR1 memory (0 <= p <= 100).

    Returns: Union[float, NaType]
        The percentage of used BAR1 memory over total BAR1 memory, or :const:`nvitop.NA` when not applicable
        (including when the reported total is zero).
    """  # pylint: disable=line-too-long
    memory_info = self.bar1_memory_info()
    if (
        libnvml.nvmlCheckReturn(memory_info.used, int)
        and libnvml.nvmlCheckReturn(memory_info.total, int)
        and memory_info.total > 0  # guard the division below against a zero total
    ):
        return round(100.0 * memory_info.used / memory_info.total, 1)
    return NA
|
|
|
|
def bar1_memory_usage(self) -> str:  # in human readable
    """The used BAR1 memory over total BAR1 memory in human readable format.

    Returns: str
        The used BAR1 memory over total BAR1 memory in human readable format, or :const:`'N/A / N/A'` when not applicable.
    """  # pylint: disable=line-too-long
    return f'{self.bar1_memory_used_human()} / {self.bar1_memory_total_human()}'
|
|
|
|
@memoize_when_activated
def utilization_rates(self) -> UtilizationRates:  # in percentage
    """Return a named tuple with GPU utilization rates (in percentage) for the device.

    Returns: UtilizationRates(gpu, memory, encoder, decoder)
        A named tuple with GPU utilization rates (in percentage) for the device, the item could be :const:`nvitop.NA` when not applicable.
    """  # pylint: disable=line-too-long
    # Every field starts as N/A and is only overwritten by a query that passes its check.
    gpu, memory, encoder, decoder = NA, NA, NA, NA

    utilization_rates = libnvml.nvmlQuery('nvmlDeviceGetUtilizationRates', self.handle)
    if libnvml.nvmlCheckReturn(utilization_rates):
        gpu, memory = utilization_rates.gpu, utilization_rates.memory

    # The encoder/decoder queries are expected to yield a list; only its first item is used.
    encoder_utilization = libnvml.nvmlQuery('nvmlDeviceGetEncoderUtilization', self.handle)
    if libnvml.nvmlCheckReturn(encoder_utilization, list) and len(encoder_utilization) > 0:
        encoder = encoder_utilization[0]

    decoder_utilization = libnvml.nvmlQuery('nvmlDeviceGetDecoderUtilization', self.handle)
    if libnvml.nvmlCheckReturn(decoder_utilization, list) and len(decoder_utilization) > 0:
        decoder = decoder_utilization[0]

    return UtilizationRates(gpu=gpu, memory=memory, encoder=encoder, decoder=decoder)
|
|
|
|
def gpu_utilization(self) -> int | NaType:  # in percentage
    """Percent of time over the past sample period during which one or more kernels was executing on the GPU.

    The sample period may be between 1 second and 1/6 second depending on the product.

    Returns: Union[int, NaType]
        The GPU utilization rate in percentage, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=utilization.gpu
    """
    return self.utilization_rates().gpu

gpu_percent = gpu_utilization  # in percentage
|
|
|
|
def memory_utilization(self) -> float | NaType:  # in percentage
    """Percent of time over the past sample period during which global (device) memory was being read or written.

    The sample period may be between 1 second and 1/6 second depending on the product.

    Returns: Union[int, NaType]
        The memory bandwidth utilization rate of the GPU in percentage, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=utilization.memory
    """  # pylint: disable=line-too-long
    return self.utilization_rates().memory
|
|
|
|
def encoder_utilization(self) -> float | NaType:  # in percentage
    """The encoder utilization rate in percentage.

    Returns: Union[int, NaType]
        The encoder utilization rate in percentage, or :const:`nvitop.NA` when not applicable.
    """
    return self.utilization_rates().encoder
|
|
|
|
def decoder_utilization(self) -> float | NaType:  # in percentage
    """The decoder utilization rate in percentage.

    Returns: Union[int, NaType]
        The decoder utilization rate in percentage, or :const:`nvitop.NA` when not applicable.
    """
    return self.utilization_rates().decoder
|
|
|
|
@memoize_when_activated
def clock_infos(self) -> ClockInfos:  # in MHz
    """Return a named tuple with current clock speeds (in MHz) for the device.

    Returns: ClockInfos(graphics, sm, memory, video)
        A named tuple with current clock speeds (in MHz) for the device, the item could be :const:`nvitop.NA` when not applicable.
    """  # pylint: disable=line-too-long
    # One `nvmlDeviceGetClockInfo` query per clock domain.
    return ClockInfos(
        graphics=libnvml.nvmlQuery(
            'nvmlDeviceGetClockInfo',
            self.handle,
            libnvml.NVML_CLOCK_GRAPHICS,
        ),
        sm=libnvml.nvmlQuery('nvmlDeviceGetClockInfo', self.handle, libnvml.NVML_CLOCK_SM),
        memory=libnvml.nvmlQuery('nvmlDeviceGetClockInfo', self.handle, libnvml.NVML_CLOCK_MEM),
        video=libnvml.nvmlQuery(
            'nvmlDeviceGetClockInfo',
            self.handle,
            libnvml.NVML_CLOCK_VIDEO,
        ),
    )

clocks = clock_infos
|
|
|
|
@memoize_when_activated
def max_clock_infos(self) -> ClockInfos:  # in MHz
    """Return a named tuple with maximum clock speeds (in MHz) for the device.

    Returns: ClockInfos(graphics, sm, memory, video)
        A named tuple with maximum clock speeds (in MHz) for the device, the item could be :const:`nvitop.NA` when not applicable.
    """  # pylint: disable=line-too-long
    clock_infos = self._max_clock_infos._asdict()
    for name, clock in clock_infos.items():
        # Only fields that are still N/A are queried; known values are reused.
        if clock is NA:
            # Field names map onto NVML constants, e.g. 'memory' -> NVML_CLOCK_MEM.
            clock_type = getattr(
                libnvml,
                f"NVML_CLOCK_{name.replace('memory', 'mem').upper()}",
            )
            clock = libnvml.nvmlQuery('nvmlDeviceGetMaxClockInfo', self.handle, clock_type)
            clock_infos[name] = clock  # replaces a value only, keys are unchanged
    self._max_clock_infos = ClockInfos(**clock_infos)
    return self._max_clock_infos

max_clocks = max_clock_infos
|
|
|
|
def clock_speed_infos(self) -> ClockSpeedInfos:  # in MHz
    """Return a named tuple with the current and the maximum clock speeds (in MHz) for the device.

    Returns: ClockSpeedInfos(current, max)
        A named tuple with the current and the maximum clock speeds (in MHz) for the device.
    """
    return ClockSpeedInfos(current=self.clock_infos(), max=self.max_clock_infos())
|
|
|
|
def graphics_clock(self) -> int | NaType:  # in MHz
    """Current frequency of graphics (shader) clock in MHz.

    Returns: Union[int, NaType]
        The current frequency of graphics (shader) clock in MHz, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=clocks.current.graphics
    """  # pylint: disable=line-too-long
    return self.clock_infos().graphics
|
|
|
|
def sm_clock(self) -> int | NaType:  # in MHz
    """Current frequency of SM (Streaming Multiprocessor) clock in MHz.

    Returns: Union[int, NaType]
        The current frequency of SM (Streaming Multiprocessor) clock in MHz, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=clocks.current.sm
    """  # pylint: disable=line-too-long
    return self.clock_infos().sm
|
|
|
|
def memory_clock(self) -> int | NaType:  # in MHz
    """Current frequency of memory clock in MHz.

    Returns: Union[int, NaType]
        The current frequency of memory clock in MHz, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=clocks.current.memory
    """
    return self.clock_infos().memory
|
|
|
|
def video_clock(self) -> int | NaType:  # in MHz
    """Current frequency of video encoder/decoder clock in MHz.

    Returns: Union[int, NaType]
        The current frequency of video encoder/decoder clock in MHz, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=clocks.current.video
    """  # pylint: disable=line-too-long
    return self.clock_infos().video
|
|
|
|
def max_graphics_clock(self) -> int | NaType:  # in MHz
    """Maximum frequency of graphics (shader) clock in MHz.

    Returns: Union[int, NaType]
        The maximum frequency of graphics (shader) clock in MHz, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=clocks.max.graphics
    """  # pylint: disable=line-too-long
    return self.max_clock_infos().graphics
|
|
|
|
def max_sm_clock(self) -> int | NaType:  # in MHz
    """Maximum frequency of SM (Streaming Multiprocessor) clock in MHz.

    Returns: Union[int, NaType]
        The maximum frequency of SM (Streaming Multiprocessor) clock in MHz, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=clocks.max.sm
    """  # pylint: disable=line-too-long
    return self.max_clock_infos().sm
|
|
|
|
def max_memory_clock(self) -> int | NaType:  # in MHz
    """Maximum frequency of memory clock in MHz.

    Returns: Union[int, NaType]
        The maximum frequency of memory clock in MHz, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=clocks.max.memory
    """
    return self.max_clock_infos().memory
|
|
|
|
def max_video_clock(self) -> int | NaType:  # in MHz
    """Maximum frequency of video encoder/decoder clock in MHz.

    Returns: Union[int, NaType]
        The maximum frequency of video encoder/decoder clock in MHz, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=clocks.max.video
    """  # pylint: disable=line-too-long
    return self.max_clock_infos().video
|
|
|
|
def fan_speed(self) -> int | NaType:  # in percentage
    """The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at.

    This value may exceed 100% in certain cases. Note: The reported speed is the intended fan
    speed. If the fan is physically blocked and unable to spin, this output will not match the
    actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans
    in the surrounding enclosure.

    Returns: Union[int, NaType]
        The fan speed value in percentage, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=fan.speed
    """  # pylint: disable=line-too-long
    return libnvml.nvmlQuery('nvmlDeviceGetFanSpeed', self.handle)
|
|
|
|
def temperature(self) -> int | NaType:  # in Celsius
    """Core GPU temperature in degrees C.

    Returns: Union[int, NaType]
        The core GPU temperature in Celsius degrees, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=temperature.gpu
    """
    return libnvml.nvmlQuery(
        'nvmlDeviceGetTemperature',
        self.handle,
        libnvml.NVML_TEMPERATURE_GPU,
    )
|
|
|
|
@memoize_when_activated
def power_usage(self) -> int | NaType:  # in milliwatts (mW)
    """The last measured power draw for the entire board in milliwatts.

    Returns: Union[int, NaType]
        The power draw for the entire board in milliwatts, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        $(( "$(nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=power.draw)" * 1000 ))
    """
    return libnvml.nvmlQuery('nvmlDeviceGetPowerUsage', self.handle)

power_draw = power_usage  # in milliwatts (mW)
|
|
|
|
@memoize_when_activated
def power_limit(self) -> int | NaType:  # in milliwatts (mW)
    """The software power limit in milliwatts.

    Set by software like nvidia-smi.

    Returns: Union[int, NaType]
        The software power limit in milliwatts, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        $(( "$(nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=power.limit)" * 1000 ))
    """
    return libnvml.nvmlQuery('nvmlDeviceGetPowerManagementLimit', self.handle)
|
|
|
|
def power_status(self) -> str:  # string of power usage over power limit in watts (W)
    """The string of power usage over power limit in watts.

    Returns: str
        The string of power usage over power limit in watts, or :const:`'N/A / N/A'` when not applicable.
    """  # pylint: disable=line-too-long
    power_usage = self.power_usage()
    power_limit = self.power_limit()
    # Integer readings are in milliwatts: convert to whole watts and append the unit.
    # Anything else is interpolated into the result unchanged.
    if libnvml.nvmlCheckReturn(power_usage, int):
        power_usage = f'{round(power_usage / 1000.0)}W'
    if libnvml.nvmlCheckReturn(power_limit, int):
        power_limit = f'{round(power_limit / 1000.0)}W'
    return f'{power_usage} / {power_limit}'
|
|
|
|
def display_active(self) -> str | NaType:
    """A flag that indicates whether a display is initialized on the GPU (e.g. memory is allocated on the device for display).

    Display can be active even when no monitor is physically attached. "Enabled" indicates an
    active display. "Disabled" indicates otherwise.

    Returns: Union[str, NaType]
        - :const:`'Disabled'`: if not an active display device.
        - :const:`'Enabled'`: if an active display device.
        - :const:`nvitop.NA`: if not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=display_active
    """  # pylint: disable=line-too-long
    # Any query result other than 0 or 1 falls through to N/A.
    return {0: 'Disabled', 1: 'Enabled'}.get(
        libnvml.nvmlQuery('nvmlDeviceGetDisplayActive', self.handle),
        NA,
    )
|
|
|
|
def display_mode(self) -> str | NaType:
    """A flag that indicates whether a physical display (e.g. monitor) is currently connected to any of the GPU's connectors.

    "Enabled" indicates an attached display. "Disabled" indicates otherwise.

    Returns: Union[str, NaType]
        - :const:`'Disabled'`: if the display mode is disabled.
        - :const:`'Enabled'`: if the display mode is enabled.
        - :const:`nvitop.NA`: if not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=display_mode
    """  # pylint: disable=line-too-long
    # Any query result other than 0 or 1 falls through to N/A.
    return {0: 'Disabled', 1: 'Enabled'}.get(
        libnvml.nvmlQuery('nvmlDeviceGetDisplayMode', self.handle),
        NA,
    )
|
|
|
|
def current_driver_model(self) -> str | NaType:
    """The driver model currently in use.

    Always "N/A" on Linux. On Windows, the TCC (WDM) and WDDM driver models are supported. The
    TCC driver model is optimized for compute applications. I.E. kernel launch times will be
    quicker with TCC. The WDDM driver model is designed for graphics applications and is not
    recommended for compute applications. Linux does not support multiple driver models, and
    will always have the value of "N/A".

    Returns: Union[str, NaType]
        - :const:`'WDDM'`: for WDDM driver model on Windows.
        - :const:`'WDM'`: for TCC (WDM) driver model on Windows.
        - :const:`nvitop.NA`: if not applicable, e.g. on Linux.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=driver_model.current
    """
    # Any query result that is not one of the two known constants falls through to N/A.
    return {libnvml.NVML_DRIVER_WDDM: 'WDDM', libnvml.NVML_DRIVER_WDM: 'WDM'}.get(
        libnvml.nvmlQuery('nvmlDeviceGetCurrentDriverModel', self.handle),
        NA,
    )

driver_model = current_driver_model
|
|
|
|
def persistence_mode(self) -> str | NaType:
    """A flag that indicates whether persistence mode is enabled for the GPU. Value is either "Enabled" or "Disabled".

    When persistence mode is enabled the NVIDIA driver remains loaded even when no active
    clients, such as X11 or nvidia-smi, exist. This minimizes the driver load latency associated
    with running dependent apps, such as CUDA programs. Linux only.

    Returns: Union[str, NaType]
        - :const:`'Disabled'`: if the persistence mode is disabled.
        - :const:`'Enabled'`: if the persistence mode is enabled.
        - :const:`nvitop.NA`: if not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=persistence_mode
    """  # pylint: disable=line-too-long
    # Any query result other than 0 or 1 falls through to N/A.
    return {0: 'Disabled', 1: 'Enabled'}.get(
        libnvml.nvmlQuery('nvmlDeviceGetPersistenceMode', self.handle),
        NA,
    )
|
|
|
|
def performance_state(self) -> str | NaType:
    """The current performance state for the GPU. States range from P0 (maximum performance) to P12 (minimum performance).

    Returns: Union[str, NaType]
        The current performance state in format ``P<int>``, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=pstate
    """  # pylint: disable=line-too-long
    performance_state = libnvml.nvmlQuery('nvmlDeviceGetPerformanceState', self.handle)
    # Only an integer result gets the 'P' prefix; anything else is returned as queried.
    if libnvml.nvmlCheckReturn(performance_state, int):
        performance_state = f'P{performance_state}'
    return performance_state
|
|
|
|
def total_volatile_uncorrected_ecc_errors(self) -> int | NaType:
    """Total errors detected across entire chip.

    Returns: Union[int, NaType]
        The total number of uncorrected errors in volatile ECC memory, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=ecc.errors.uncorrected.volatile.total
    """  # pylint: disable=line-too-long
    return libnvml.nvmlQuery(
        'nvmlDeviceGetTotalEccErrors',
        self.handle,
        libnvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
        libnvml.NVML_VOLATILE_ECC,
    )
|
|
|
|
def compute_mode(self) -> str | NaType:
    """The compute mode flag indicates whether individual or multiple compute applications may run on the GPU.

    Returns: Union[str, NaType]
        - :const:`'Default'`: means multiple contexts are allowed per device.
        - :const:`'Exclusive Thread'`: deprecated, use Exclusive Process instead
        - :const:`'Prohibited'`: means no contexts are allowed per device (no compute apps).
        - :const:`'Exclusive Process'`: means only one context is allowed per device, usable from multiple threads at a time.
        - :const:`nvitop.NA`: if not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=compute_mode
    """  # pylint: disable=line-too-long
    # Any query result that is not one of the four known constants falls through to N/A.
    return {
        libnvml.NVML_COMPUTEMODE_DEFAULT: 'Default',
        libnvml.NVML_COMPUTEMODE_EXCLUSIVE_THREAD: 'Exclusive Thread',
        libnvml.NVML_COMPUTEMODE_PROHIBITED: 'Prohibited',
        libnvml.NVML_COMPUTEMODE_EXCLUSIVE_PROCESS: 'Exclusive Process',
    }.get(libnvml.nvmlQuery('nvmlDeviceGetComputeMode', self.handle), NA)
|
|
|
|
def cuda_compute_capability(self) -> tuple[int, int] | NaType:
    """The CUDA compute capability for the device.

    Returns: Union[Tuple[int, int], NaType]
        The CUDA compute capability version in format ``(major, minor)``, or :const:`nvitop.NA` when not applicable.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=compute_cap
    """
    # Query once; afterwards the value stored on the instance is returned.
    if self._cuda_compute_capability is None:
        self._cuda_compute_capability = libnvml.nvmlQuery(
            'nvmlDeviceGetCudaComputeCapability',
            self.handle,
        )
    return self._cuda_compute_capability
|
|
|
|
def is_mig_device(self) -> bool:
    """Return whether or not the device is a MIG device."""
    # Query once; afterwards the flag stored on the instance is returned.
    if self._is_mig_device is None:
        is_mig_device = libnvml.nvmlQuery(
            'nvmlDeviceIsMigDeviceHandle',
            self.handle,
            default=False,
            ignore_function_not_found=True,
        )
        self._is_mig_device = bool(is_mig_device)  # nvmlDeviceIsMigDeviceHandle returns c_uint
    return self._is_mig_device
|
|
|
|
def mig_mode(self) -> str | NaType:
    """The MIG mode that the GPU is currently operating under.

    Returns: Union[str, NaType]
        - :const:`'Disabled'`: if the MIG mode is disabled.
        - :const:`'Enabled'`: if the MIG mode is enabled.
        - :const:`nvitop.NA`: if not applicable, e.g. the GPU does not support MIG mode.

    Command line equivalent:

    .. code:: bash

        nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=mig.mode.current
    """
    # A MIG device is reported as N/A without querying the mode.
    if self.is_mig_device():
        return NA

    # Only the first item of the query result is used; the remaining items are discarded.
    mig_mode, *_ = libnvml.nvmlQuery(
        'nvmlDeviceGetMigMode',
        self.handle,
        default=(NA, NA),
        ignore_function_not_found=True,
    )
    return {0: 'Disabled', 1: 'Enabled'}.get(mig_mode, NA)
|
|
|
|
def is_mig_mode_enabled(self) -> bool:
    """Test whether the MIG mode is enabled on the device.

    Return :data:`False` if MIG mode is disabled or the device does not support MIG mode.
    """
    return boolify(self.mig_mode())
|
|
|
|
def max_mig_device_count(self) -> int:
    """Return the maximum number of MIG instances the device supports.

    This method will return 0 if the device does not support MIG mode.
    """
    return 0  # implemented in PhysicalDevice
|
|
|
|
def mig_devices(self) -> list[MigDevice]:
    """Return a list of children MIG devices of the current device.

    This method will return an empty list if the MIG mode is disabled or the device does not
    support MIG mode.
    """
    return []  # implemented in PhysicalDevice
|
|
|
|
def is_leaf_device(self) -> bool:
    """Test whether the device is a physical device with MIG mode disabled or a MIG device.

    Return :data:`True` if the device is a physical device with MIG mode disabled or a MIG device.
    Otherwise, return :data:`False` if the device is a physical device with MIG mode enabled.
    """
    return self.is_mig_device() or not self.is_mig_mode_enabled()
|
|
|
|
def to_leaf_devices(self) -> list[PhysicalDevice | MigDevice | CudaDevice]:
    """Return a list of leaf devices.

    Note that a CUDA device is always a leaf device.
    """
    if isinstance(self, CudaDevice) or self.is_leaf_device():
        return [self]
    # Not a leaf device: return its child MIG devices instead.
    return self.mig_devices()
|
|
|
|
def processes(self) -> dict[int, GpuProcess]:
    """Return a dictionary of processes running on the GPU.

    Returns: Dict[int, GpuProcess]
        A dictionary mapping PID to GPU process instance.
    """
    processes = {}

    found_na = False
    for process_type, func in (
        ('C', 'nvmlDeviceGetComputeRunningProcesses'),
        ('G', 'nvmlDeviceGetGraphicsRunningProcesses'),
    ):
        for p in libnvml.nvmlQuery(func, self.handle, default=()):
            if isinstance(p.usedGpuMemory, int):
                gpu_memory = p.usedGpuMemory
            else:
                # Used GPU memory is `N/A` on Windows Display Driver Model (WDDM)
                # or on MIG-enabled GPUs
                gpu_memory = NA
                found_na = True
            proc = processes[p.pid] = self.GPU_PROCESS_CLASS(
                pid=p.pid,
                device=self,
                gpu_memory=gpu_memory,
                gpu_instance_id=getattr(p, 'gpuInstanceId', 0xFFFFFFFF),
                compute_instance_id=getattr(p, 'computeInstanceId', 0xFFFFFFFF),
            )
            # Append the type letter ('C' or 'G') to whatever the process object already holds.
            proc.type = proc.type + process_type

    if len(processes) > 0:
        samples = libnvml.nvmlQuery(
            'nvmlDeviceGetProcessUtilization',
            self.handle,
            self._timestamp,
            default=(),
        )
        # Next query starts 2_000_000 timestamp units before the oldest sample seen here,
        # clamped at 0 (also 0 when there were no samples).
        self._timestamp = max(min((s.timeStamp for s in samples), default=0) - 2_000_000, 0)
        for s in samples:
            try:
                processes[s.pid].set_gpu_utilization(s.smUtil, s.memUtil, s.encUtil, s.decUtil)
            except KeyError:
                # Sample for a PID that is not in the running-process lists: ignore it.
                pass
        if not found_na:
            # Processes without any utilization sample are reported as zero utilization.
            for pid in set(processes).difference(s.pid for s in samples):
                processes[pid].set_gpu_utilization(0, 0, 0, 0)

    return processes
|
|
|
|
def as_snapshot(self) -> Snapshot:
    """Return a onetime snapshot of the device.

    The attributes are defined in :attr:`SNAPSHOT_KEYS`.
    """
    # All getters are called inside a single `oneshot()` context.
    with self.oneshot():
        return Snapshot(
            real=self,
            index=self.index,
            physical_index=self.physical_index,
            **{key: getattr(self, key)() for key in self.SNAPSHOT_KEYS},
        )
|
|
|
|
# Names of the zero-argument methods whose results `as_snapshot()` stores in the snapshot.
SNAPSHOT_KEYS = [
    'name',
    'uuid',
    'bus_id',
    'memory_info',
    'memory_used',
    'memory_free',
    'memory_total',
    'memory_used_human',
    'memory_free_human',
    'memory_total_human',
    'memory_percent',
    'memory_usage',
    'utilization_rates',
    'gpu_utilization',
    'memory_utilization',
    'encoder_utilization',
    'decoder_utilization',
    'clock_infos',
    'max_clock_infos',
    'clock_speed_infos',
    'sm_clock',
    'memory_clock',
    'fan_speed',
    'temperature',
    'power_usage',
    'power_limit',
    'power_status',
    'display_active',
    'display_mode',
    'current_driver_model',
    'persistence_mode',
    'performance_state',
    'total_volatile_uncorrected_ecc_errors',
    'compute_mode',
    'cuda_compute_capability',
    'mig_mode',
]
|
|
|
|
# Modified from psutil (https://github.com/giampaolo/psutil)
@contextlib.contextmanager
def oneshot(self) -> contextlib.AbstractContextManager:
    """A utility context manager which considerably speeds up the retrieval of multiple device information at the same time.

    Internally different device info (e.g. memory_info, utilization_rates, ...) may be fetched
    by using the same routine, but only one information is returned and the others are discarded.
    When using this context manager the internal routine is executed once (in the example below
    on memory_info()) and the other info are cached.

    The cache is cleared when exiting the context manager block. The advice is to use this every
    time you retrieve more than one information about the device.

    Examples:
        >>> from nvitop import Device
        >>> device = Device(0)
        >>> with device.oneshot():
        ...     device.memory_info()        # collect multiple info
        ...     device.memory_used()        # return cached value
        ...     device.memory_free_human()  # return cached value
        ...     device.memory_percent()     # return cached value
    """  # pylint: disable=line-too-long
    # NOTE: `self._lock` stays held for the whole body of the caller's `with` block.
    with self._lock:
        # pylint: disable=no-member
        if hasattr(self, '_cache'):
            # NOOP: this covers the use case where the user enters the
            # context twice:
            #
            # >>> with device.oneshot():
            # ...    with device.oneshot():
            # ...
            #
            # Also, since as_snapshot() internally uses oneshot()
            # I expect that the code below will be a pretty common
            # "mistake" that the user will make, so let's guard
            # against that:
            #
            # >>> with device.oneshot():
            # ...     device.as_snapshot()
            # ...
            yield
        else:
            try:
                self.memory_info.cache_activate(self)
                self.bar1_memory_info.cache_activate(self)
                self.utilization_rates.cache_activate(self)
                self.clock_infos.cache_activate(self)
                self.max_clock_infos.cache_activate(self)
                self.power_usage.cache_activate(self)
                self.power_limit.cache_activate(self)
                yield
            finally:
                # Always deactivate every cache, even if the block (or an activation) raised.
                self.memory_info.cache_deactivate(self)
                self.bar1_memory_info.cache_deactivate(self)
                self.utilization_rates.cache_deactivate(self)
                self.clock_infos.cache_deactivate(self)
                self.max_clock_infos.cache_deactivate(self)
                self.power_usage.cache_deactivate(self)
                self.power_limit.cache_deactivate(self)
|
|
|
|
|
|
class PhysicalDevice(Device):
    """Class for physical devices.

    This is the real GPU installed in the system.
    """

    @property
    def physical_index(self) -> int:
        """Zero-based index of the GPU. The value can change at each boot.

        Command line equivalent:

        .. code:: bash

            nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=index
        """
        return self._nvml_index

    def max_mig_device_count(self) -> int:
        """Return the maximum number of MIG instances the device supports.

        The NVML query falls back to 0 (``default=0``), so this method returns 0 if the device
        does not support MIG mode.
        """
        query_options = {'default': 0, 'ignore_function_not_found': True}
        return libnvml.nvmlQuery('nvmlDeviceGetMaxMigDeviceCount', self.handle, **query_options)

    def mig_device(self, mig_index: int) -> MigDevice:
        """Return the child MIG device at the given MIG index.

        Raises:
            libnvml.NVMLError:
                If the device does not support MIG mode or the given MIG device does not exist.
        """
        # Publish ``self`` as the global physical device while the child is being constructed,
        # so that ``MigDevice.__init__`` can pick it up as the parent.
        with _global_physical_device(self):
            child = MigDevice(index=(self.index, mig_index))
        return child

    def mig_devices(self) -> list[MigDevice]:
        """Return the list of child MIG devices of this device.

        The list is empty if MIG mode is disabled or the device does not support MIG mode.
        """
        children = []
        if not self.is_mig_mode_enabled():
            return children

        capacity = self.max_mig_device_count()
        with _global_physical_device(self):
            for child_index in range(capacity):
                try:
                    children.append(MigDevice(index=(self.index, child_index)))
                except libnvml.NVMLError:
                    # Stop enumerating at the first index that cannot be instantiated
                    break

        return children
|
|
|
|
|
|
class MigDevice(Device):  # pylint: disable=too-many-instance-attributes
    """Class for MIG devices."""

    @classmethod
    def count(cls) -> int:
        """The total number of MIG devices aggregated over all physical devices."""
        return len(cls.all())

    @classmethod
    def all(cls) -> list[MigDevice]:
        """Return one flat list with the MIG devices of every physical device."""
        return [
            mig_device
            for physical_device in PhysicalDevice.all()
            for mig_device in physical_device.mig_devices()
        ]

    @classmethod
    def from_indices(  # pylint: disable=signature-differs
        cls,
        indices: Iterable[tuple[int, int]],
    ) -> list[MigDevice]:
        """Return a list of MIG devices of the given indices.

        Args:
            indices (Iterable[Tuple[int, int]]):
                Indices of the MIG devices. Each index is a tuple of two integers.

        Returns: List[MigDevice]
            A list of :class:`MigDevice` instances, one per given index, in the given order.

        Raises:
            libnvml.NVMLError_LibraryNotFound:
                If cannot find the NVML library, usually the NVIDIA driver is not installed.
            libnvml.NVMLError_DriverNotLoaded:
                If NVIDIA driver is not loaded.
            libnvml.NVMLError_LibRmVersionMismatch:
                If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
                driver without reloading the kernel module.
            libnvml.NVMLError_NotFound:
                If the device is not found for the given NVML identifier.
        """
        return [cls(index) for index in indices]

    # pylint: disable-next=super-init-not-called
    def __init__(
        self,
        index: tuple[int, int] | str | None = None,
        *,
        uuid: str | None = None,
    ) -> None:
        """Initialize the instance created by :meth:`__new__()`.

        The device is located either by its ``(physical_index, mig_index)`` tuple or by its UUID.
        A string passed as ``index`` that matches :attr:`UUID_PATTERN` is treated as a UUID.

        Raises:
            libnvml.NVMLError_LibraryNotFound:
                If cannot find the NVML library, usually the NVIDIA driver is not installed.
            libnvml.NVMLError_DriverNotLoaded:
                If NVIDIA driver is not loaded.
            libnvml.NVMLError_LibRmVersionMismatch:
                If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
                driver without reloading the kernel module.
            libnvml.NVMLError_NotFound:
                If the device is not found for the given NVML identifier.
        """
        if isinstance(index, str) and self.UUID_PATTERN.match(index) is not None:  # passed by UUID
            uuid = index
            index = None

        # String arguments are encoded to bytes before they are used below
        if isinstance(index, str):
            index = index.encode()
        if isinstance(uuid, str):
            uuid = uuid.encode()

        # Lazily-filled attribute caches
        self._name = NA
        self._uuid = NA
        self._bus_id = NA
        self._memory_total = NA
        self._memory_total_human = NA
        self._gpu_instance_id = NA
        self._compute_instance_id = NA
        self._is_mig_device = True
        self._cuda_index = None
        self._cuda_compute_capability = None

        if index is not None:
            self._nvml_index = index
            self._handle = None

            # Reuse the globally published physical device as the parent only when it has a
            # handle and refers to the same physical GPU; otherwise create a fresh parent.
            candidate = _get_global_physical_device()
            reusable = (
                candidate is not None
                and candidate.handle is not None
                and candidate.physical_index == self.physical_index
            )
            self._parent = candidate if reusable else PhysicalDevice(index=self.physical_index)

            if self.parent.handle is not None:
                try:
                    self._handle = libnvml.nvmlQuery(
                        'nvmlDeviceGetMigDeviceHandleByIndex',
                        self.parent.handle,
                        self.mig_index,
                        ignore_errors=False,
                    )
                except libnvml.NVMLError_GpuIsLost:
                    pass  # keep ``self._handle`` as None for a lost GPU
        else:
            self._handle = libnvml.nvmlQuery('nvmlDeviceGetHandleByUUID', uuid, ignore_errors=False)
            parent_handle = libnvml.nvmlQuery(
                'nvmlDeviceGetDeviceHandleFromMigDeviceHandle',
                self.handle,
                ignore_errors=False,
            )
            parent_index = libnvml.nvmlQuery(
                'nvmlDeviceGetIndex',
                parent_handle,
                ignore_errors=False,
            )
            self._parent = PhysicalDevice(index=parent_index)

            # Recover our own (physical_index, mig_index) by matching UUIDs among the siblings
            matching_sibling = next(
                (
                    sibling
                    for sibling in self.parent.mig_devices()
                    if self.uuid() == sibling.uuid()
                ),
                None,
            )
            if matching_sibling is None:
                raise libnvml.NVMLError_NotFound
            self._nvml_index = matching_sibling.index

        self._max_clock_infos = ClockInfos(graphics=NA, sm=NA, memory=NA, video=NA)
        self._timestamp = 0
        self._lock = threading.RLock()

        self._ident = (self.index, self.uuid())
        self._hash = None

    @property
    def index(self) -> tuple[int, int]:
        """The index of the MIG device, a ``(physical_index, mig_index)`` tuple of two integers."""
        return self._nvml_index

    @property
    def physical_index(self) -> int:
        """The index of the parent physical device (first element of :attr:`index`)."""
        return self._nvml_index[0]

    @property
    def mig_index(self) -> int:
        """The index of the MIG device among the MIG devices of its parent (second element of :attr:`index`)."""
        return self._nvml_index[1]

    @property
    def parent(self) -> PhysicalDevice:
        """The parent physical device."""
        return self._parent

    def gpu_instance_id(self) -> int | NaType:
        """The GPU instance ID of the MIG device.

        The value is queried on first use and then served from the instance cache.

        Returns: Union[int, NaType]
            The GPU instance ID of the MIG device, or :const:`nvitop.NA` when not applicable.
        """
        if self._gpu_instance_id is not NA:
            return self._gpu_instance_id

        queried = libnvml.nvmlQuery(
            'nvmlDeviceGetGpuInstanceId',
            self.handle,
            default=0xFFFFFFFF,
        )
        # 0xFFFFFFFF is the fallback passed as ``default`` above; report it as not applicable
        self._gpu_instance_id = NA if queried == 0xFFFFFFFF else queried
        return self._gpu_instance_id

    def compute_instance_id(self) -> int | NaType:
        """The compute instance ID of the MIG device.

        The value is queried on first use and then served from the instance cache.

        Returns: Union[int, NaType]
            The compute instance ID of the MIG device, or :const:`nvitop.NA` when not applicable.
        """
        if self._compute_instance_id is not NA:
            return self._compute_instance_id

        queried = libnvml.nvmlQuery(
            'nvmlDeviceGetComputeInstanceId',
            self.handle,
            default=0xFFFFFFFF,
        )
        # 0xFFFFFFFF is the fallback passed as ``default`` above; report it as not applicable
        self._compute_instance_id = NA if queried == 0xFFFFFFFF else queried
        return self._compute_instance_id

    def as_snapshot(self) -> Snapshot:
        """Return a onetime snapshot of the device, extended with the ``mig_index`` attribute.

        The attributes are defined in :attr:`SNAPSHOT_KEYS`.
        """
        result = super().as_snapshot()
        result.mig_index = self.mig_index
        return result

    # Snapshot attributes: everything from ``Device`` plus the two MIG instance identifiers
    SNAPSHOT_KEYS = list(Device.SNAPSHOT_KEYS) + ['gpu_instance_id', 'compute_instance_id']
|
|
|
|
|
|
class CudaDevice(Device):
    """Class for devices enumerated over the CUDA ordinal.

    The order can vary for different ``CUDA_VISIBLE_DEVICES`` environment variable values.

    See also for CUDA Device Enumeration:
        - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_
        - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_

    :meth:`CudaDevice.__new__()` returns different types depending on the given arguments.

    .. code-block:: python

        - (cuda_index: int)        -> Union[CudaDevice, CudaMigDevice]  # depending on `CUDA_VISIBLE_DEVICES`
        - (uuid: str)              -> Union[CudaDevice, CudaMigDevice]  # depending on `CUDA_VISIBLE_DEVICES`
        - (nvml_index: int)        -> CudaDevice
        - (nvml_index: (int, int)) -> CudaMigDevice

    Examples:
        >>> import os
        >>> os.environ['CUDA_VISIBLE_DEVICES'] = '3,2,1,0'

        >>> CudaDevice.count()                     # number of NVIDIA GPUs visible to CUDA applications
        4
        >>> Device.cuda.count()                    # use alias in class `Device`
        4

        >>> CudaDevice.all()                       # all CUDA visible devices (or `Device.cuda.all()`)
        [
            CudaDevice(cuda_index=0, nvml_index=3, ...),
            CudaDevice(cuda_index=1, nvml_index=2, ...),
            ...
        ]

        >>> cuda0 = CudaDevice(cuda_index=0)       # use CUDA ordinal (or `Device.cuda(0)`)
        >>> cuda1 = CudaDevice(nvml_index=2)       # use NVML ordinal
        >>> cuda2 = CudaDevice(uuid='GPU-xxxxxx')  # use UUID string

        >>> cuda0.memory_free()                    # total free memory in bytes
        11550654464
        >>> cuda0.memory_free_human()              # total free memory in human readable format
        '11016MiB'

        >>> cuda1.as_snapshot()                    # takes an onetime snapshot of the device
        CudaDeviceSnapshot(
            real=CudaDevice(cuda_index=1, nvml_index=2, ...),
            ...
        )

    Raises:
        libnvml.NVMLError_LibraryNotFound:
            If cannot find the NVML library, usually the NVIDIA driver is not installed.
        libnvml.NVMLError_DriverNotLoaded:
            If NVIDIA driver is not loaded.
        libnvml.NVMLError_LibRmVersionMismatch:
            If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
            driver without reloading the kernel module.
        libnvml.NVMLError_NotFound:
            If the device is not found for the given NVML identifier.
        libnvml.NVMLError_InvalidArgument:
            If the NVML index is out of range.
        TypeError:
            If the number of non-None arguments is not exactly 1.
        TypeError:
            If the given NVML index is a tuple but does not consist of two integers.
        RuntimeError:
            If the index is out of range for the given ``CUDA_VISIBLE_DEVICES`` environment variable.
    """  # pylint: disable=line-too-long

    @classmethod
    def is_available(cls) -> bool:
        """Test whether there are any CUDA-capable devices available."""
        return cls.count() > 0

    @classmethod
    def count(cls) -> int:
        """The number of GPUs visible to CUDA applications.

        Returns 0 if parsing ``CUDA_VISIBLE_DEVICES`` raises an NVML error.
        """
        try:
            return len(super().parse_cuda_visible_devices())
        except libnvml.NVMLError:
            return 0

    @classmethod
    def all(cls) -> list[CudaDevice]:
        """All CUDA visible devices.

        Note:
            The result could be empty if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid.
        """
        return cls.from_indices()

    @classmethod
    def from_indices(
        cls,
        indices: int | Iterable[int] | None = None,
    ) -> list[CudaDevice]:
        """Return a list of CUDA devices of the given CUDA indices.

        The CUDA ordinal will be enumerated from the ``CUDA_VISIBLE_DEVICES`` environment variable.

        See also for CUDA Device Enumeration:
            - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_
            - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_

        Args:
            indices (Optional[Union[int, Iterable[int]]]):
                The indices of the GPU in CUDA ordinal, if not given, returns all visible CUDA devices.

        Returns: List[CudaDevice]
            A list of :class:`CudaDevice` of the given CUDA indices.

        Raises:
            libnvml.NVMLError_LibraryNotFound:
                If cannot find the NVML library, usually the NVIDIA driver is not installed.
            libnvml.NVMLError_DriverNotLoaded:
                If NVIDIA driver is not loaded.
            libnvml.NVMLError_LibRmVersionMismatch:
                If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
                driver without reloading the kernel module.
            RuntimeError:
                If the index is out of range for the given ``CUDA_VISIBLE_DEVICES`` environment variable.
        """
        return super().from_cuda_indices(indices)

    def __new__(
        cls,
        cuda_index: int | None = None,
        *,
        nvml_index: int | tuple[int, int] | None = None,
        uuid: str | None = None,
    ) -> Device:
        """Create a new instance of CudaDevice.

        The type of the result is determined by the given argument.

        .. code-block:: python

            - (cuda_index: int)        -> Union[CudaDevice, CudaMigDevice]  # depending on `CUDA_VISIBLE_DEVICES`
            - (uuid: str)              -> Union[CudaDevice, CudaMigDevice]  # depending on `CUDA_VISIBLE_DEVICES`
            - (nvml_index: int)        -> CudaDevice
            - (nvml_index: (int, int)) -> CudaMigDevice

        Note: This method takes exactly 1 non-None argument.

        Returns: Union[CudaDevice, CudaMigDevice]
            A :class:`CudaDevice` instance or a :class:`CudaMigDevice` instance.

        Raises:
            TypeError:
                If the number of non-None arguments is not exactly 1.
            TypeError:
                If the given NVML index is a tuple but does not consist of two integers.
            RuntimeError:
                If the index is out of range for the given ``CUDA_VISIBLE_DEVICES`` environment variable.
        """
        if cuda_index is not None and nvml_index is None and uuid is None:
            # Translate the CUDA ordinal into the NVML index via `CUDA_VISIBLE_DEVICES`
            cuda_visible_devices = cls.parse_cuda_visible_devices()
            if not isinstance(cuda_index, int) or not 0 <= cuda_index < len(cuda_visible_devices):
                raise RuntimeError(f'CUDA Error: invalid device ordinal: {cuda_index!r}.')
            nvml_index = cuda_visible_devices[cuda_index]

        # `nvml_index` is still None when the device is identified by UUID only. None must not
        # be mistaken for a MIG (tuple) index, otherwise every UUID-only call would be routed to
        # `CudaMigDevice` and the MIG-UUID check below would never be consulted.
        has_mig_index = nvml_index is not None and not isinstance(nvml_index, int)
        if has_mig_index or is_mig_device_uuid(uuid):
            return super().__new__(CudaMigDevice, index=nvml_index, uuid=uuid)

        return super().__new__(cls, index=nvml_index, uuid=uuid)

    def __init__(
        self,
        cuda_index: int | None = None,
        *,
        nvml_index: int | tuple[int, int] | None = None,
        uuid: str | None = None,
    ) -> None:
        """Initialize the instance created by :meth:`__new__()`.

        Raises:
            libnvml.NVMLError_LibraryNotFound:
                If cannot find the NVML library, usually the NVIDIA driver is not installed.
            libnvml.NVMLError_DriverNotLoaded:
                If NVIDIA driver is not loaded.
            libnvml.NVMLError_LibRmVersionMismatch:
                If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
                driver without reloading the kernel module.
            libnvml.NVMLError_NotFound:
                If the device is not found for the given NVML identifier.
            libnvml.NVMLError_InvalidArgument:
                If the NVML index is out of range.
            RuntimeError:
                If the given device is not visible to CUDA applications (i.e. not listed in the
                ``CUDA_VISIBLE_DEVICES`` environment variable or the environment variable is invalid).
        """
        if cuda_index is not None and nvml_index is None and uuid is None:
            # Same CUDA ordinal -> NVML index translation as in `__new__`
            cuda_visible_devices = self.parse_cuda_visible_devices()
            if not isinstance(cuda_index, int) or not 0 <= cuda_index < len(cuda_visible_devices):
                raise RuntimeError(f'CUDA Error: invalid device ordinal: {cuda_index!r}.')
            nvml_index = cuda_visible_devices[cuda_index]

        super().__init__(index=nvml_index, uuid=uuid)

        if cuda_index is None:
            # Not given by the caller: fall back to the `cuda_index` provided by the parent class
            cuda_index = super().cuda_index
        self._cuda_index = cuda_index

        self._ident = ((self._cuda_index, self.index), self.uuid())

    def __repr__(self) -> str:
        """Return a string representation of the CUDA device."""
        return (
            f'{self.__class__.__name__}('
            f'cuda_index={self.cuda_index}, '
            f'nvml_index={self.index}, '
            f'name="{self.name()}", '
            f'total_memory={self.memory_total_human()})'
        )

    def __reduce__(self) -> tuple[type[CudaDevice], tuple[int]]:
        """Return state information for pickling: the class and the CUDA index."""
        return self.__class__, (self._cuda_index,)

    def as_snapshot(self) -> Snapshot:
        """Return a onetime snapshot of the device, extended with the ``cuda_index`` attribute.

        The attributes are defined in :attr:`SNAPSHOT_KEYS`.
        """
        snapshot = super().as_snapshot()
        snapshot.cuda_index = self.cuda_index

        return snapshot
|
|
|
|
|
|
# Attach `CudaDevice` to `Device` as the class attribute `cuda`, so that the class can be
# reached as `Device.cuda` (e.g. `Device.cuda.count()`, `Device.cuda(0)`).
Device.cuda = CudaDevice
"""Shortcut for class :class:`CudaDevice`."""
|
|
|
|
|
|
class CudaMigDevice(CudaDevice, MigDevice):
|
|
"""Class for CUDA devices that are MIG devices."""
|
|
|
|
|
|
def is_mig_device_uuid(uuid: str | None) -> bool:
    """Return :data:`True` if the argument is a MIG device UUID, otherwise, return :data:`False`.

    Non-string arguments (including :data:`None`) and strings that do not match
    :attr:`Device.UUID_PATTERN` yield :data:`False`. A matching string counts as a MIG UUID
    only when the pattern's ``MigMode`` group took part in the match.
    """
    if not isinstance(uuid, str):
        return False

    matched = Device.UUID_PATTERN.match(uuid)
    return matched is not None and matched.group('MigMode') is not None
|
|
|
|
|
|
def parse_cuda_visible_devices(
    cuda_visible_devices: str | None = _VALUE_OMITTED,
) -> list[int] | list[tuple[int, int]]:
    """Parse the given ``CUDA_VISIBLE_DEVICES`` value into a list of NVML device indices.

    This function is aliased by :meth:`Device.parse_cuda_visible_devices`.

    Note:
        The result could be empty if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid.

    See also for CUDA Device Enumeration:
        - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_
        - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_

    Args:
        cuda_visible_devices (Optional[str]):
            The value of the ``CUDA_VISIBLE_DEVICES`` variable. If not given, the value from the
            environment will be used. If explicitly given by :data:`None`, the ``CUDA_VISIBLE_DEVICES``
            environment variable will be unset before parsing.

    Returns: Union[List[int], List[Tuple[int, int]]]
        A list of int (physical device) or a list of tuple of two integers (MIG device) for the
        corresponding real device indices. The list is a fresh object on every call, so callers
        may modify it freely.

    Examples:
        >>> import os
        >>> os.environ['CUDA_VISIBLE_DEVICES'] = '6,5'
        >>> parse_cuda_visible_devices()       # parse the `CUDA_VISIBLE_DEVICES` environment variable to NVML indices
        [6, 5]

        >>> parse_cuda_visible_devices('0,4')  # pass the `CUDA_VISIBLE_DEVICES` value explicitly
        [0, 4]

        >>> parse_cuda_visible_devices('GPU-18ef14e9,GPU-849d5a8d')  # accept abbreviated UUIDs
        [5, 6]

        >>> parse_cuda_visible_devices(None)   # get all devices when the `CUDA_VISIBLE_DEVICES` environment variable unset
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

        >>> parse_cuda_visible_devices('MIG-d184f67c-c95f-5ef2-a935-195bd0094fbd')           # MIG device support (MIG UUID)
        [(0, 0)]
        >>> parse_cuda_visible_devices('MIG-GPU-3eb79704-1571-707c-aee8-f43ce747313d/13/0')  # MIG device support (GPU UUID)
        [(0, 1)]
        >>> parse_cuda_visible_devices('MIG-GPU-3eb79704/13/0')                              # MIG device support (abbreviated GPU UUID)
        [(0, 1)]

        >>> parse_cuda_visible_devices('')     # empty string
        []
        >>> parse_cuda_visible_devices('0,0')  # invalid `CUDA_VISIBLE_DEVICES` (duplicate device ordinal)
        []
        >>> parse_cuda_visible_devices('16')   # invalid `CUDA_VISIBLE_DEVICES` (device ordinal out of range)
        []
    """  # pylint: disable=line-too-long
    if cuda_visible_devices is _VALUE_OMITTED:
        cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', default=None)

    # The underlying parser is memoized with `functools.lru_cache`, which hands out the very
    # same list object on every cache hit. Return a shallow copy so that a caller mutating the
    # result cannot corrupt the cached value seen by later calls.
    return list(_parse_cuda_visible_devices(cuda_visible_devices, format='index'))
|
|
|
|
|
|
def normalize_cuda_visible_devices(cuda_visible_devices: str | None = _VALUE_OMITTED) -> str:
    """Parse the given ``CUDA_VISIBLE_DEVICES`` value and convert it into a comma-separated string of UUIDs.

    This function is aliased by :meth:`Device.normalize_cuda_visible_devices`.

    Note:
        The result could be empty string if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid.

    See also for CUDA Device Enumeration:
        - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_
        - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_

    Args:
        cuda_visible_devices (Optional[str]):
            The value of the ``CUDA_VISIBLE_DEVICES`` variable. If not given, the value from the
            environment will be used. If explicitly given by :data:`None`, the ``CUDA_VISIBLE_DEVICES``
            environment variable will be unset before parsing.

    Returns: str
        The comma-separated string (GPU UUIDs) of the ``CUDA_VISIBLE_DEVICES`` environment variable.

    Examples:
        >>> import os
        >>> os.environ['CUDA_VISIBLE_DEVICES'] = '6,5'
        >>> normalize_cuda_visible_devices()        # normalize the `CUDA_VISIBLE_DEVICES` environment variable to UUID strings
        'GPU-849d5a8d-610e-eeea-1fd4-81ff44a23794,GPU-18ef14e9-dec6-1d7e-1284-3010c6ce98b1'

        >>> normalize_cuda_visible_devices('4')     # pass the `CUDA_VISIBLE_DEVICES` value explicitly
        'GPU-96de99c9-d68f-84c8-424c-7c75e59cc0a0'

        >>> normalize_cuda_visible_devices('GPU-18ef14e9,GPU-849d5a8d')  # normalize abbreviated UUIDs
        'GPU-18ef14e9-dec6-1d7e-1284-3010c6ce98b1,GPU-849d5a8d-610e-eeea-1fd4-81ff44a23794'

        >>> normalize_cuda_visible_devices(None)    # get all devices when the `CUDA_VISIBLE_DEVICES` environment variable unset
        'GPU-<GPU0-UUID>,GPU-<GPU1-UUID>,...'  # all GPU UUIDs

        >>> normalize_cuda_visible_devices('MIG-d184f67c-c95f-5ef2-a935-195bd0094fbd')           # MIG device support (MIG UUID)
        'MIG-d184f67c-c95f-5ef2-a935-195bd0094fbd'
        >>> normalize_cuda_visible_devices('MIG-GPU-3eb79704-1571-707c-aee8-f43ce747313d/13/0')  # MIG device support (GPU UUID)
        'MIG-37b51284-1df4-5451-979d-3231ccb0822e'
        >>> normalize_cuda_visible_devices('MIG-GPU-3eb79704/13/0')                              # MIG device support (abbreviated GPU UUID)
        'MIG-37b51284-1df4-5451-979d-3231ccb0822e'

        >>> normalize_cuda_visible_devices('')      # empty string
        ''
        >>> normalize_cuda_visible_devices('0,0')   # invalid `CUDA_VISIBLE_DEVICES` (duplicate device ordinal)
        ''
        >>> normalize_cuda_visible_devices('16')    # invalid `CUDA_VISIBLE_DEVICES` (device ordinal out of range)
        ''
    """  # pylint: disable=line-too-long
    value = cuda_visible_devices
    if value is _VALUE_OMITTED:
        # No argument supplied: read the environment (None when the variable is unset)
        value = os.environ.get('CUDA_VISIBLE_DEVICES')

    visible_uuids = _parse_cuda_visible_devices(value, format='uuid')
    return ','.join(visible_uuids)
|
|
|
|
|
|
# Helper functions #################################################################################
|
|
|
|
|
|
class _PhysicalDeviceAttrs(NamedTuple):
    """Attributes of one physical device as collected by ``_get_all_physical_device_attrs``."""

    # Filled from ``device.index``
    index: int
    # Filled from ``device.name()``
    name: str
    # Filled from ``device.uuid()``; also used as the key of the attribute table
    uuid: str
    # Filled from ``libnvml.nvmlCheckReturn(device.mig_mode())``
    support_mig_mode: bool
|
|
|
|
|
|
# Lazily-built table of physical device attributes, filled by `_get_all_physical_device_attrs()`
_PHYSICAL_DEVICE_ATTRS = None
# Physical device currently published by the `_global_physical_device()` context manager
_GLOBAL_PHYSICAL_DEVICE = None
# Re-entrant lock guarding the two module-level variables above
_GLOBAL_PHYSICAL_DEVICE_LOCK = threading.RLock()
|
|
|
|
|
|
def _get_all_physical_device_attrs() -> dict[str, _PhysicalDeviceAttrs]:
    """Return the table mapping each physical device UUID to its ``_PhysicalDeviceAttrs``.

    The table is built on the first call, while holding the module lock, from
    ``PhysicalDevice.all()`` and is stored in a module-level variable; later calls return the
    stored table.
    """
    global _PHYSICAL_DEVICE_ATTRS  # pylint: disable=global-statement

    with _GLOBAL_PHYSICAL_DEVICE_LOCK:
        if _PHYSICAL_DEVICE_ATTRS is None:
            table = OrderedDict()
            for device in PhysicalDevice.all():
                key = device.uuid()
                table[key] = _PhysicalDeviceAttrs(
                    device.index,
                    device.name(),
                    device.uuid(),
                    libnvml.nvmlCheckReturn(device.mig_mode()),
                )
            # Publish only after the whole table has been built successfully
            _PHYSICAL_DEVICE_ATTRS = table
        return _PHYSICAL_DEVICE_ATTRS
|
|
|
|
|
|
def _does_any_device_support_mig_mode(uuids: Iterable[str] | None = None) -> bool:
    """Return whether any of the given physical devices has ``support_mig_mode`` set.

    Args:
        uuids (Optional[Iterable[str]]):
            UUIDs to look up in the physical device attribute table. When omitted or empty,
            every device in the table is checked.
    """
    attrs_by_uuid = _get_all_physical_device_attrs()
    candidates = uuids if uuids else attrs_by_uuid.keys()
    for uuid in candidates:
        if attrs_by_uuid[uuid].support_mig_mode:
            return True
    return False
|
|
|
|
|
|
@contextlib.contextmanager
def _global_physical_device(device: PhysicalDevice) -> PhysicalDevice:
    """Publish ``device`` as the module-wide physical device for the duration of the block.

    The module lock is held for the whole ``with`` block. On exit the previously published
    device is restored, so a nested use (the lock is re-entrant) does not wipe out the device
    of the enclosing block.

    Args:
        device (PhysicalDevice):
            The physical device to publish.

    Yields:
        The published device.
    """
    global _GLOBAL_PHYSICAL_DEVICE  # pylint: disable=global-statement

    with _GLOBAL_PHYSICAL_DEVICE_LOCK:
        previous = _GLOBAL_PHYSICAL_DEVICE
        _GLOBAL_PHYSICAL_DEVICE = device
        try:
            yield device
        finally:
            # Restore rather than reset to None: an outermost block still ends up with None
            _GLOBAL_PHYSICAL_DEVICE = previous
|
|
|
|
|
|
def _get_global_physical_device() -> PhysicalDevice:
    """Return the physical device currently published by ``_global_physical_device``.

    The module-level value is read while holding the module lock. It is ``None`` when no device
    is published.
    """
    _GLOBAL_PHYSICAL_DEVICE_LOCK.acquire()
    try:
        return _GLOBAL_PHYSICAL_DEVICE
    finally:
        _GLOBAL_PHYSICAL_DEVICE_LOCK.release()
|
|
|
|
|
|
@functools.lru_cache()
def _parse_cuda_visible_devices(  # pylint: disable=too-many-branches,too-many-statements
    cuda_visible_devices: str | None = None,
    format: str = 'index',  # pylint: disable=redefined-builtin
) -> list[int] | list[tuple[int, int]] | list[str]:
    """The underlying implementation for :meth:`parse_cuda_visible_devices`. The result will be cached.

    The value is first resolved through the CUDA driver in a subprocess. If that yields only
    physical GPUs and none of them supports MIG mode, the answer is returned directly. Otherwise
    (or if the CUDA driver call fails) the identifiers are parsed manually with NVML.

    Args:
        cuda_visible_devices (Optional[str]):
            The ``CUDA_VISIBLE_DEVICES`` value; :data:`None` stands for an unset variable.
        format (str):
            ``'index'`` to return NVML indices, ``'uuid'`` to return UUID strings.
    """
    assert format in ('index', 'uuid')

    try:
        physical_device_attrs = _get_all_physical_device_attrs()
    except libnvml.NVMLError:
        return []
    gpu_uuids = set(physical_device_attrs)

    try:
        raw_uuids = _parse_cuda_visible_devices_to_uuids(cuda_visible_devices, verbose=False)
    except libcuda.CUDAError:
        pass  # fall through to the manual parsing below
    else:
        # The driver reports bare UUIDs: prefix with `GPU-` when known, otherwise with `MIG-`
        uuids = []
        for raw_uuid in raw_uuids:
            gpu_uuid = f'GPU-{raw_uuid}'
            uuids.append(gpu_uuid if gpu_uuid in gpu_uuids else gpu_uuid.replace('GPU', 'MIG', 1))

        if set(uuids) <= gpu_uuids and not _does_any_device_support_mig_mode(uuids):
            if format == 'uuid':
                return uuids
            return [physical_device_attrs[uuid].index for uuid in uuids]
        cuda_visible_devices = ','.join(uuids)

    if cuda_visible_devices is None:
        cuda_visible_devices = ','.join(physical_device_attrs.keys())

    # Set by the first valid identifier: True for integer ordinals, False for UUIDs.
    # Mixing both kinds in one value is rejected.
    integer_mode = None

    def from_index_or_uuid(index_or_uuid: int | str) -> Device:
        nonlocal integer_mode

        if isinstance(index_or_uuid, str):
            if index_or_uuid.isdigit():
                index_or_uuid = int(index_or_uuid)
            elif Device.UUID_PATTERN.match(index_or_uuid) is None:
                raise libnvml.NVMLError_NotFound

        is_integer = isinstance(index_or_uuid, int)
        if integer_mode is None:
            integer_mode = is_integer

        if is_integer and integer_mode:
            return Device(index=index_or_uuid)
        if isinstance(index_or_uuid, str) and not integer_mode:
            return Device(uuid=index_or_uuid)
        raise ValueError('invalid identifier')

    def strip_identifier(identifier: str) -> str:
        # Keep only the leading `[+-]digits` prefix of a numeric identifier;
        # non-numeric identifiers are returned as-is (whitespace-stripped).
        identifier = identifier.strip()
        digits_start = 1 if identifier[:1] in ('+', '-') else 0
        if not identifier[digits_start : digits_start + 1].isdigit():
            return identifier
        end = digits_start
        while end < len(identifier) and identifier[end].isdigit():
            end += 1
        return identifier[:end]

    devices = []
    seen_identifiers = set()
    for raw_identifier in cuda_visible_devices.split(','):
        identifier = strip_identifier(raw_identifier)
        if identifier in seen_identifiers:
            return []  # duplicate identifiers found

        try:
            device = from_index_or_uuid(identifier)
        except (ValueError, libnvml.NVMLError):
            break  # the first invalid identifier ends the list

        devices.append(device)
        seen_identifiers.add(identifier)

    visible_mig_devices = [device for device in devices if device.is_mig_device()]
    if visible_mig_devices:
        # Got MIG devices enumerated, use the first one
        devices = visible_mig_devices[:1]  # at most one MIG device is visible
    else:
        # All devices in `CUDA_VISIBLE_DEVICES` are physical devices
        # Check if any GPU that enables MIG mode
        candidates, devices = devices, []
        for device in candidates:
            if not device.is_mig_mode_enabled():
                devices.append(device)  # non-MIG device
                continue
            # Got available MIG devices, use the first MIG device and ignore all non-MIG GPUs
            try:
                first_mig_device = device.mig_device(mig_index=0)
            except libnvml.NVMLError:
                continue  # no MIG device available on the GPU
            devices = [first_mig_device]  # at most one MIG device is visible
            break  # got one MIG device

    if format == 'uuid':
        return [device.uuid() for device in devices]
    return [device.index for device in devices]
|
|
|
|
|
|
def _parse_cuda_visible_devices_to_uuids(
    cuda_visible_devices: str | None = _VALUE_OMITTED,
    verbose: bool = True,
) -> list[str]:
    """Parse the given ``CUDA_VISIBLE_DEVICES`` environment variable in a separate process and return a list of device UUIDs.

    The UUIDs do not have a prefix ``GPU-`` or ``MIG-``.

    Args:
        cuda_visible_devices (Optional[str]):
            The value of the ``CUDA_VISIBLE_DEVICES`` variable. If not given, the value from the
            environment will be used. If explicitly given by :data:`None`, the ``CUDA_VISIBLE_DEVICES``
            environment variable will be unset before parsing.
        verbose (bool):
            Forwarded to the subprocess worker, which re-raises its exception when this is true.

    Returns: List[str]
        A list of device UUIDs without ``GPU-`` or ``MIG-`` prefixes.

    Raises:
        libcuda.CUDAError_NotInitialized:
            If the CUDA driver libraries cannot be found, or if the subprocess terminated without
            reporting any result.
        libcuda.CUDAError:
            If failed to parse the ``CUDA_VISIBLE_DEVICES`` environment variable.
    """  # pylint: disable=line-too-long
    if cuda_visible_devices is _VALUE_OMITTED:
        cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', default=None)

    # Do not inherit file descriptors and handles from the parent process
    # The `fork` start method should be considered unsafe as it can lead to crashes of the subprocess
    ctx = mp.get_context('spawn')
    queue = ctx.SimpleQueue()
    # Create the process object before entering the `try` block so that the cleanup in
    # `finally` never refers to an unbound name
    parser = ctx.Process(
        target=_cuda_visible_devices_parser,
        args=(cuda_visible_devices, queue, verbose),
        name='`CUDA_VISIBLE_DEVICES` parser',
        daemon=True,
    )
    try:
        parser.start()
        parser.join()
    finally:
        try:
            parser.kill()  # requires Python 3.7+
        except AttributeError:
            pass

    # The worker normally always posts at least one item. If it died without doing so (e.g. a
    # hard crash inside the driver), `queue.get()` would block forever, so report the same
    # error the worker uses as its fallback instead.
    if queue.empty():
        raise libcuda.CUDAError_NotInitialized  # pylint: disable=no-member
    result = queue.get()

    if isinstance(result, Exception):
        raise result
    return result
|
|
|
|
|
|
def _cuda_visible_devices_parser(
    cuda_visible_devices: str,
    queue: mp.SimpleQueue,
    verbose: bool = True,
) -> None:
    """Worker run in a subprocess: resolve ``CUDA_VISIBLE_DEVICES`` with the CUDA driver.

    The first item posted to ``queue`` is the result: the list of device UUIDs on success, an
    empty list if ``cuInit`` fails with one of the handled errors, or the raised exception
    otherwise. A trailing ``CUDAError_NotInitialized`` is always posted so the queue is never
    empty.

    Args:
        cuda_visible_devices (Optional[str]):
            Value to set for ``CUDA_VISIBLE_DEVICES`` in this process; :data:`None` unsets it.
        queue (mp.SimpleQueue):
            Queue used to send the result back to the parent process.
        verbose (bool):
            If true, re-raise the exception after it has been posted to the queue.
    """
    try:
        if cuda_visible_devices is None:
            os.environ.pop('CUDA_VISIBLE_DEVICES', None)
        else:
            os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices

        # pylint: disable=no-member
        try:
            libcuda.cuInit()
        except (
            libcuda.CUDAError_NoDevice,
            libcuda.CUDAError_InvalidDevice,
            libcuda.CUDAError_SystemDriverMismatch,
            libcuda.CUDAError_CompatNotSupportedOnDevice,
        ):
            # Post the empty result first, then let the outer handler deal with the exception
            queue.put([])
            raise

        device_count = libcuda.cuDeviceGetCount()
        device_uuids = [
            libcuda.cuDeviceGetUuid(libcuda.cuDeviceGet(ordinal)) for ordinal in range(device_count)
        ]
        queue.put(device_uuids)
    except Exception as ex:  # noqa: BLE001 # pylint: disable=broad-except
        queue.put(ex)
        if verbose:
            raise ex
    finally:
        # Ensure non-empty queue
        queue.put(libcuda.CUDAError_NotInitialized())  # pylint: disable=no-member
|