docs(core): rephrase docstrings

Signed-off-by: Xuehai Pan <XuehaiPan@pku.edu.cn>
This commit is contained in:
Xuehai Pan 2023-01-03 10:20:27 +00:00
parent 7aa2f068c1
commit cb84fa8197
17 changed files with 486 additions and 680 deletions

View file

@ -132,3 +132,4 @@ lol
xx
yyy
zz
CLI

View file

@ -1,7 +1,7 @@
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
# License: GNU GPL version 3.
# pylint: disable=missing-module-docstring
"""The interactive NVIDIA-GPU process viewer."""
import sys

View file

@ -32,28 +32,25 @@ from nvitop.core import libnvml
# Ported version of .pytorch_lightning.GpuStatsLogger for Keras
class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
r"""
Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a
callback and in order to use it you need to assign a TensorBoard callback or
a CSVLogger callback to the model.
"""Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a callback and
in order to use it you need to assign a TensorBoard callback or a CSVLogger callback to the model.
Args:
memory_utilization (bool):
Set to :data:`True` to log used, free and the percentage of memory
utilization at the start and end of each step. Default: :data:`True`.
Set to :data:`True` to log used, free and the percentage of memory utilization at the
start and end of each step. Default: :data:`True`.
gpu_utilization (bool):
Set to :data:`True` to log the percentage of GPU utilization
at the start and end of each step. Default: :data:`True`.
Set to :data:`True` to log the percentage of GPU utilization at the start and end of
each step. Default: :data:`True`.
intra_step_time (bool):
Set to :data:`True` to log the time of each step. Default: :data:`False`.
inter_step_time (bool):
Set to :data:`True` to log the time between the end of one step
and the start of the next step. Default: :data:`False`.
Set to :data:`True` to log the time between the end of one step and the start of the
next step. Default: :data:`False`.
fan_speed (bool):
Set to :data:`True` to log percentage of fan speed. Default: :data:`False`.
temperature (bool):
Set to :data:`True` to log the gpu temperature in degree Celsius.
Default: :data:`False`.
Set to :data:`True` to log the GPU temperature in degrees Celsius. Default: :data:`False`.
Raises:
ValueError:
@ -77,16 +74,19 @@ class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
GPU stats are mainly based on NVML queries. The description of the queries is as follows:
- **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan is currently
intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the intended fan speed.
If the fan is physically blocked and unable to spin, this output will not match the actual fan speed.
Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.
- **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan is
currently intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the
intended fan speed. If the fan is physically blocked and unable to spin, this output will not
match the actual fan speed. Many parts do not report fan speeds because they rely on cooling
via fans in the surrounding enclosure.
- **memory.used** - Total memory allocated by active contexts, in MiBs.
- **memory.free** - Total free memory, in MiBs.
- **utilization.gpu** - Percent of time over the past sample period during which one or more kernels was
executing on the GPU. The sample period may be between 1 second and 1/6 second depending on the product.
- **utilization.memory** - Percent of time over the past sample period during which global (device) memory was
being read or written. The sample period may be between 1 second and 1/6 second depending on the product.
- **utilization.gpu** - Percent of time over the past sample period during which one or more
kernels was executing on the GPU. The sample period may be between 1 second and 1/6 second
depending on the product.
- **utilization.memory** - Percent of time over the past sample period during which global
(device) memory was being read or written. The sample period may be between 1 second and 1/6
second depending on the product.
- **temperature** - Core GPU temperature, in degrees C.
"""
@ -167,8 +167,7 @@ class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
)
def _get_gpu_stats(self) -> Dict[str, float]:
"""Get the gpu status from NVML queries"""
"""Get the gpu status from NVML queries."""
return get_gpu_stats(
devices=self._devices,
memory_utilization=self._memory_utilization,

View file

@ -33,27 +33,25 @@ from nvitop.core import libnvml
# Modified from pytorch_lightning.callbacks.GPUStatsMonitor
class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
r"""
Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a
callback and in order to use it you need to assign a logger in the ``Trainer``.
"""Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a callback and
in order to use it you need to assign a logger in the ``Trainer``.
Args:
memory_utilization (bool):
Set to :data:`True` to log used, free and the percentage of memory
utilization at the start and end of each step. Default: :data:`True`.
Set to :data:`True` to log used, free and the percentage of memory utilization at the
start and end of each step. Default: :data:`True`.
gpu_utilization (bool):
Set to :data:`True` to log the percentage of GPU utilization
at the start and end of each step. Default: :data:`True`.
Set to :data:`True` to log the percentage of GPU utilization at the start and end of
each step. Default: :data:`True`.
intra_step_time (bool):
Set to :data:`True` to log the time of each step. Default: :data:`False`.
inter_step_time (bool):
Set to :data:`True` to log the time between the end of one step
and the start of the next step. Default: :data:`False`.
Set to :data:`True` to log the time between the end of one step and the start of the
next step. Default: :data:`False`.
fan_speed (bool):
Set to :data:`True` to log percentage of fan speed. Default: :data:`False`.
temperature (bool):
Set to :data:`True` to log the gpu temperature in degree Celsius.
Default: :data:`False`.
Set to :data:`True` to log the GPU temperature in degrees Celsius. Default: :data:`False`.
Raises:
MisconfigurationException:
@ -68,16 +66,19 @@ class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
GPU stats are mainly based on NVML queries. The description of the queries is as follows:
- **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan is currently
intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the intended fan speed.
If the fan is physically blocked and unable to spin, this output will not match the actual fan speed.
Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.
- **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan is
currently intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the
intended fan speed. If the fan is physically blocked and unable to spin, this output will not
match the actual fan speed. Many parts do not report fan speeds because they rely on cooling
via fans in the surrounding enclosure.
- **memory.used** - Total memory allocated by active contexts, in MiBs.
- **memory.free** - Total free memory, in MiBs.
- **utilization.gpu** - Percent of time over the past sample period during which one or more kernels was
executing on the GPU. The sample period may be between 1 second and 1/6 second depending on the product.
- **utilization.memory** - Percent of time over the past sample period during which global (device) memory was
being read or written. The sample period may be between 1 second and 1/6 second depending on the product.
- **utilization.gpu** - Percent of time over the past sample period during which one or more
kernels was executing on the GPU. The sample period may be between 1 second and 1/6 second
depending on the product.
- **utilization.memory** - Percent of time over the past sample period during which global
(device) memory was being read or written. The sample period may be between 1 second and 1/6
second depending on the product.
- **temperature** - Core GPU temperature, in degrees C.
"""
@ -161,8 +162,7 @@ class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
trainer.logger.log_metrics(logs, step=trainer.global_step)
def _get_gpu_stats(self) -> Dict[str, float]:
"""Get the gpu status from NVML queries"""
"""Get the gpu status from NVML queries."""
return get_gpu_stats(
devices=self._devices,
memory_utilization=self._memory_utilization,

View file

@ -19,6 +19,9 @@
def add_scalar_dict(writer, main_tag, tag_scalar_dict, global_step=None, walltime=None):
    """Add a batch of scalars to the writer.

    Batched version of ``writer.add_scalar``: each entry of ``tag_scalar_dict`` is written with
    its own ``writer.add_scalar`` call under the tag ``'<main_tag>/<tag>'``.

    Args:
        writer:
            An object exposing ``add_scalar(tag, scalar, global_step=..., walltime=...)``.
        main_tag:
            The prefix joined to every tag with a ``/`` separator.
        tag_scalar_dict:
            A mapping (anything with ``.items()``) from tag to scalar value.
        global_step:
            Forwarded unchanged to every ``writer.add_scalar`` call.
        walltime:
            Forwarded unchanged to every ``writer.add_scalar`` call.
    """
    for tag, scalar in tag_scalar_dict.items():
        writer.add_scalar(f'{main_tag}/{tag}', scalar, global_step=global_step, walltime=walltime)

View file

@ -43,8 +43,7 @@ def get_gpu_stats(
fan_speed: bool = False,
temperature: bool = False,
) -> Dict[str, float]:
"""Get the GPU status from NVML queries"""
"""Get the GPU status from NVML queries."""
stats = {}
for device in devices:
prefix = f'gpu_id: {device.cuda_index}'

View file

@ -1,7 +1,7 @@
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
# License: GNU GPL version 3.
# pylint: disable=missing-module-docstring,missing-function-docstring
"""The interactive NVIDIA-GPU process viewer."""
import argparse
import curses
@ -20,7 +20,7 @@ NVITOP_MONITOR_MODE = set(
def parse_arguments(): # pylint: disable=too-many-branches,too-many-statements
"""Parse command-line arguments for ``nvitop``."""
coloring_rules = '{} < th1 %% <= {} < th2 %% <= {}'.format(
colored('light', 'green'), colored('moderate', 'yellow'), colored('heavy', 'red')
)
@ -252,6 +252,7 @@ def parse_arguments(): # pylint: disable=too-many-branches,too-many-statements
def main(): # pylint: disable=too-many-branches,too-many-statements,too-many-locals
"""Main function for ``nvitop`` CLI."""
args = parse_arguments()
if args.force_color:

View file

@ -15,7 +15,7 @@
# limitations under the License.
# ==============================================================================
# pylint: disable=missing-module-docstring
"""Resource metrics collectors."""
import contextlib
import itertools
@ -44,24 +44,27 @@ class SnapshotResult(NamedTuple): # pylint: disable=missing-class-docstring
timer = time.monotonic
def _unique(iterable: Iterable[Hashable]) -> List[Hashable]:
return list(OrderedDict.fromkeys(iterable).keys())
# pylint: disable-next=too-many-branches
def take_snapshots(
devices: Optional[Union[Device, Iterable[Device]]] = None,
*,
gpu_processes: Optional[Union[bool, GpuProcess, Iterable[GpuProcess]]] = None,
) -> SnapshotResult:
"""Retrieves status of demanded devices and GPU processes.
"""Retrieve status of demanded devices and GPU processes.
Args:
devices (Optional[Union[Device, Iterable[Device]]]):
Requested devices for snapshots. If not given, the devices will be
determined from GPU processes:
- All devices (no GPU processes are given)
- Devices that used by given GPU processes
Requested devices for snapshots. If not given, the devices will be determined from GPU
processes: **(1)** All devices (no GPU processes are given); **(2)** Devices that are used
by given GPU processes.
gpu_processes (Optional[Union[bool, GpuProcess, Iterable[GpuProcess]]]):
Requested GPU processes snapshots. If not given, all GPU processes
running on the requested device will be returned. The GPU process
snapshots can be suppressed by specifying ``gpu_processes=False``.
Requested GPU processes snapshots. If not given, all GPU processes running on the
requested device will be returned. The GPU process snapshots can be suppressed by
specifying ``gpu_processes=False``.
Returns: SnapshotResult
A named tuple containing two lists of snapshots.
@ -71,7 +74,6 @@ def take_snapshots(
be returned.
Examples:
>>> from nvitop import take_snapshots, Device
>>> import os
>>> os.environ['CUDA_VISIBLE_DEVICES'] = '1,0'
@ -136,10 +138,6 @@ def take_snapshots(
]
)
""" # pylint: disable=line-too-long
def unique(iterable: Iterable[Hashable]) -> List[Hashable]:
return list(OrderedDict.fromkeys(iterable).keys())
if isinstance(devices, Device):
devices = [devices]
if isinstance(gpu_processes, GpuProcess):
@ -148,7 +146,7 @@ def take_snapshots(
if gpu_processes is not None:
if gpu_processes: # is not False or is a non-empty list/tuple
gpu_processes = list(gpu_processes)
process_devices = unique(process.device for process in gpu_processes)
process_devices = _unique(process.device for process in gpu_processes)
for device in process_devices:
device.processes() # update GPU status for requested GPU processes
if devices is None:
@ -193,57 +191,55 @@ def collect_in_background(
tag: str = 'metrics-daemon',
start: bool = True,
) -> threading.Thread:
"""Starts a background daemon thread that collect and call the callback function periodically.
"""Start a background daemon thread that collects metrics and calls the callback function periodically.
See also :func:`ResourceMetricCollector.daemonize`.
Args:
on_collect: (Callable[[Dict[str, float]], bool])
on_collect (Callable[[Dict[str, float]], bool]):
A callback function that will be called periodically. It takes a dictionary containing
the resource metrics and returns a boolean indicating whether to continue monitoring.
collector: (Optional[ResourceMetricCollector])
collector (Optional[ResourceMetricCollector]):
A :class:`ResourceMetricCollector` instance to collect metrics. If not given, it will
collect metrics for all GPUs and subprocesses of the current process.
interval: (Optional[float])
interval (Optional[float]):
The collect interval. If not given, use ``collector.interval``.
on_start: (Optional[Callable[['ResourceMetricCollector'], None]])
on_start (Optional[Callable[[ResourceMetricCollector], None]]):
A function to initialize the daemon thread and collector.
on_stop: (Optional[Callable[['ResourceMetricCollector'], None]])
on_stop (Optional[Callable[[ResourceMetricCollector], None]]):
A function that does some necessary cleanup after the daemon thread is stopped.
tag: (str)
tag (str):
The tag prefix used for metrics results.
start: (bool)
start (bool):
Whether to start the daemon thread on return.
Returns: threading.Thread
A daemon thread object.
Examples:
.. code-block:: python
.. code-block:: python
logger = ...
logger = ...
def on_collect(metrics): # will be called periodically
if logger.is_closed(): # closed manually by user
return False
logger.log(metrics)
return True
def on_collect(metrics): # will be called periodically
if logger.is_closed(): # closed manually by user
return False
logger.log(metrics)
return True
def on_stop(collector): # will be called only once at stop
if not logger.is_closed():
logger.close() # cleanup
def on_stop(collector): # will be called only once at stop
if not logger.is_closed():
logger.close() # cleanup
# Record metrics to the logger in background every 5 seconds.
# It will collect 5-second mean/min/max for each metric.
collect_in_background(
on_collect,
ResourceMetricCollector(Device.cuda.all()),
interval=5.0,
on_stop=on_stop,
)
# Record metrics to the logger in background every 5 seconds.
# It will collect 5-second mean/min/max for each metric.
collect_in_background(
on_collect,
ResourceMetricCollector(Device.cuda.all()),
interval=5.0,
on_stop=on_stop,
)
"""
if collector is None:
collector = ResourceMetricCollector()
if isinstance(interval, (int, float)) and interval > 0:
@ -282,13 +278,13 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
Args:
devices (Iterable[Device]):
Set of Device instances for logging. If not given, all physical
devices on board will be used.
Set of Device instances for logging. If not given, all physical devices on board will be
used.
root_pids (Set[int]):
A set of PIDs, only the status of the descendant processes on the
GPUs will be collected. If not given, the PID of the current process
will be used.
interval (float): The snapshot interval for background daemon thread.
A set of PIDs, only the status of the descendant processes on the GPUs will be collected.
If not given, the PID of the current process will be used.
interval (float):
The snapshot interval for background daemon thread.
Core methods:
@ -305,7 +301,6 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
collector.daemonize(on_collect_fn)
Examples:
>>> import os
>>> os.environ['CUDA_VISIBLE_DEVICES'] = '3,2,1,0'
@ -398,6 +393,7 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
root_pids: Optional[Iterable[int]] = None,
interval: Union[int, float] = 1.0,
) -> None:
"""Initialize the resource metric collector."""
if isinstance(interval, (int, float)) and interval > 0:
interval = float(interval)
else:
@ -440,15 +436,14 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
self._daemon_running = threading.Event()
def activate(self, tag: str) -> 'ResourceMetricCollector':
"""Starts a new metric collection with the given tag.
"""Start a new metric collection with the given tag.
Args:
tag (str):
The name of the new metric collection. The tag will be used to
identify the metric collection. It must be a unique string.
The name of the new metric collection. The tag will be used to identify the metric
collection. It must be a unique string.
Examples:
>>> collector = ResourceMetricCollector()
>>> collector.activate(tag='train') # key prefix -> 'train'
@ -457,7 +452,6 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
>>> collector.deactivate() # the collector has been stopped
>>> collector.activate(tag='test') # key prefix -> 'test'
"""
with self._lock:
if self._metric_buffer is None or tag not in self._tags:
self._tags.add(tag)
@ -477,11 +471,15 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
start = activate
def deactivate(self, tag: Optional[str] = None) -> 'ResourceMetricCollector':
"""Stops the current collection with the given tag and remove all sub-tags.
If the tag is not specified, deactivate the current active collection.
For nested collections, the sub-collections will be deactivated as well.
"""
"""Stop the current collection with the given tag and remove all sub-tags.
If the tag is not specified, deactivate the current active collection. For nested
collections, the sub-collections will be deactivated as well.
Args:
tag (Optional[str]):
The tag to deactivate. If :data:`None`, the current active collection will be used.
"""
with self._lock:
if self._metric_buffer is None:
if tag is not None:
@ -516,18 +514,16 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
Args:
tag (str):
The name of the new metric collection. The tag will be used to
identify the metric collection. It must be a unique string.
The name of the new metric collection. The tag will be used to identify the metric
collection. It must be a unique string.
Examples:
>>> collector = ResourceMetricCollector()
>>> with collector.context(tag='train'): # key prefix -> 'train'
... # Do something
... collector.collect() # -> Dict[str, float]
"""
try:
self.activate(tag=tag)
yield self
@ -537,17 +533,16 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
__call__ = context # alias for `with collector(tag='<tag>')`
def clear(self, tag: Optional[str] = None) -> None:
"""Resets the metric collection with the given tag. If the tag is not
specified, reset the current active collection. For nested collections,
"""Reset the metric collection with the given tag.
If the tag is not specified, reset the current active collection. For nested collections,
the sub-collections will be reset as well.
Args:
tag (Optional[str]):
The tag to reset. If None, the current active collection
will be reset.
The tag to reset. If :data:`None`, the current active collection will be reset.
Examples:
>>> collector = ResourceMetricCollector()
>>> with collector(tag='train'): # key prefix -> 'train'
@ -564,7 +559,6 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
... with collector(tag='batch'): # key prefix -> 'train/batch'
... collector.reset(tag='train') # reset both 'train' and 'train/batch'
"""
with self._lock:
if self._metric_buffer is None:
if tag is not None:
@ -586,8 +580,7 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
buffer = buffer.prev
def collect(self) -> Dict[str, float]:
"""Gets the average resource consumption during collection."""
"""Get the average resource consumption during collection."""
with self._lock:
if self._metric_buffer is None:
raise RuntimeError('Resource metric collector has not been not started yet.')
@ -607,52 +600,51 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
tag: str = 'metrics-daemon',
start: bool = True,
) -> threading.Thread:
"""Starts a background daemon thread that collect and call the callback function periodically.
"""Start a background daemon thread that collects metrics and calls the callback function periodically.
See also :func:`collect_in_background`.
Args:
on_collect: (Callable[[Dict[str, float]], bool])
on_collect (Callable[[Dict[str, float]], bool]):
A callback function that will be called periodically. It takes a dictionary containing
the resource metrics and returns a boolean indicating whether to continue monitoring.
interval: (Optional[float])
interval (Optional[float]):
The collect interval. If not given, use ``collector.interval``.
on_start: (Optional[Callable[['ResourceMetricCollector'], None]])
on_start (Optional[Callable[[ResourceMetricCollector], None]]):
A function to initialize the daemon thread and collector.
on_stop: (Optional[Callable[['ResourceMetricCollector'], None]])
on_stop (Optional[Callable[[ResourceMetricCollector], None]]):
A function that does some necessary cleanup after the daemon thread is stopped.
tag: (str)
tag (str):
The tag prefix used for metrics results.
start: (bool)
start (bool):
Whether to start the daemon thread on return.
Returns: threading.Thread
A daemon thread object.
Examples:
.. code-block:: python
.. code-block:: python
logger = ...
logger = ...
def on_collect(metrics): # will be called periodically
if logger.is_closed(): # closed manually by user
return False
logger.log(metrics)
return True
def on_collect(metrics): # will be called periodically
if logger.is_closed(): # closed manually by user
return False
logger.log(metrics)
return True
def on_stop(collector): # will be called only once at stop
if not logger.is_closed():
logger.close() # cleanup
def on_stop(collector): # will be called only once at stop
if not logger.is_closed():
logger.close() # cleanup
# Record metrics to the logger in background every 5 seconds.
# It will collect 5-second mean/min/max for each metric.
ResourceMetricCollector(Device.cuda.all()).daemonize(
on_collect,
ResourceMetricCollector(Device.cuda.all()),
interval=5.0,
on_stop=on_stop,
)
# Record metrics to the logger in background every 5 seconds.
# It will collect 5-second mean/min/max for each metric.
ResourceMetricCollector(Device.cuda.all()).daemonize(
on_collect,
interval=5.0,
on_stop=on_stop,
)
"""
return collect_in_background(
on_collect,
@ -665,10 +657,12 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
)
def __del__(self) -> None:
"""Clean up the daemon thread on destruction."""
self._daemon_running.clear()
# pylint: disable-next=missing-function-docstring,too-many-branches,too-many-locals,too-many-statements
# pylint: disable-next=too-many-branches,too-many-locals,too-many-statements
def take_snapshots(self) -> SnapshotResult:
"""Take snapshots of the current resource metrics and update the metric buffer."""
if len(self.root_pids) > 0:
all_gpu_processes = []
for device in self.leaf_devices:

File diff suppressed because it is too large Load diff

View file

@ -16,8 +16,8 @@
# ==============================================================================
"""Shortcuts for package ``psutil``.
psutil is a cross-platform library for retrieving information on running processes
and system utilization (CPU, memory, disks, network, sensors) in Python.
``psutil`` is a cross-platform library for retrieving information on running processes and system
utilization (CPU, memory, disks, network, sensors) in Python.
"""
import os as _os
@ -50,31 +50,30 @@ swap_memory = _ttl_cache(ttl=0.25)(_psutil.swap_memory)
try:
load_average = _ttl_cache(ttl=2.0)(_psutil.getloadavg)
load_average.__doc__ = """Get the system load average."""
except AttributeError:
def load_average(): # pylint: disable=missing-function-docstring
def load_average():
"""Get the system load average."""
return None
def memory_percent():
    """The percentage usage of virtual memory, calculated as ``(total - available) / total * 100``.

    The value is read from the ``percent`` field of the module-level :func:`virtual_memory` result.
    """
    return virtual_memory().percent
def swap_percent():
    """The percentage usage of swap memory, calculated as ``used / total * 100``.

    The value is read from the ``percent`` field of the module-level :func:`swap_memory` result.
    """
    return swap_memory().percent
ppid_map = _psutil._ppid_map # pylint: disable=protected-access
"""Obtains a ``{pid: ppid, ...}`` dict for all running processes in one shot."""
"""Obtain a ``{pid: ppid, ...}`` dict for all running processes in one shot."""
def reverse_ppid_map(): # pylint: disable=function-redefined
"""Obtains a ``{ppid: [pid, ...], ...}`` dict for all running processes in one shot."""
"""Obtain a ``{ppid: [pid, ...], ...}`` dict for all running processes in one shot."""
from collections import defaultdict # pylint: disable=import-outside-toplevel
tree = defaultdict(list)

View file

@ -29,11 +29,11 @@ from typing import Type as _Type
# pylint: disable-next=missing-class-docstring,too-few-public-methods
class struct_c_CUdevice_t(_ctypes.Structure):
class _struct_c_CUdevice_t(_ctypes.Structure):
pass # opaque handle
c_CUdevice_t = _ctypes.POINTER(struct_c_CUdevice_t)
_c_CUdevice_t = _ctypes.POINTER(_struct_c_CUdevice_t)
_CUresult_t = _ctypes.c_uint
@ -229,8 +229,7 @@ class CUDAError(Exception):
_errcode_to_name = {}
def __new__(cls, value: int) -> 'CUDAError':
"""Maps value to a proper subclass of :class:`CUDAError`."""
"""Map value to a proper subclass of :class:`CUDAError`."""
if cls is CUDAError:
# pylint: disable-next=self-cls-assignment
cls = CUDAError._value_class_mapping.get(value, cls)
@ -239,6 +238,7 @@ class CUDAError(Exception):
return obj
def __str__(self) -> str:
"""Return a string representation of the error."""
# pylint: disable=no-member
try:
if self.value not in CUDAError._errcode_to_string:
@ -255,30 +255,32 @@ class CUDAError(Exception):
except CUDAError:
return f'CUDA Error with code {self.value}.'
__repr__ = __str__
def __eq__(self, other: object) -> bool:
    """Compare this error with another object by error code.

    Two :class:`CUDAError` instances are equal when their ``value`` attributes are equal. For
    any other operand :data:`NotImplemented` is returned, so Python can try the reflected
    comparison.
    """
    if isinstance(other, CUDAError):
        return self.value == other.value  # pylint: disable=no-member
    return NotImplemented
def __reduce__(self) -> _Tuple[_Type['CUDAError'], _Tuple[int]]:
    """Return the ``(callable, args)`` pair that pickling uses to rebuild this error."""
    error_code = self.value  # pylint: disable=no-member
    return CUDAError, (error_code,)
def cudaExceptionClass(cudaErrorCode: int) -> _Type[CUDAError]:
    """Map value to a proper subclass of :class:`CUDAError`.

    Args:
        cudaErrorCode (int):
            The error code to look up in the class mapping kept on :class:`CUDAError`.

    Returns:
        The class registered for ``cudaErrorCode``.

    Raises:
        ValueError: If the error code is not valid.
    """
    if cudaErrorCode not in CUDAError._value_class_mapping:  # pylint: disable=protected-access
        raise ValueError(f'cudaErrorCode {cudaErrorCode} is not valid.')
    return CUDAError._value_class_mapping[cudaErrorCode]  # pylint: disable=protected-access
def _extract_cuda_errors_as_classes() -> None:
"""Generates a hierarchy of classes on top of :class:`CUDAError` class.
"""Generate a hierarchy of classes on top of :class:`CUDAError` class.
Each CUDA Error gets a new :class:`CUDAError` subclass. This way try-except blocks can filter
appropriate exceptions more easily.
@ -286,7 +288,6 @@ def _extract_cuda_errors_as_classes() -> None:
:class:`CUDAError` is a parent class. Each ``CUDA_ERROR_*`` gets its own subclass.
e.g. :data:`CUDA_ERROR_INVALID_VALUE` will be turned into :class:`CUDAError_InvalidValue`.
"""
this_module = _sys.modules[__name__]
cuda_error_names = [x for x in dir(this_module) if x.startswith('CUDA_ERROR_')]
for err_name in cuda_error_names:
@ -339,8 +340,7 @@ __cudaGetFunctionPointer_cache = {}
def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr:
"""
Get the function pointer from the CUDA driver library.
"""Get the function pointer from the CUDA driver library.
Raises:
CUDAError_NotInitialized:
@ -348,7 +348,6 @@ def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr:
CUDAError_NotFound:
If the function pointer cannot be found.
"""
if name in __cudaGetFunctionPointer_cache:
return __cudaGetFunctionPointer_cache[name]
@ -364,14 +363,12 @@ def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr:
def __LoadCudaLibrary() -> None:
"""
Load the library if it isn't loaded already.
"""Load the library if it isn't loaded already.
Raises:
CUDAError_NotInitialized:
If the CUDA driver library cannot be found.
"""
global __cudaLib # pylint: disable=global-statement
if __cudaLib is None:
@ -409,7 +406,7 @@ def __LoadCudaLibrary() -> None:
def cuInit(flags: int = 0) -> None:
"""Initialize the CUDA driver API.
Initializes the driver API and must be called before any other function from the driver API.
The driver API must be initialized by this call before any other function from the driver API is used.
Currently, the ``flags`` parameter must be :data:`0`. If :func:`cuInit` has not been called,
any function from the driver API will return :data:`CUDA_ERROR_NOT_INITIALIZED`.
@ -429,7 +426,6 @@ def cuInit(flags: int = 0) -> None:
CUDAError_NotInitialized:
If the CUDA driver library cannot be found.
"""
global __initialized # pylint: disable=global-statement
__LoadCudaLibrary()
@ -447,7 +443,7 @@ def cuInit(flags: int = 0) -> None:
def cuGetErrorName(error: int) -> str:
"""Gets the string representation of an error code enum name.
"""Get the string representation of an error code enum name.
Raises:
CUDAError_InvalidValue:
@ -455,7 +451,6 @@ def cuGetErrorName(error: int) -> str:
CUDAError_NotInitialized:
If the CUDA driver API is not initialized.
"""
fn = __cudaGetFunctionPointer('cuGetErrorName')
p_name = _ctypes.POINTER(_ctypes.c_char_p)()
@ -466,7 +461,7 @@ def cuGetErrorName(error: int) -> str:
def cuGetErrorString(error: int) -> str:
"""Gets the string description of an error code.
"""Get the string description of an error code.
Raises:
CUDAError_InvalidValue:
@ -474,7 +469,6 @@ def cuGetErrorString(error: int) -> str:
CUDAError_NotInitialized:
If the CUDA driver API is not initialized.
"""
fn = __cudaGetFunctionPointer('cuGetErrorString')
p_name = _ctypes.POINTER(_ctypes.c_char_p)()
@ -485,7 +479,7 @@ def cuGetErrorString(error: int) -> str:
def cuDriverGetVersion() -> str:
"""Returns the latest CUDA version supported by driver.
"""Get the latest CUDA version supported by driver.
Returns:
A string of the form :data:`'<major>.<minor>'`.
@ -496,7 +490,6 @@ def cuDriverGetVersion() -> str:
CUDAError_NotInitialized:
If the CUDA driver API is not initialized.
"""
fn = __cudaGetFunctionPointer('cuDriverGetVersion')
driver_version = _ctypes.c_int()
@ -508,7 +501,7 @@ def cuDriverGetVersion() -> str:
def cuDeviceGetCount() -> int:
"""Returns the number of compute-capable devices.
"""Get the number of compute-capable devices.
Returns: int
The number of devices with compute capability greater than or equal to 2.0 that are available
@ -524,7 +517,6 @@ def cuDeviceGetCount() -> int:
CUDAError_NotInitialized:
If the CUDA driver API is not initialized.
"""
fn = __cudaGetFunctionPointer('cuDeviceGetCount')
count = _ctypes.c_int(0)
@ -533,8 +525,8 @@ def cuDeviceGetCount() -> int:
return count.value
def cuDeviceGet(ordinal: int) -> c_CUdevice_t:
"""Returns a handle to a compute device.
def cuDeviceGet(ordinal: int) -> _c_CUdevice_t:
"""Get a handle to a compute device.
Returns:
A device handle given an ordinal in the range :code:`[0, ..., cuDeviceGetCount() - 1]`.
@ -552,20 +544,19 @@ def cuDeviceGet(ordinal: int) -> c_CUdevice_t:
CUDAError_NotInitialized:
If the CUDA driver API is not initialized.
"""
fn = __cudaGetFunctionPointer('cuDeviceGet')
device = c_CUdevice_t()
device = _c_CUdevice_t()
ret = fn(_ctypes.byref(device), _ctypes.c_int(ordinal))
_cudaCheckReturn(ret)
return device
def cuDeviceGetByPCIBusId(pciBusId: str) -> c_CUdevice_t:
"""Returns a handle to a compute device.
def cuDeviceGetByPCIBusId(pciBusId: str) -> _c_CUdevice_t:
"""Get a handle to a compute device.
Args:
pciBusId: str
pciBusId (str):
String in one of the following forms: ``[domain]:[bus]:[device].[function]``,
``[domain]:[bus]:[device]``, ``[bus]:[device].[function]`` where ``domain``, ``bus``,
``device``, and ``function`` are all hexadecimal values.
@ -584,17 +575,16 @@ def cuDeviceGetByPCIBusId(pciBusId: str) -> c_CUdevice_t:
CUDAError_NotInitialized:
If the CUDA driver API is not initialized.
"""
fn = __cudaGetFunctionPointer('cuDeviceGetByPCIBusId')
device = c_CUdevice_t()
device = _c_CUdevice_t()
ret = fn(_ctypes.byref(device), _ctypes.c_char_p(pciBusId.encode('UTF-8')))
_cudaCheckReturn(ret)
return device
def cuDeviceGetPCIBusId(device: c_CUdevice_t) -> str:
"""Returns a PCI Bus Id string for the device.
def cuDeviceGetPCIBusId(device: _c_CUdevice_t) -> str:
"""Get a PCI Bus Id string for the device.
Returns: str
An identifier string for the device in the following format ``[domain]:[bus]:[device].[function]``
@ -611,7 +601,6 @@ def cuDeviceGetPCIBusId(device: c_CUdevice_t) -> str:
CUDAError_NotInitialized:
If the CUDA driver API is not initialized.
"""
fn = __cudaGetFunctionPointer('cuDeviceGetPCIBusId')
pciBusId = _ctypes.create_string_buffer(256)
@ -620,8 +609,8 @@ def cuDeviceGetPCIBusId(device: c_CUdevice_t) -> str:
return pciBusId.value.decode('UTF-8', errors='replace')
def cuDeviceGetName(device: c_CUdevice_t) -> str:
"""Returns an identifier string for the device.
def cuDeviceGetName(device: _c_CUdevice_t) -> str:
"""Get an identifier string for the device.
Returns: str
An ASCII string identifying the device.
@ -639,7 +628,6 @@ def cuDeviceGetName(device: c_CUdevice_t) -> str:
CUDAError_NotInitialized:
If the CUDA driver API is not initialized.
"""
fn = __cudaGetFunctionPointer('cuDeviceGetName')
name = _ctypes.create_string_buffer(256)
@ -648,8 +636,8 @@ def cuDeviceGetName(device: c_CUdevice_t) -> str:
return name.value.decode('UTF-8', errors='replace')
def cuDeviceGetUuid(device: c_CUdevice_t) -> str:
"""Returns a UUID for the device.
def cuDeviceGetUuid(device: _c_CUdevice_t) -> str:
"""Get a UUID for the device.
Raises:
CUDAError_InvalidDevice:
@ -662,7 +650,6 @@ def cuDeviceGetUuid(device: c_CUdevice_t) -> str:
CUDAError_NotInitialized:
If the CUDA driver API is not initialized.
"""
try:
fn = __cudaGetFunctionPointer('cuDeviceGetUuid_v2')
except AttributeError:
@ -676,8 +663,8 @@ def cuDeviceGetUuid(device: c_CUdevice_t) -> str:
return '-'.join((uuid[:8], uuid[8:12], uuid[12:16], uuid[16:20], uuid[20:32]))
def cuDeviceGetUuid_v2(device: c_CUdevice_t) -> str:
"""Returns a UUID for the device (CUDA 11.4+).
def cuDeviceGetUuid_v2(device: _c_CUdevice_t) -> str:
"""Get a UUID for the device (CUDA 11.4+).
Raises:
CUDAError_InvalidDevice:
@ -690,7 +677,6 @@ def cuDeviceGetUuid_v2(device: c_CUdevice_t) -> str:
CUDAError_NotInitialized:
If the CUDA driver API is not initialized.
"""
fn = __cudaGetFunctionPointer('cuDeviceGetUuid_v2')
ubyte_array = _ctypes.c_ubyte * 16
@ -701,8 +687,8 @@ def cuDeviceGetUuid_v2(device: c_CUdevice_t) -> str:
return '-'.join((uuid[:8], uuid[8:12], uuid[12:16], uuid[16:20], uuid[20:32]))
def cuDeviceTotalMem(device: c_CUdevice_t) -> int:
"""Returns the total amount of memory on the device (in bytes).
def cuDeviceTotalMem(device: _c_CUdevice_t) -> int:
"""Get the total amount of memory on the device (in bytes).
Raises:
CUDAError_InvalidContext:
@ -717,7 +703,6 @@ def cuDeviceTotalMem(device: c_CUdevice_t) -> int:
CUDAError_NotInitialized:
If the CUDA driver API is not initialized.
"""
fn = __cudaGetFunctionPointer('cuDeviceTotalMem')
bytes = _ctypes.c_size_t() # pylint: disable=redefined-builtin
@ -727,8 +712,7 @@ def cuDeviceTotalMem(device: c_CUdevice_t) -> int:
def is_available() -> bool:
"""Whether there are any CUDA visible devices."""
"""Test whether there are any CUDA visible devices."""
try:
return cuDeviceGetCount() > 0
except CUDAError:

View file

@ -281,8 +281,7 @@ class cudaError(Exception):
_errcode_to_name = {}
def __new__(cls, value: int) -> 'cudaError':
"""Maps value to a proper subclass of :class:`cudaError`."""
"""Map value to a proper subclass of :class:`cudaError`."""
if cls is cudaError:
# pylint: disable-next=self-cls-assignment
cls = cudaError._value_class_mapping.get(value, cls)
@ -291,6 +290,7 @@ class cudaError(Exception):
return obj
def __str__(self) -> str:
"""Return a string representation of the error."""
# pylint: disable=no-member
try:
if self.value not in cudaError._errcode_to_string:
@ -307,30 +307,32 @@ class cudaError(Exception):
except cudaError:
return f'CUDA Error with code {self.value}.'
__repr__ = __str__
def __eq__(self, other: object) -> bool:
    """Test equality to other object.

    Two errors compare equal when both are :class:`cudaError` instances and carry the same
    ``value``. For any other operand :data:`NotImplemented` is returned so that Python can try
    the reflected comparison.
    """
    if isinstance(other, cudaError):
        return self.value == other.value  # pylint: disable=no-member
    return NotImplemented
def __reduce__(self) -> _Tuple[_Type['cudaError'], _Tuple[int]]:
    """Return state information for pickling.

    Returns:
        A ``(callable, args)`` pair: the :class:`cudaError` base class and a one-element tuple
        holding the error code stored in ``self.value``.
    """
    return cudaError, (self.value,)  # pylint: disable=no-member
def cudaExceptionClass(cudaErrorCode: int) -> _Type[cudaError]:
    """Map value to a proper subclass of :class:`cudaError`.

    Args:
        cudaErrorCode (int):
            The error code to look up in ``cudaError._value_class_mapping``.

    Returns:
        The class registered for the given error code.

    Raises:
        ValueError: If the error code is not valid.
    """
    mapping = cudaError._value_class_mapping  # pylint: disable=protected-access
    if cudaErrorCode not in mapping:
        raise ValueError(f'cudaErrorCode {cudaErrorCode} is not valid.')
    return mapping[cudaErrorCode]
def _extract_cuda_errors_as_classes() -> None:
"""Generates a hierarchy of classes on top of :class:`cudaError` class.
"""Generate a hierarchy of classes on top of :class:`cudaError` class.
Each CUDA Error gets a new :class:`cudaError` subclass. This way try-except blocks can filter
appropriate exceptions more easily.
@ -338,7 +340,6 @@ def _extract_cuda_errors_as_classes() -> None:
:class:`cudaError` is a parent class. Each ``cudaError*`` gets it's own subclass.
e.g. :data:`cudaErrorInvalidValue` will be turned into :class:`cudaError_InvalidValue`.
"""
this_module = _sys.modules[__name__]
cuda_error_names = [
x
@ -393,8 +394,7 @@ __cudaGetFunctionPointer_cache = {}
def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr:
"""
Get the function pointer from the CUDA Runtime library.
"""Get the function pointer from the CUDA Runtime library.
Raises:
cudaError_InitializationError:
@ -402,7 +402,6 @@ def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr:
cudaError_SymbolNotFound:
If cannot found the function pointer.
"""
if name in __cudaGetFunctionPointer_cache:
return __cudaGetFunctionPointer_cache[name]
@ -418,14 +417,12 @@ def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr:
def __LoadCudaLibrary() -> None: # pylint: disable=too-many-branches
"""
Load the library if it isn't loaded already.
"""Load the library if it isn't loaded already.
Raises:
cudaError_InitializationError:
If cannot found the CUDA Runtime library.
"""
global __cudaLib # pylint: disable=global-statement
if __cudaLib is None:
@ -498,7 +495,7 @@ def __LoadCudaLibrary() -> None: # pylint: disable=too-many-branches
def cudaGetErrorName(error: int) -> str:
"""Returns the string representation of an error code enum name.
"""Get the string representation of an error code enum name.
Returns: str
A string containing the name of an error code in the enum. If the error code is not
@ -508,7 +505,6 @@ def cudaGetErrorName(error: int) -> str:
cudaError_InitializationError:
If cannot found the CUDA Runtime library.
"""
fn = __cudaGetFunctionPointer('cudaGetErrorName')
fn.restype = _ctypes.c_char_p # otherwise return is an int
@ -518,7 +514,7 @@ def cudaGetErrorName(error: int) -> str:
def cuGetErrorString(error: int) -> str:
"""Returns the description string for an error code.
"""Get the description string for an error code.
Returns: str
The description string for an error code. If the error code is not recognized, "unrecognized
@ -528,7 +524,6 @@ def cuGetErrorString(error: int) -> str:
cudaError_InitializationError:
If cannot found the CUDA Runtime library.
"""
fn = __cudaGetFunctionPointer('cudaGetErrorString')
fn.restype = _ctypes.c_char_p # otherwise return is an int
@ -538,7 +533,7 @@ def cuGetErrorString(error: int) -> str:
def cudaGetLastError() -> int:
"""Returns the last error from a runtime call.
"""Get the last error from a runtime call.
Returns: int
The last error that has been produced by any of the runtime calls in the same instance of
@ -552,13 +547,12 @@ def cudaGetLastError() -> int:
cudaError_NoDevice:
If no CUDA-capable devices were detected by the installed CUDA driver.
"""
fn = __cudaGetFunctionPointer('cudaGetLastError')
return fn()
def cudaPeekAtLastError() -> int:
"""Returns the last error from a runtime call.
"""Get the last error from a runtime call.
Returns: int
The last error that has been produced by any of the runtime calls in the same instance of
@ -573,13 +567,12 @@ def cudaPeekAtLastError() -> int:
cudaError_NoDevice:
If no CUDA-capable devices were detected by the installed CUDA driver.
"""
fn = __cudaGetFunctionPointer('cudaPeekAtLastError')
return fn()
def cudaDriverGetVersion() -> str:
"""Returns the latest CUDA version supported by driver.
"""Get the latest CUDA version supported by driver.
Returns: str
The latest version of CUDA supported by the driver of the form :data:`'<major>.<minor>'`.
@ -592,7 +585,6 @@ def cudaDriverGetVersion() -> str:
cudaError_NoDevice:
If no CUDA-capable devices were detected by the installed CUDA driver.
"""
fn = __cudaGetFunctionPointer('cudaDriverGetVersion')
driver_version = _ctypes.c_int()
@ -604,7 +596,7 @@ def cudaDriverGetVersion() -> str:
def cudaRuntimeGetVersion() -> str:
"""Returns the CUDA Runtime version.
"""Get the CUDA Runtime version.
Returns: str
The version number of the current CUDA Runtime instance of the form :data:`'<major>.<minor>'`.
@ -617,7 +609,6 @@ def cudaRuntimeGetVersion() -> str:
cudaError_NoDevice:
If no CUDA-capable devices were detected by the installed CUDA driver.
"""
fn = __cudaGetFunctionPointer('cudaRuntimeGetVersion')
runtime_version = _ctypes.c_int()
@ -629,7 +620,7 @@ def cudaRuntimeGetVersion() -> str:
def cudaGetDeviceCount() -> int:
"""Returns the number of compute-capable devices.
"""Get the number of compute-capable devices.
Returns: int
The number of devices with compute capability greater or equal to 2.0 that are available for
@ -643,7 +634,6 @@ def cudaGetDeviceCount() -> int:
cudaError_NoDevice:
If no CUDA-capable devices were detected by the installed CUDA driver.
"""
fn = __cudaGetFunctionPointer('cudaGetDeviceCount')
count = _ctypes.c_int(0)
@ -653,10 +643,10 @@ def cudaGetDeviceCount() -> int:
def cudaDeviceGetByPCIBusId(pciBusId: str) -> int:
"""Returns a handle to a compute device.
"""Get a handle to a compute device.
Args:
pciBusId: str
pciBusId (str):
String in one of the following forms: ``[domain]:[bus]:[device].[function]``,
``[domain]:[bus]:[device]``, ``[bus]:[device].[function]`` where ``domain``, ``bus``,
``device``, and ``function`` are all hexadecimal values.
@ -676,7 +666,6 @@ def cudaDeviceGetByPCIBusId(pciBusId: str) -> int:
cudaError_InvalidDevice:
If the device ordinal supplied by the user does not correspond to a valid CUDA device.
"""
fn = __cudaGetFunctionPointer('cudaDeviceGetByPCIBusId')
device = _ctypes.c_int()
@ -686,7 +675,7 @@ def cudaDeviceGetByPCIBusId(pciBusId: str) -> int:
def cudaDeviceGetPCIBusId(device: int) -> str:
"""Returns a PCI Bus Id string for the device.
"""Get a PCI Bus Id string for the device.
Returns: str
An ASCII string identifying the device.
@ -703,7 +692,6 @@ def cudaDeviceGetPCIBusId(device: int) -> str:
cudaError_InvalidDevice:
If the device ordinal supplied by the user does not correspond to a valid CUDA device.
"""
fn = __cudaGetFunctionPointer('cudaDeviceGetPCIBusId')
pciBusId = _ctypes.create_string_buffer(256)
@ -713,8 +701,7 @@ def cudaDeviceGetPCIBusId(device: int) -> str:
def is_available() -> bool:
"""Whether there are any CUDA visible devices."""
"""Test whether there are any CUDA visible devices."""
try:
return cudaGetDeviceCount() > 0
except cudaError:

View file

@ -67,9 +67,9 @@ if not callable(getattr(_pynvml, 'nvmlInitWithFlags', None)):
NVMLError = _pynvml.NVMLError
NVMLError.__doc__ = """Base exception class for NVML query errors."""
NVMLError.__new__.__doc__ = """Maps value to a proper subclass of :class:`NVMLError`."""
NVMLError.__new__.__doc__ = """Map value to a proper subclass of :class:`NVMLError`."""
nvmlExceptionClass = _pynvml.nvmlExceptionClass
nvmlExceptionClass.__doc__ = """Maps value to a proper subclass of :class:`NVMLError`."""
nvmlExceptionClass.__doc__ = """Map value to a proper subclass of :class:`NVMLError`."""
# Load members from module `pynvml` and register them in `__all__` and globals.
_vars_pynvml = vars(_pynvml)
@ -143,7 +143,7 @@ Functions and Exceptions
.. function:: __exit__(*args, **kwargs) -> None
Shutdowns the NVML context in the context manager for ``with`` statement.
Shutdown the NVML context in the context manager for ``with`` statement.
""".format('\n\n'.join(_data_docs)) # fmt: skip
@ -203,7 +203,7 @@ VERSIONED_PATTERN = _re.compile(r'^(?P<name>\w+)(?P<suffix>_v(\d)+)$')
def _lazy_init() -> None:
"""Lazily initializes the NVML context.
"""Lazily initialize the NVML context.
Raises:
NVMLError_LibraryNotFound:
@ -217,7 +217,6 @@ def _lazy_init() -> None:
If cannot find function :func:`pynvml.nvmlInitWithFlags`, usually the :mod:`pynvml` module
is overridden by other modules. Need to reinstall package ``nvidia-ml-py``.
"""
with __lock:
if __initialized:
return
@ -225,7 +224,7 @@ def _lazy_init() -> None:
def nvmlInit() -> None: # pylint: disable=function-redefined
"""Initializes the NVML context with default flag (0).
"""Initialize the NVML context with default flag (0).
Raises:
NVMLError_LibraryNotFound:
@ -239,12 +238,11 @@ def nvmlInit() -> None: # pylint: disable=function-redefined
If cannot find function :func:`pynvml.nvmlInitWithFlags`, usually the :mod:`pynvml` module
is overridden by other modules. Need to reinstall package ``nvidia-ml-py``.
"""
nvmlInitWithFlags(0)
def nvmlInitWithFlags(flags: int) -> None: # pylint: disable=function-redefined
"""Initializes the NVML context with the given flags.
"""Initialize the NVML context with the given flags.
Raises:
NVMLError_LibraryNotFound:
@ -258,7 +256,6 @@ def nvmlInitWithFlags(flags: int) -> None: # pylint: disable=function-redefined
If cannot find function :func:`pynvml.nvmlInitWithFlags`, usually the :mod:`pynvml` module
is overridden by other modules. Need to reinstall package ``nvidia-ml-py``.
"""
global __flags, __initialized # pylint: disable=global-statement,global-variable-not-assigned
with __lock:
@ -312,7 +309,7 @@ def nvmlInitWithFlags(flags: int) -> None: # pylint: disable=function-redefined
def nvmlShutdown() -> None: # pylint: disable=function-redefined
"""Shutdowns the NVML context.
"""Shutdown the NVML context.
Raises:
NVMLError_LibraryNotFound:
@ -325,7 +322,6 @@ def nvmlShutdown() -> None: # pylint: disable=function-redefined
NVMLError_Uninitialized:
If NVML was not first initialized with :func:`nvmlInit`.
"""
global __flags, __initialized # pylint: disable=global-statement,global-variable-not-assigned
_pynvml.nvmlShutdown()
@ -345,8 +341,9 @@ def nvmlQuery(
ignore_function_not_found: bool = False,
**kwargs,
) -> _Any:
"""Calls a function with the given arguments from NVML. The NVML context will be automatically
initialized.
"""Call a function with the given arguments from NVML.
The NVML context will be automatically initialized.
Args:
func (Union[Callable[..., Any], str]):
@ -380,7 +377,6 @@ def nvmlQuery(
NVMLError_InvalidArgument:
If passed with an invalid argument.
"""
global UNKNOWN_FUNCTIONS # pylint: disable=global-statement,global-variable-not-assigned
_lazy_init()
@ -429,8 +425,7 @@ def nvmlQuery(
def nvmlCheckReturn(
    retval: _Any, types: _Optional[_Union[_Type, _Tuple[_Type, ...]]] = None
) -> bool:
    """Check whether the return value is not :const:`nvitop.NA` and is one of the given types.

    Args:
        retval (Any):
            The value to validate.
        types (Optional[Union[Type, Tuple[Type, ...]]]):
            A type or a tuple of types accepted by :func:`isinstance`. If :data:`None`, only the
            comparison against ``NA`` is performed.

    Returns:
        The result of ``retval != NA``, additionally requiring ``isinstance(retval, types)``
        when ``types`` is given.
    """
    if types is None:
        return retval != NA
    return retval != NA and isinstance(retval, types)
@ -474,8 +469,6 @@ def __patch_backward_compatibility_layers() -> None:
)
def patch_function_pointers_when_fail(names, callback):
"""Patches the function pointers of the NVML library."""
def wrapper(nvmlGetFunctionPointer):
@_functools.wraps(nvmlGetFunctionPointer)
def wrapped(name):
@ -586,7 +579,7 @@ _driver_get_memory_info_v2_available = None if not _pynvml_installation_corrupte
def nvmlDeviceGetMemoryInfo(handle): # pylint: disable=function-redefined,too-many-branches
"""Retrieves the amount of used, free, reserved and total memory available on the device, in bytes.
"""Retrieve the amount of used, free, reserved and total memory available on the device, in bytes.
Note:
- The version 2 API adds additional memory information. The reserved amount is supported on
@ -607,7 +600,6 @@ def nvmlDeviceGetMemoryInfo(handle): # pylint: disable=function-redefined,too-m
NVMLError_Unknown:
On any unexpected error.
"""
global _pynvml_get_memory_info_v2_available, _driver_get_memory_info_v2_available # pylint: disable=global-statement
_lazy_init()
@ -702,8 +694,7 @@ class _CustomModule(_ModuleType):
"""
def __getattribute__(self, name: str) -> _Union[_Any, _Callable[..., _Any]]:
"""Gets a member from the current module. Fallback to the original package if missing."""
"""Get a member from the current module. Fallback to the original package if missing."""
try:
return super().__getattribute__(name)
except AttributeError:
@ -711,18 +702,15 @@ class _CustomModule(_ModuleType):
def __enter__(self) -> '_CustomModule':
    """Entry of the context manager for ``with`` statement.

    Calls :func:`_lazy_init` and then returns the module object itself, so it can be bound by
    ``with ... as``.
    """
    _lazy_init()
    return self
def __exit__(self, *args, **kwargs) -> None:
    """Shutdown the NVML context in the context manager for ``with`` statement.

    All arguments are ignored; the work is delegated to :meth:`__del__`. Nothing is returned, so
    an exception raised inside the ``with`` block is not suppressed.
    """
    self.__del__()
def __del__(self) -> None:
"""Automatically shutdowns the NVML context on destruction."""
"""Automatically shutdown the NVML context on destruction."""
try:
nvmlShutdown()
except NVMLError:

View file

@ -49,8 +49,7 @@ __all__ = ['HostProcess', 'GpuProcess', 'command_join']
if host.POSIX:
def add_quotes(s: str) -> str:
"""Returns a shell-escaped version of the string."""
"""Return a shell-escaped version of the string."""
if s == '':
return '""'
if '$' not in s and '\\' not in s and '\n' not in s:
@ -67,8 +66,7 @@ if host.POSIX:
elif host.WINDOWS:
def add_quotes(s: str) -> str:
"""Returns a shell-escaped version of the string."""
"""Return a shell-escaped version of the string."""
if s == '':
return '""'
if '%' not in s and '^' not in s and '\n' not in s:
@ -83,14 +81,12 @@ elif host.WINDOWS:
else:
def add_quotes(s: str) -> str:
    """Return a shell-escaped version of the string.

    Newline characters are replaced with the two-character sequence ``\\n`` and the result is
    wrapped in double quotes.
    """
    escaped = s.replace('\n', r'\n')
    return f'"{escaped}"'
def command_join(cmdline: List[str]) -> str:
"""Returns a shell-escaped string from command line arguments."""
"""Return a shell-escaped string from command line arguments."""
if len(cmdline) == 1 and not (
# May be modified by `setproctitle`
os.path.isfile(cmdline[0])
@ -105,10 +101,10 @@ _USE_FALLBACK_WHEN_RAISE = threading.local() # see also `GpuProcess.failsafe`
def auto_garbage_clean(fallback=_RAISE):
"""Removes the object references in the instance cache if the method call fails (the process is gone).
"""Remove the object references in the instance cache if the method call fails (the process is gone).
The fallback value will be used with `:meth:`GpuProcess.failsafe`` context manager, otherwise raises an
exception when falls.
The fallback value will be used with `:meth:`GpuProcess.failsafe`` context manager, otherwise
raises an exception when falls.
"""
def wrapper(func: Callable[..., Any]) -> Callable[..., Any]:
@ -143,12 +139,12 @@ def auto_garbage_clean(fallback=_RAISE):
class HostProcess(host.Process, metaclass=ABCMeta):
"""Represents an OS process with the given PID.
If PID is omitted current process PID (:func:`os.getpid`) is used.
The instance will be cache during the lifetime of the process.
"""Represent an OS process with the given PID.
If PID is omitted current process PID (:func:`os.getpid`) is used. The instance will be cache
during the lifetime of the process.
Examples:
>>> HostProcess() # the current process
HostProcess(pid=12345, name='python3', status='running', started='00:55:43')
@ -186,8 +182,7 @@ class HostProcess(host.Process, metaclass=ABCMeta):
INSTANCES = WeakValueDictionary()
def __new__(cls, pid: Optional[int] = None) -> 'HostProcess':
"""Returns the cached instance of :class:`HostProcess`."""
"""Return the cached instance of :class:`HostProcess`."""
if pid is None:
pid = os.getpid()
@ -215,7 +210,7 @@ class HostProcess(host.Process, metaclass=ABCMeta):
# pylint: disable-next=unused-argument,super-init-not-called
def __init__(self, pid: Optional[int] = None) -> None:
    """Initialize the instance.

    This method is intentionally empty: ``pid`` is accepted but unused and the parent
    initializer is not called.
    NOTE(review): presumably all setup is done in :meth:`__new__` -- confirm.
    """
@property
def _gone(self) -> bool:
@ -232,17 +227,20 @@ class HostProcess(host.Process, metaclass=ABCMeta):
self._super_gone = value
def __str__(self) -> str:
    """Return a string representation of the process.

    The text produced by the parent class is reused, with the first occurrence of this class's
    module name (followed by a dot) removed.
    """
    text = super().__str__()
    return text.replace(self.__class__.__module__ + '.', '', 1)
__repr__ = __str__
def __reduce__(self) -> Tuple[Type['HostProcess'], Tuple[int]]:
    """Return state information for pickling.

    Returns:
        A ``(callable, args)`` pair: the instance's own class and a one-element tuple holding
        its PID.
    """
    return self.__class__, (self.pid,)
if host.WINDOWS:
def username(self) -> str:
"""The name of the user that owns the process.
On Windows, the domain name will be removed if it is present.
Raises:
@ -251,7 +249,6 @@ class HostProcess(host.Process, metaclass=ABCMeta):
host.AccessDenied:
If the user do not have read privilege to the process' status file.
"""
if self._username is None: # pylint: disable=access-member-before-definition
self._username = ( # pylint: disable=attribute-defined-outside-init
super().username().split('\\')[-1]
@ -262,6 +259,7 @@ class HostProcess(host.Process, metaclass=ABCMeta):
def username(self) -> str:
"""The name of the user that owns the process.
On UNIX this is calculated by using *real* process uid.
Raises:
@ -270,7 +268,6 @@ class HostProcess(host.Process, metaclass=ABCMeta):
host.AccessDenied:
If the user do not have read privilege to the process' status file.
"""
if self._username is None: # pylint: disable=access-member-before-definition
self._username = ( # pylint: disable=attribute-defined-outside-init
super().username()
@ -287,14 +284,13 @@ class HostProcess(host.Process, metaclass=ABCMeta):
host.AccessDenied:
If the user do not have read privilege to the process' status file.
"""
cmdline = super().cmdline()
if len(cmdline) > 1:
cmdline = '\0'.join(cmdline).rstrip('\0').split('\0')
return cmdline
def command(self) -> str:
"""Returns a shell-escaped string from command line arguments.
"""Return a shell-escaped string from command line arguments.
Raises:
host.NoSuchProcess:
@ -302,7 +298,6 @@ class HostProcess(host.Process, metaclass=ABCMeta):
host.AccessDenied:
If the user do not have read privilege to the process' status file.
"""
return command_join(self.cmdline())
@memoize_when_activated
@ -315,7 +310,6 @@ class HostProcess(host.Process, metaclass=ABCMeta):
host.AccessDenied:
If the user do not have read privilege to the process' status file.
"""
return datetime.datetime.now() - datetime.datetime.fromtimestamp(self.create_time())
def running_time_human(self) -> str:
@ -327,7 +321,6 @@ class HostProcess(host.Process, metaclass=ABCMeta):
host.AccessDenied:
If the user do not have read privilege to the process' status file.
"""
return timedelta2human(self.running_time())
def running_time_in_seconds(self) -> float: # in seconds
@ -339,7 +332,6 @@ class HostProcess(host.Process, metaclass=ABCMeta):
host.AccessDenied:
If the user do not have read privilege to the process' status file.
"""
return self.running_time().total_seconds()
elapsed_time = running_time
@ -355,11 +347,10 @@ class HostProcess(host.Process, metaclass=ABCMeta):
host.AccessDenied:
If the user do not have read privilege to the process' status file.
"""
return self.memory_info().rss
def parent(self) -> Union['HostProcess', None]:
"""Returns the parent process as a :class:`HostProcess` instance. Returns :data:`None` if there is no parent.
"""Return the parent process as a :class:`HostProcess` instance or :data:`None` if there is no parent.
Raises:
host.NoSuchProcess:
@ -367,7 +358,6 @@ class HostProcess(host.Process, metaclass=ABCMeta):
host.AccessDenied:
If the user do not have read privilege to the process' status file.
"""
parent = super().parent()
if parent is not None:
return HostProcess(parent.pid)
@ -375,6 +365,7 @@ class HostProcess(host.Process, metaclass=ABCMeta):
def children(self, recursive: bool = False) -> List['HostProcess']:
"""Return the children of this process as a list of :class:`HostProcess` instances.
If *recursive* is :data:`True` return all the descendants.
Raises:
@ -383,13 +374,11 @@ class HostProcess(host.Process, metaclass=ABCMeta):
host.AccessDenied:
If the user do not have read privilege to the process' status file.
"""
return [HostProcess(child.pid) for child in super().children(recursive)]
@contextlib.contextmanager
def oneshot(self):
"""Utility context manager which considerably speeds up the retrieval of multiple process
information at the same time.
"""A utility context manager which considerably speeds up the retrieval of multiple process information at the same time.
Internally different process info (e.g. name, ppid, uids, gids, ...) may be fetched by using
the same routine, but only one information is returned and the others are discarded. When
@ -400,7 +389,6 @@ class HostProcess(host.Process, metaclass=ABCMeta):
time you retrieve more than one information about the process.
Examples:
>>> from nvitop import HostProcess
>>> p = HostProcess()
>>> with p.oneshot():
@ -408,8 +396,7 @@ class HostProcess(host.Process, metaclass=ABCMeta):
... p.cpu_times() # return cached value
... p.cpu_percent() # return cached value
... p.create_time() # return cached value
"""
""" # pylint: disable=line-too-long
with self._lock:
if hasattr(self, '_cache'):
yield
@ -427,8 +414,7 @@ class HostProcess(host.Process, metaclass=ABCMeta):
def as_snapshot(
self, attrs: Optional[Iterable[str]] = None, ad_value: Optional[Any] = None
) -> Snapshot:
"""Returns a onetime snapshot of the process."""
"""Return a onetime snapshot of the process."""
with self.oneshot():
attributes = self.as_dict(attrs=attrs, ad_value=ad_value)
@ -444,11 +430,12 @@ class HostProcess(host.Process, metaclass=ABCMeta):
@HostProcess.register
class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-public-methods
"""Represents a process with the given PID running on the given GPU device.
"""Represent a process with the given PID running on the given GPU device.
The instance will be cache during the lifetime of the process.
The same host process can use multiple GPU devices. The :class:`GpuProcess` instances representing the
same PID on the host but different GPU devices are different.
The same host process can use multiple GPU devices. The :class:`GpuProcess` instances
representing the same PID on the host but different GPU devices are different.
"""
INSTANCE_LOCK = threading.RLock()
@ -466,8 +453,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
type: Optional[Union[str, NaType]] = None, # pylint: disable=redefined-builtin
# pylint: enable=unused-argument
) -> 'GpuProcess':
"""Returns the cached instance of :class:`GpuProcess`."""
"""Return the cached instance of :class:`GpuProcess`."""
if pid is None:
pid = os.getpid()
@ -503,8 +489,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
compute_instance_id: Optional[Union[int, NaType]] = None,
type: Optional[Union[str, NaType]] = None, # pylint: disable=redefined-builtin
) -> None:
"""Initializes the instance returned by :meth:`__new__()`."""
"""Initialize the instance returned by :meth:`__new__()`."""
if gpu_memory is None and not hasattr(self, '_gpu_memory'):
gpu_memory = NA
if gpu_memory is not None:
@ -531,6 +516,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
setattr(self, f'_gpu_{util}_utilization', NA)
def __str__(self) -> str:
"""Return a string representation of the GPU process."""
return '{}(pid={}, gpu_memory={}, type={}, device={}, host={})'.format(
self.__class__.__name__,
self.pid,
@ -543,20 +529,19 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
__repr__ = __str__
def __eq__(self, other: object) -> bool:
    """Test equality to other object.

    Instances of :class:`GpuProcess` or ``host.Process`` are compared by their ``_ident``
    attribute. For any other operand :data:`NotImplemented` is returned so that Python can try
    the reflected comparison.
    """
    if isinstance(other, (GpuProcess, host.Process)):
        return self._ident == other._ident
    return NotImplemented
def __ne__(self, other: object) -> bool:
    """Test inequality to other object, as the negation of the ``==`` comparison."""
    return not self == other
def __hash__(self) -> int:
    """Return a hash value of the GPU process.

    The hash of ``self._ident`` is computed the first time it is needed (while ``self._hash`` is
    still :data:`None`) and stored in ``self._hash`` for later calls.
    """
    if self._hash is None:  # pylint: disable=access-member-before-definition
        self._hash = hash(self._ident)  # pylint: disable=attribute-defined-outside-init
    return self._hash
def __getattr__(self, name: str) -> Union[Any, Callable[..., Any]]:
"""Gets a member from the instance. Fallback to the host process instance if missing.
"""Get a member from the instance or fallback to the host process instance if missing.
Raises:
AttributeError:
@ -566,7 +551,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
host.AccessDenied:
If the user do not have read privilege to the process' status file.
"""
try:
return super().__getattr__(name)
except AttributeError:
@ -582,74 +566,60 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
@property
def pid(self) -> int:
    """The process PID, as stored in ``self._pid``."""
    return self._pid
@property
def host(self) -> HostProcess:
    """The process instance running on the host, as stored in ``self._host``."""
    return self._host
@property
def device(self) -> 'Device':
    """The GPU device the process is running on.

    The same host process can use multiple GPU devices. The :class:`GpuProcess` instances
    representing the same PID on the host but different GPU devices are different.
    """
    return self._device
def gpu_instance_id(self) -> Union[int, NaType]:
    """The GPU instance ID of the MIG device, or :const:`nvitop.NA` if not applicable.

    Returns the value stored in ``self._gpu_instance_id``.
    """
    return self._gpu_instance_id
def compute_instance_id(self) -> Union[int, NaType]:
"""The compute instance ID of the MIG device, or :const:`nvitop.NA` if not applicable."""
return self._compute_instance_id
def gpu_memory(self) -> Union[int, NaType]: # in bytes
"""The used GPU memory in bytes, or :const:`nvitop.NA` if not applicable."""
return self._gpu_memory
def gpu_memory_human(self) -> Union[str, NaType]: # in human readable
"""The used GPU memory in human readable format, or :const:`nvitop.NA` if not applicable."""
return self._gpu_memory_human
def gpu_memory_percent(self) -> Union[float, NaType]: # in percentage
"""The percentage of used GPU memory by the process, or :const:`nvitop.NA` if not applicable."""
return self._gpu_memory_percent
def gpu_sm_utilization(self) -> Union[int, NaType]: # in percentage
"""The utilization rate of SM (Streaming Multiprocessor), or :const:`nvitop.NA` if not applicable."""
return self._gpu_sm_utilization
def gpu_memory_utilization(self) -> Union[int, NaType]: # in percentage
"""The utilization rate of GPU memory bandwidth, or :const:`nvitop.NA` if not applicable."""
return self._gpu_memory_utilization
def gpu_encoder_utilization(self) -> Union[int, NaType]: # in percentage
"""The utilization rate of the encoder, or :const:`nvitop.NA` if not applicable."""
return self._gpu_encoder_utilization
def gpu_decoder_utilization(self) -> Union[int, NaType]: # in percentage
"""The utilization rate of the decoder, or :const:`nvitop.NA` if not applicable."""
return self._gpu_decoder_utilization
def set_gpu_memory(self, value: Union[int, NaType]) -> None:
"""Sets the used GPU memory in bytes."""
"""Set the used GPU memory in bytes."""
# pylint: disable=attribute-defined-outside-init
self._gpu_memory = memory_used = value
self._gpu_memory_human = bytes2human(self.gpu_memory())
@ -666,8 +636,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
gpu_encoder_utilization: Optional[int] = None,
gpu_decoder_utilization: Optional[int] = None,
) -> None:
"""Sets the GPU utilization rates."""
"""Set the GPU utilization rates."""
# pylint: disable=attribute-defined-outside-init
if gpu_sm_utilization is not None:
self._gpu_sm_utilization = gpu_sm_utilization
@ -679,8 +648,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
self._gpu_decoder_utilization = gpu_decoder_utilization
def update_gpu_status(self) -> Union[int, NaType]:
"""Updates the GPU consumption status from a new NVML query."""
"""Update the GPU consumption status from a new NVML query."""
self.set_gpu_memory(NA)
self.set_gpu_utilization(NA, NA, NA, NA)
self.device.processes.cache_clear()
@ -697,7 +665,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
- :data:`'C+G'`: both compute context and graphics context
- :data:`'N/A'`: not applicable
"""
return self._type
@type.setter
@ -713,8 +680,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
@auto_garbage_clean(fallback=False)
def is_running(self) -> bool:
"""Returns whether this process is running."""
"""Return whether this process is running."""
return self.host.is_running()
@auto_garbage_clean(fallback='terminated')
@ -731,7 +697,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
To return the fallback value rather than raise an exception, please use the context
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
"""
return self.host.status()
@auto_garbage_clean(fallback=NA)
@ -748,7 +713,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
To return the fallback value rather than raise an exception, please use the context
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
"""
return self.host.create_time()
@auto_garbage_clean(fallback=NA)
@ -765,7 +729,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
To return the fallback value rather than raise an exception, please use the context
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
"""
return self.host.running_time()
def running_time_human(self) -> Union[str, NaType]:
@ -781,7 +744,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
To return the fallback value rather than raise an exception, please use the context
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
"""
return timedelta2human(self.running_time())
def running_time_in_seconds(self) -> Union[float, NaType]:
@ -797,7 +759,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
To return the fallback value rather than raise an exception, please use the context
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
"""
running_time = self.running_time()
if running_time is NA:
return NA
@ -821,7 +782,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
To return the fallback value rather than raise an exception, please use the context
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
"""
if self._username is None: # pylint: disable=access-member-before-definition
self._username = self.host.username() # pylint: disable=attribute-defined-outside-init
return self._username
@ -840,12 +800,11 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
To return the fallback value rather than raise an exception, please use the context
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
"""
return self.host.name()
@auto_garbage_clean(fallback=NA)
def cpu_percent(self) -> Union[float, NaType]: # in percentage
"""Returns a float representing the current process CPU utilization as a percentage.
"""Return a float representing the current process CPU utilization as a percentage.
Raises:
host.NoSuchProcess:
@ -857,13 +816,11 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
To return the fallback value rather than raise an exception, please use the context
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
"""
return self.host.cpu_percent()
@auto_garbage_clean(fallback=NA)
def memory_percent(self) -> Union[float, NaType]: # in percentage
"""Compares process RSS memory to total physical system memory
and calculate process memory utilization as a percentage.
"""Compare process RSS memory to total physical system memory and calculate process memory utilization as a percentage.
Raises:
host.NoSuchProcess:
@ -874,8 +831,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
Note:
To return the fallback value rather than raise an exception, please use the context
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
"""
""" # pylint: disable=line-too-long
return self.host.memory_percent()
host_memory_percent = memory_percent # in percentage
@ -894,7 +850,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
To return the fallback value rather than raise an exception, please use the context
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
"""
return self.host.rss_memory()
def host_memory_human(self) -> Union[str, NaType]:
@ -910,7 +865,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
To return the fallback value rather than raise an exception, please use the context
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
"""
return bytes2human(self.host_memory())
rss_memory = host_memory # in bytes
@ -930,14 +884,13 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
To return the fallback value rather than raise an exception, please use the context
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
"""
cmdline = self.host.cmdline()
if len(cmdline) == 0 and not self._gone:
cmdline = ['Zombie Process']
return cmdline
def command(self) -> str:
"""Returns a shell-escaped string from command line arguments.
"""Return a shell-escaped string from command line arguments.
Raises:
host.NoSuchProcess:
@ -949,13 +902,11 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
To return the fallback value rather than raise an exception, please use the context
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
"""
return command_join(self.cmdline())
@auto_garbage_clean(fallback=_RAISE)
def host_snapshot(self) -> Snapshot:
"""Returns a onetime snapshot of the host process."""
"""Return a onetime snapshot of the host process."""
with self.host.oneshot():
host_snapshot = Snapshot(
real=self.host,
@ -980,7 +931,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
def as_snapshot(
self, *, host_process_snapshot_cache: Optional[Dict[int, Snapshot]] = None
) -> Snapshot:
"""Returns a onetime snapshot of the process on the GPU device.
"""Return a onetime snapshot of the process on the GPU device.
Note:
To return the fallback value rather than raise an exception, please use the context
@ -988,7 +939,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
snapshots with :meth:`GpuProcess.take_snapshots`, which caches the results and reduces
redundant queries. See also :meth:`take_snapshots` and :meth:`failsafe`.
"""
host_process_snapshot_cache = host_process_snapshot_cache or {}
try:
host_snapshot = host_process_snapshot_cache[self.pid]
@ -1031,12 +981,11 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
def take_snapshots( # batched version of `as_snapshot`
cls, gpu_processes: Iterable['GpuProcess'], *, failsafe=False
) -> List[Snapshot]:
"""Takes snapshots for a list of :class:`GpuProcess` instances.
"""Take snapshots for a list of :class:`GpuProcess` instances.
If *failsafe* is :data:`True`, then if any method fails, the fallback value in
:func:`auto_garbage_clean` will be used.
"""
cache = {}
context = cls.failsafe if failsafe else contextlib.nullcontext
with context():
@ -1052,7 +1001,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
"""A context manager that enables fallback values for methods that fail.
Examples:
>>> p = GpuProcess(pid=10000, device=Device(0)) # process does not exist
>>> p
GpuProcess(pid=10000, gpu_memory=N/A, type=N/A, device=PhysicalDevice(index=0, name="NVIDIA GeForce RTX 3070", total_memory=8192MiB), host=HostProcess(pid=10000, status='terminated'))
@ -1070,7 +1018,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
fallback (float cast): nan
fallback (int cast): 0
""" # pylint: disable=line-too-long
global _USE_FALLBACK_WHEN_RAISE # pylint: disable=global-statement,global-variable-not-assigned
prev_value = getattr(_USE_FALLBACK_WHEN_RAISE, 'value', False)

View file

@ -76,8 +76,7 @@ COLOR = sys.stdout.isatty()
def set_color(value: bool) -> None:
"""Force enables text coloring."""
"""Force enable text coloring."""
global COLOR # pylint: disable=global-statement
COLOR = bool(value)
@ -88,7 +87,7 @@ def colored(
on_color: Optional[str] = None,
attrs: Iterable[str] = None,
) -> str:
"""Colorizes text.
"""Colorize text with ANSI color escape codes.
Available text colors:
red, green, yellow, blue, magenta, cyan, white.
@ -100,11 +99,9 @@ def colored(
bold, dark, underline, blink, reverse, concealed.
Examples:
>>> colored('Hello, World!', 'red', 'on_grey', ['blue', 'blink'])
>>> colored('Hello, World!', 'green')
"""
if COLOR:
return _colored(text, color=color, on_color=on_color, attrs=attrs)
return text
@ -114,11 +111,10 @@ class NaType(str):
"""A singleton (:const:`str: 'N/A'`) class representing a not applicable value.
The :const:`NA` instance behaves like a :class:`str` instance (:const:`'N/A'`) when doing string
manipulation (e.g. concatenation). For arithmetic operations, for example :code:`NA / 1024 / 1024`,
manipulation (e.g. concatenation). For arithmetic operations, for example ``NA / 1024 / 1024``,
it acts like the :data:`math.nan`.
Examples:
>>> NA
'N/A'
@ -142,45 +138,42 @@ class NaType(str):
"""
def __new__(cls) -> 'NaType':
"""Gets the singleton instance (:const:`nvitop.NA`)."""
"""Get the singleton instance (:const:`nvitop.NA`)."""
if not hasattr(cls, '_instance'):
cls._instance = super().__new__(cls, 'N/A')
return cls._instance
def __bool__(self) -> bool:
"""Converts :const:`NA` to :class:`bool`. Returns :data:`False`.
"""Convert :const:`NA` to :class:`bool` and return :data:`False`.
>>> bool(NA)
False
"""
return False
def __int__(self) -> int:
"""Converts :const:`NA` to :class:`int`. Returns :const:`0`.
"""Convert :const:`NA` to :class:`int` and return :const:`0`.
>>> int(NA)
0
"""
return 0
def __float__(self) -> float:
"""Converts :const:`NA` to :class:`float`. Returns :data:`math.nan`.
"""Convert :const:`NA` to :class:`float` and return :data:`math.nan`.
>>> float(NA)
nan
>>> float(NA) is math.nan
True
"""
return math.nan
def __add__(self, other: object) -> Union[str, float]:
""":const:`nvitop.NA` + other: Returns :data:`math.nan` if the operand is a number or uses
string concatenation if the operand is a string. A special case is when the operand is
:const:`nvitop.NA` itself, the result is :data:`math.nan` instead of :const:`'N/AN/A'`.
"""Return :data:`math.nan` if the operand is a number or uses string concatenation if the operand is a string (``NA + other``).
A special case is when the operand is :const:`nvitop.NA` itself, the result is
:data:`math.nan` instead of :const:`'N/AN/A'`.
>>> NA + ' str'
'N/A str'
@ -190,14 +183,13 @@ class NaType(str):
nan
>>> NA + 1.0
nan
"""
""" # pylint: disable=line-too-long
if isinstance(other, (int, float)) or other is NA:
return float(self) + other
return super().__add__(other)
def __radd__(self, other: object) -> Union[str, float]:
"""other + :const:`nvitop.NA`: Returns :data:`math.nan` if the operand is a number.
"""Return :data:`math.nan` if the operand is a number or uses string concatenation if the operand is a string (``other + NA``).
>>> 'str' + NA
'strN/A'
@ -205,14 +197,13 @@ class NaType(str):
nan
>>> 1.0 + NA
nan
"""
""" # pylint: disable=line-too-long
if isinstance(other, (int, float)):
return other + float(self)
return NotImplemented
def __sub__(self, other: object) -> float:
""":const:`nvitop.NA` - other: Returns :data:`math.nan` if the operand is a number.
"""Return :data:`math.nan` if the operand is a number (``NA - other``).
>>> NA - 'str'
TypeError: unsupported operand type(s) for -: 'NaType' and 'str'
@ -223,13 +214,12 @@ class NaType(str):
>>> NA - 1.0
nan
"""
if isinstance(other, (int, float)) or other is NA:
return float(self) - other
return NotImplemented
def __rsub__(self, other: object) -> float:
"""other - :const:`nvitop.NA`: Returns :data:`math.nan` if the operand is a number.
"""Return :data:`math.nan` if the operand is a number (``other - NA``).
>>> 'str' - NA
TypeError: unsupported operand type(s) for -: 'str' and 'NaType'
@ -238,14 +228,14 @@ class NaType(str):
>>> 1.0 - NA
nan
"""
if isinstance(other, (int, float)):
return other - float(self)
return NotImplemented
def __mul__(self, other: object) -> float:
""":const:`nvitop.NA` * other: Returns :data:`math.nan` if the operand is a number. A special
case is when the operand is :const:`nvitop.NA` itself, the result is also :data:`math.nan`.
"""Return :data:`math.nan` if the operand is a number (``NA * other``).
A special case is when the operand is :const:`nvitop.NA` itself, the result is also :data:`math.nan`.
>>> NA * 1024
nan
@ -254,26 +244,24 @@ class NaType(str):
>>> NA * NA
nan
"""
if isinstance(other, (int, float)) or other is NA:
return float(self) * other
return NotImplemented
def __rmul__(self, other: object) -> float:
"""other * :const:`nvitop.NA`: Returns :data:`math.nan` if the operand is a number.
"""Return :data:`math.nan` if the operand is a number (``other * NA``).
>>> 1024 * NA
nan
>>> 1024.0 * NA
nan
"""
if isinstance(other, (int, float)):
return other * float(self)
return NotImplemented
def __truediv__(self, other: object) -> float:
""":const:`nvitop.NA` / other: Returns :data:`math.nan` if the operand is a number.
"""Return :data:`math.nan` if the operand is a number (``NA / other``).
>>> NA / 1024
nan
@ -284,26 +272,24 @@ class NaType(str):
>>> NA / 0.0
ZeroDivisionError: float division by zero
"""
if isinstance(other, (int, float)):
return float(self) / other
return NotImplemented
def __rtruediv__(self, other: object) -> float:
"""other / :const:`nvitop.NA`: Returns :data:`math.nan` if the operand is a number.
"""Return :data:`math.nan` if the operand is a number (``other / NA``).
>>> 1024 / NA
nan
>>> 1024.0 / NA
nan
"""
if isinstance(other, (int, float)):
return other / float(self)
return NotImplemented
def __floordiv__(self, other: object) -> float:
""":const:`nvitop.NA` // other: Returns :data:`math.nan` if the operand is a number.
"""Return :data:`math.nan` if the operand is a number (``NA // other``).
>>> NA // 1024
nan
@ -314,26 +300,24 @@ class NaType(str):
>>> NA // 0.0
ZeroDivisionError: float floor division by zero
"""
if isinstance(other, (int, float)):
return float(self) // other
return NotImplemented
def __rfloordiv__(self, other: object) -> float:
"""other // :const:`nvitop.NA`: Returns :data:`math.nan` if the operand is a number.
"""Return :data:`math.nan` if the operand is a number (``other // NA``).
>>> 1024 // NA
nan
>>> 1024.0 // NA
nan
"""
if isinstance(other, (int, float)):
return other // float(self)
return NotImplemented
def __mod__(self, other: object) -> float:
""":const:`nvitop.NA` % other: Returns :data:`math.nan` if the operand is a number.
"""Return :data:`math.nan` if the operand is a number (``NA % other``).
>>> NA % 1024
nan
@ -344,26 +328,24 @@ class NaType(str):
>>> NA % 0.0
ZeroDivisionError: float modulo
"""
if isinstance(other, (int, float)):
return float(self) % other
return NotImplemented
def __rmod__(self, other: object) -> float:
"""other % :const:`nvitop.NA`: Returns :data:`math.nan` if the operand is a number.
"""Return :data:`math.nan` if the operand is a number (``other % NA``).
>>> 1024 % NA
nan
>>> 1024.0 % NA
nan
"""
if isinstance(other, (int, float)):
return other % float(self)
return NotImplemented
def __divmod__(self, other: object) -> Tuple[float, float]:
"""divmod(:const:`nvitop.NA`, other): The pair (:const:`nvitop.NA` // other, :const:`nvitop.NA` % other).
"""The pair ``(NA // other, NA % other)`` (``divmod(NA, other)``).
>>> divmod(NA, 1024)
(nan, nan)
@ -374,49 +356,44 @@ class NaType(str):
>>> divmod(NA, 0.0)
ZeroDivisionError: float floor division by zero
"""
return (self // other, self % other)
def __rdivmod__(self, other: object) -> Tuple[float, float]:
"""divmod(other, :const:`nvitop.NA`): The pair (other // :const:`nvitop.NA`, other % :const:`nvitop.NA`).
"""The pair ``(other // NA, other % NA)`` (``divmod(other, NA)``).
>>> divmod(1024, NA)
(nan, nan)
>>> divmod(1024.0, NA)
(nan, nan)
"""
return (other // self, other % self)
def __pos__(self) -> float:
"""+:const:`nvitop.NA`: Returns :data:`math.nan`.
"""Return :data:`math.nan` (``+NA``).
>>> +NA
nan
"""
return +float(self)
def __neg__(self) -> float:
"""+:const:`nvitop.NA`: Returns :data:`math.nan`.
"""Return :data:`math.nan` (``-NA``).
>>> -NA
nan
"""
return -float(self)
def __abs__(self) -> float:
"""abs(NA): Returns :data:`math.nan`.
"""Return :data:`math.nan` (``abs(NA)``).
>>> abs(NA)
nan
"""
return abs(float(self))
def __round__(self, ndigits: Optional[int] = None) -> Union[int, float]:
"""Rounds :const:`nvitop.NA` to ``ndigits`` decimal places, defaulting to :const:`0`.
"""Round :const:`nvitop.NA` to ``ndigits`` decimal places, defaulting to :const:`0`.
If ``ndigits`` is omitted or :data:`None`, returns :const:`0`, otherwise returns :data:`math.nan`.
@ -427,40 +404,36 @@ class NaType(str):
>>> round(NA, 1)
nan
"""
if ndigits is None:
return int(self)
return round(float(self), ndigits)
def __lt__(self, x: object) -> bool:
"""The :const:`nvitop.NA` is always greater than any number. Use the dictionary order for string."""
"""The :const:`nvitop.NA` is always greater than any number, or uses the dictionary order for string."""
if isinstance(x, (int, float)):
return False
return super().__lt__(x)
def __le__(self, x: object) -> bool:
"""The :const:`nvitop.NA` is always greater than any number. Use the dictionary order for string."""
"""The :const:`nvitop.NA` is always greater than any number, or uses the dictionary order for string."""
if isinstance(x, (int, float)):
return False
return super().__le__(x)
def __gt__(self, x: object) -> bool:
"""The :const:`nvitop.NA` is always greater than any number. Use the dictionary order for string."""
"""The :const:`nvitop.NA` is always greater than any number, or uses the dictionary order for string."""
if isinstance(x, (int, float)):
return True
return super().__gt__(x)
def __ge__(self, x: object) -> bool:
"""The :const:`nvitop.NA` is always greater than any number. Use the dictionary order for string."""
"""The :const:`nvitop.NA` is always greater than any number, or uses the dictionary order for string."""
if isinstance(x, (int, float)):
return True
return super().__ge__(x)
def __format__(self, format_spec: str) -> str:
"""Format :const:`nvitop.NA` according to ``format_spec``."""
try:
return super().__format__(format_spec)
except ValueError:
@ -515,8 +488,7 @@ SIZE_PATTERN = re.compile(
def bytes2human(b: Union[int, float, NaType]) -> str: # pylint: disable=too-many-return-statements
"""Converts bytes to a human readable string."""
"""Convert bytes to a human readable string."""
if b == NA:
return NA
@ -546,14 +518,13 @@ def bytes2human(b: Union[int, float, NaType]) -> str: # pylint: disable=too-man
def human2bytes(s: Union[int, str]) -> int:
"""Converts a human readable size string (*case insensitive*) to bytes.
"""Convert a human readable size string (*case insensitive*) to bytes.
Raises:
ValueError:
If the given size string cannot be converted.
Examples:
>>> human2bytes('500B')
500
>>> human2bytes('10k')
@ -567,7 +538,6 @@ def human2bytes(s: Union[int, str]) -> int:
>>> human2bytes('1.5GiB')
1610612736
"""
if isinstance(s, int):
if s >= 0:
return s
@ -582,8 +552,7 @@ def human2bytes(s: Union[int, str]) -> int:
def timedelta2human(dt: Union[int, float, datetime.timedelta, NaType]) -> str:
"""Converts a number in seconds or a :class:`datetime.timedelta` instance to a human readable string."""
"""Convert a number in seconds or a :class:`datetime.timedelta` instance to a human readable string."""
if isinstance(dt, (int, float)):
dt = datetime.timedelta(seconds=dt)
@ -600,8 +569,7 @@ def timedelta2human(dt: Union[int, float, datetime.timedelta, NaType]) -> str:
def utilization2string(utilization: Union[int, float, NaType]) -> str:
"""Converts a utilization rate to string."""
"""Convert a utilization rate to string."""
if utilization != NA:
if isinstance(utilization, int):
return f'{utilization}%'
@ -611,8 +579,7 @@ def utilization2string(utilization: Union[int, float, NaType]) -> str:
def boolify(string: str, default: Any = None) -> bool:
"""Converts the given value, usually a string, to boolean."""
"""Convert the given value, usually a string, to boolean."""
if string.lower() in ('true', 'yes', 'on', 'enabled', '1'):
return True
if string.lower() in ('false', 'no', 'off', 'disabled', '0'):
@ -624,6 +591,7 @@ def boolify(string: str, default: Any = None) -> bool:
class Snapshot:
"""A dict-like object holding the snapshot values.
The value can be accessed by ``snapshot.name`` or ``snapshot['name']`` syntax.
The Snapshot can also be converted to a dictionary by ``dict(snapshot)`` or ``{**snapshot}``.
@ -631,12 +599,14 @@ class Snapshot:
"""
def __init__(self, real: Any, **items) -> None:
"""Initialize a new :class:`Snapshot` object with the given attributes."""
self.real = real
self.timestamp = time.time()
for key, value in items.items():
setattr(self, key, value)
def __str__(self) -> str:
"""Return a string representation of the snapshot."""
keys = set(self.__dict__.keys()).difference({'real', 'timestamp'})
keys = ['real', *sorted(keys)]
keyvals = []
@ -653,13 +623,14 @@ class Snapshot:
__repr__ = __str__
def __hash__(self) -> int:
"""Return a hash value of the snapshot."""
return hash((self.real, self.timestamp))
def __getattr__(self, name: str) -> Any:
"""Gets a member from the instance.
"""Get a member from the instance.
If the attribute is not defined, fetches from the original object and makes a function call.
"""
try:
return super().__getattr__(name)
except AttributeError:
@ -671,20 +642,18 @@ class Snapshot:
return attribute
def __getitem__(self, name: str) -> Any:
"""Supports ``snapshot['name']`` syntax."""
"""Support ``snapshot['name']`` syntax."""
try:
return getattr(self, name)
except AttributeError as ex:
raise KeyError(name) from ex
def __setitem__(self, name: str, value: Any) -> None:
"""Supports ``snapshot['name'] = value`` syntax."""
"""Support ``snapshot['name'] = value`` syntax."""
setattr(self, name, value)
def __iter__(self) -> Iterable[str]:
"""Supports ``for name in snapshot`` syntax and ``*`` tuple unpack ``[*snapshot]`` syntax."""
"""Support ``for name in snapshot`` syntax and ``*`` tuple unpack ``[*snapshot]`` syntax."""
def gen() -> str:
for name in self.__dict__:
@ -694,18 +663,17 @@ class Snapshot:
return gen()
def keys(self) -> Iterable[str]:
"""Supports `**`` dictionary unpack ``{**snapshot}`` / ``dict(**snapshot)`` syntax and
``dict(snapshot)`` dictionary conversion.
"""
# pylint: disable-next=line-too-long
"""Support ``**`` dictionary unpack ``{**snapshot}`` / ``dict(**snapshot)`` syntax and ``dict(snapshot)`` dictionary conversion."""
return iter(self)
# Modified from psutil (https://github.com/giampaolo/psutil)
def memoize_when_activated(method: Callable[[Any], Any]) -> Callable[[Any], Any]:
"""A memoize decorator which is disabled by default. It can be activated and
deactivated on request. For efficiency reasons it can be used only against
class methods accepting no arguments.
"""A memoize decorator which is disabled by default.
It can be activated and deactivated on request. For efficiency reasons it can be used only
against class methods accepting no arguments.
"""
@functools.wraps(method)
@ -729,10 +697,10 @@ def memoize_when_activated(method: Callable[[Any], Any]) -> Callable[[Any], Any]
return ret
def cache_activate(self):
"""Activate cache. Expects a Process instance. Cache will be stored as
a "_cache" instance attribute.
"""
"""Activate cache.
Expects an instance. Cache will be stored as a "_cache" instance attribute.
"""
if not hasattr(self, '_cache'):
setattr(self, '_cache', {})

View file

@ -54,8 +54,6 @@ Python API:
)
""" # pylint: disable=line-too-long
# pylint: disable=missing-function-docstring
import argparse
import getpass
import math
@ -95,15 +93,13 @@ def select_devices(
sort: bool = True,
**kwargs, # fmt: skip # pylint: disable=unused-argument
) -> Union[List[int], List[Tuple[int, int]], List[str]]:
"""Selected a subset of devices satisfying the specified criteria. Returns a list of the device
identifiers.
"""Select a subset of devices satisfying the specified criteria.
Note:
The *min count* constraint may not be satisfied if there are not enough devices available. This
constraint is only enforced when there are both MIG and non-MIG devices present.
Examples:
Put the following lines to the top of your script:
.. code-block:: python
@ -144,8 +140,10 @@ def select_devices(
A list of accounts whose used GPU memory needs to be considered as free memory.
sort (bool):
If :data:`True`, sort the selected devices by memory usage and GPU utilization.
"""
Returns:
A list of the device identifiers.
"""
assert format in ('index', 'uuid', 'device')
assert tolerance >= 0
tolerance = tolerance / 100.0
@ -274,6 +272,8 @@ def select_devices(
def parse_arguments(): # pylint: disable=too-many-branches,too-many-statements
"""Parse command-line arguments for ``nvisel``."""
def non_negint(argstring):
num = int(argstring)
if num < 0:
@ -490,6 +490,7 @@ def parse_arguments(): # pylint: disable=too-many-branches,too-many-statements
def main():
"""Main function for ``nvisel`` CLI."""
args = parse_arguments()
try:

View file

@ -9,7 +9,7 @@
# pip install 'nvitop[pynvml-xx.yyy.zz]'
#
# pylint: disable=missing-module-docstring
"""Setup script for ``nvitop``."""
import pathlib
import re