mirror of
https://github.com/XuehaiPan/nvitop.git
synced 2026-05-21 06:45:24 -06:00
docs(core): rephrase docstrings
Signed-off-by: Xuehai Pan <XuehaiPan@pku.edu.cn>
This commit is contained in:
parent
7aa2f068c1
commit
cb84fa8197
17 changed files with 486 additions and 680 deletions
|
|
@ -132,3 +132,4 @@ lol
|
|||
xx
|
||||
yyy
|
||||
zz
|
||||
CLI
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
|
||||
# License: GNU GPL version 3.
|
||||
|
||||
# pylint: disable=missing-module-docstring
|
||||
"""The interactive NVIDIA-GPU process viewer."""
|
||||
|
||||
import sys
|
||||
|
||||
|
|
|
|||
|
|
@ -32,28 +32,25 @@ from nvitop.core import libnvml
|
|||
|
||||
# Ported version of .pytorch_lightning.GpuStatsLogger for Keras
|
||||
class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
|
||||
r"""
|
||||
Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a
|
||||
callback and in order to use it you need to assign a TensorBoard callback or
|
||||
a CSVLogger callback to the model.
|
||||
"""Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a callback and
|
||||
in order to use it you need to assign a TensorBoard callback or a CSVLogger callback to the model.
|
||||
|
||||
Args:
|
||||
memory_utilization (bool):
|
||||
Set to :data:`True` to log used, free and the percentage of memory
|
||||
utilization at the start and end of each step. Default: :data:`True`.
|
||||
Set to :data:`True` to log used, free and the percentage of memory utilization at the
|
||||
start and end of each step. Default: :data:`True`.
|
||||
gpu_utilization (bool):
|
||||
Set to :data:`True` to log the percentage of GPU utilization
|
||||
at the start and end of each step. Default: :data:`True`.
|
||||
Set to :data:`True` to log the percentage of GPU utilization at the start and end of
|
||||
each step. Default: :data:`True`.
|
||||
intra_step_time (bool):
|
||||
Set to :data:`True` to log the time of each step. Default: :data:`False`.
|
||||
inter_step_time (bool):
|
||||
Set to :data:`True` to log the time between the end of one step
|
||||
and the start of the next step. Default: :data:`False`.
|
||||
Set to :data:`True` to log the time between the end of one step and the start of the
|
||||
next step. Default: :data:`False`.
|
||||
fan_speed (bool):
|
||||
Set to :data:`True` to log percentage of fan speed. Default: :data:`False`.
|
||||
temperature (bool):
|
||||
Set to :data:`True` to log the gpu temperature in degree Celsius.
|
||||
Default: :data:`False`.
|
||||
Set to :data:`True` to log the gpu temperature in degree Celsius. Default: :data:`False`.
|
||||
|
||||
Raises:
|
||||
ValueError:
|
||||
|
|
@ -77,16 +74,19 @@ class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
|
|||
|
||||
GPU stats are mainly based on NVML queries. The description of the queries is as follows:
|
||||
|
||||
- **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan is currently
|
||||
intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the intended fan speed.
|
||||
If the fan is physically blocked and unable to spin, this output will not match the actual fan speed.
|
||||
Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.
|
||||
- **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan is
|
||||
currently intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the
|
||||
intended fan speed. If the fan is physically blocked and unable to spin, this output will not
|
||||
match the actual fan speed. Many parts do not report fan speeds because they rely on cooling
|
||||
via fans in the surrounding enclosure.
|
||||
- **memory.used** - Total memory allocated by active contexts, in MiBs.
|
||||
- **memory.free** - Total free memory, in MiBs.
|
||||
- **utilization.gpu** - Percent of time over the past sample period during which one or more kernels was
|
||||
executing on the GPU. The sample period may be between 1 second and 1/6 second depending on the product.
|
||||
- **utilization.memory** - Percent of time over the past sample period during which global (device) memory was
|
||||
being read or written. The sample period may be between 1 second and 1/6 second depending on the product.
|
||||
- **utilization.gpu** - Percent of time over the past sample period during which one or more
|
||||
kernels was executing on the GPU. The sample period may be between 1 second and 1/6 second
|
||||
depending on the product.
|
||||
- **utilization.memory** - Percent of time over the past sample period during which global
|
||||
(device) memory was being read or written. The sample period may be between 1 second and 1/6
|
||||
second depending on the product.
|
||||
- **temperature** - Core GPU temperature, in degrees C.
|
||||
"""
|
||||
|
||||
|
|
@ -167,8 +167,7 @@ class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
|
|||
)
|
||||
|
||||
def _get_gpu_stats(self) -> Dict[str, float]:
|
||||
"""Get the gpu status from NVML queries"""
|
||||
|
||||
"""Get the gpu status from NVML queries."""
|
||||
return get_gpu_stats(
|
||||
devices=self._devices,
|
||||
memory_utilization=self._memory_utilization,
|
||||
|
|
|
|||
|
|
@ -33,27 +33,25 @@ from nvitop.core import libnvml
|
|||
|
||||
# Modified from pytorch_lightning.callbacks.GPUStatsMonitor
|
||||
class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
|
||||
r"""
|
||||
Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a
|
||||
callback and in order to use it you need to assign a logger in the ``Trainer``.
|
||||
"""Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a callback and
|
||||
in order to use it you need to assign a logger in the ``Trainer``.
|
||||
|
||||
Args:
|
||||
memory_utilization (bool):
|
||||
Set to :data:`True` to log used, free and the percentage of memory
|
||||
utilization at the start and end of each step. Default: :data:`True`.
|
||||
Set to :data:`True` to log used, free and the percentage of memory utilization at the
|
||||
start and end of each step. Default: :data:`True`.
|
||||
gpu_utilization (bool):
|
||||
Set to :data:`True` to log the percentage of GPU utilization
|
||||
at the start and end of each step. Default: :data:`True`.
|
||||
Set to :data:`True` to log the percentage of GPU utilization at the start and end of
|
||||
each step. Default: :data:`True`.
|
||||
intra_step_time (bool):
|
||||
Set to :data:`True` to log the time of each step. Default: :data:`False`.
|
||||
inter_step_time (bool):
|
||||
Set to :data:`True` to log the time between the end of one step
|
||||
and the start of the next step. Default: :data:`False`.
|
||||
Set to :data:`True` to log the time between the end of one step and the start of the
|
||||
next step. Default: :data:`False`.
|
||||
fan_speed (bool):
|
||||
Set to :data:`True` to log percentage of fan speed. Default: :data:`False`.
|
||||
temperature (bool):
|
||||
Set to :data:`True` to log the gpu temperature in degree Celsius.
|
||||
Default: :data:`False`.
|
||||
Set to :data:`True` to log the gpu temperature in degree Celsius. Default: :data:`False`.
|
||||
|
||||
Raises:
|
||||
MisconfigurationException:
|
||||
|
|
@ -68,16 +66,19 @@ class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
|
|||
|
||||
GPU stats are mainly based on NVML queries. The description of the queries is as follows:
|
||||
|
||||
- **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan is currently
|
||||
intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the intended fan speed.
|
||||
If the fan is physically blocked and unable to spin, this output will not match the actual fan speed.
|
||||
Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.
|
||||
- **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan is
|
||||
currently intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the
|
||||
intended fan speed. If the fan is physically blocked and unable to spin, this output will not
|
||||
match the actual fan speed. Many parts do not report fan speeds because they rely on cooling
|
||||
via fans in the surrounding enclosure.
|
||||
- **memory.used** - Total memory allocated by active contexts, in MiBs.
|
||||
- **memory.free** - Total free memory, in MiBs.
|
||||
- **utilization.gpu** - Percent of time over the past sample period during which one or more kernels was
|
||||
executing on the GPU. The sample period may be between 1 second and 1/6 second depending on the product.
|
||||
- **utilization.memory** - Percent of time over the past sample period during which global (device) memory was
|
||||
being read or written. The sample period may be between 1 second and 1/6 second depending on the product.
|
||||
- **utilization.gpu** - Percent of time over the past sample period during which one or more
|
||||
kernels was executing on the GPU. The sample period may be between 1 second and 1/6 second
|
||||
depending on the product.
|
||||
- **utilization.memory** - Percent of time over the past sample period during which global
|
||||
(device) memory was being read or written. The sample period may be between 1 second and 1/6
|
||||
second depending on the product.
|
||||
- **temperature** - Core GPU temperature, in degrees C.
|
||||
"""
|
||||
|
||||
|
|
@ -161,8 +162,7 @@ class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
|
|||
trainer.logger.log_metrics(logs, step=trainer.global_step)
|
||||
|
||||
def _get_gpu_stats(self) -> Dict[str, float]:
|
||||
"""Get the gpu status from NVML queries"""
|
||||
|
||||
"""Get the gpu status from NVML queries."""
|
||||
return get_gpu_stats(
|
||||
devices=self._devices,
|
||||
memory_utilization=self._memory_utilization,
|
||||
|
|
|
|||
|
|
@ -19,6 +19,9 @@
|
|||
|
||||
|
||||
def add_scalar_dict(writer, main_tag, tag_scalar_dict, global_step=None, walltime=None):
|
||||
"""Batched version of `writer.add_scalar`"""
|
||||
"""Add a batch of scalars to the writer.
|
||||
|
||||
Batched version of ``writer.add_scalar``.
|
||||
"""
|
||||
for tag, scalar in tag_scalar_dict.items():
|
||||
writer.add_scalar(f'{main_tag}/{tag}', scalar, global_step=global_step, walltime=walltime)
|
||||
|
|
|
|||
|
|
@ -43,8 +43,7 @@ def get_gpu_stats(
|
|||
fan_speed: bool = False,
|
||||
temperature: bool = False,
|
||||
) -> Dict[str, float]:
|
||||
"""Get the GPU status from NVML queries"""
|
||||
|
||||
"""Get the GPU status from NVML queries."""
|
||||
stats = {}
|
||||
for device in devices:
|
||||
prefix = f'gpu_id: {device.cuda_index}'
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
|
||||
# License: GNU GPL version 3.
|
||||
|
||||
# pylint: disable=missing-module-docstring,missing-function-docstring
|
||||
"""The interactive NVIDIA-GPU process viewer."""
|
||||
|
||||
import argparse
|
||||
import curses
|
||||
|
|
@ -20,7 +20,7 @@ NVITOP_MONITOR_MODE = set(
|
|||
|
||||
|
||||
def parse_arguments(): # pylint: disable=too-many-branches,too-many-statements
|
||||
|
||||
"""Parse command-line arguments for ``nvitop``."""
|
||||
coloring_rules = '{} < th1 %% <= {} < th2 %% <= {}'.format(
|
||||
colored('light', 'green'), colored('moderate', 'yellow'), colored('heavy', 'red')
|
||||
)
|
||||
|
|
@ -252,6 +252,7 @@ def parse_arguments(): # pylint: disable=too-many-branches,too-many-statements
|
|||
|
||||
|
||||
def main(): # pylint: disable=too-many-branches,too-many-statements,too-many-locals
|
||||
"""Main function for ``nvitop`` CLI."""
|
||||
args = parse_arguments()
|
||||
|
||||
if args.force_color:
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@
|
|||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
|
||||
# pylint: disable=missing-module-docstring
|
||||
"""Resource metrics collectors."""
|
||||
|
||||
import contextlib
|
||||
import itertools
|
||||
|
|
@ -44,24 +44,27 @@ class SnapshotResult(NamedTuple): # pylint: disable=missing-class-docstring
|
|||
timer = time.monotonic
|
||||
|
||||
|
||||
def _unique(iterable: Iterable[Hashable]) -> List[Hashable]:
|
||||
return list(OrderedDict.fromkeys(iterable).keys())
|
||||
|
||||
|
||||
# pylint: disable-next=too-many-branches
|
||||
def take_snapshots(
|
||||
devices: Optional[Union[Device, Iterable[Device]]] = None,
|
||||
*,
|
||||
gpu_processes: Optional[Union[bool, GpuProcess, Iterable[GpuProcess]]] = None,
|
||||
) -> SnapshotResult:
|
||||
"""Retrieves status of demanded devices and GPU processes.
|
||||
"""Retrieve status of demanded devices and GPU processes.
|
||||
|
||||
Args:
|
||||
devices (Optional[Union[Device, Iterable[Device]]]):
|
||||
Requested devices for snapshots. If not given, the devices will be
|
||||
determined from GPU processes:
|
||||
- All devices (no GPU processes are given)
|
||||
- Devices that used by given GPU processes
|
||||
Requested devices for snapshots. If not given, the devices will be determined from GPU
|
||||
processes: **(1)** All devices (no GPU processes are given); **(2)** Devices that are used
|
||||
by given GPU processes.
|
||||
gpu_processes (Optional[Union[bool, GpuProcess, Iterable[GpuProcess]]]):
|
||||
Requested GPU processes snapshots. If not given, all GPU processes
|
||||
running on the requested device will be returned. The GPU process
|
||||
snapshots can be suppressed by specifying ``gpu_processes=False``.
|
||||
Requested GPU processes snapshots. If not given, all GPU processes running on the
|
||||
requested device will be returned. The GPU process snapshots can be suppressed by
|
||||
specifying ``gpu_processes=False``.
|
||||
|
||||
Returns: SnapshotResult
|
||||
A named tuple containing two lists of snapshots.
|
||||
|
|
@ -71,7 +74,6 @@ def take_snapshots(
|
|||
be returned.
|
||||
|
||||
Examples:
|
||||
|
||||
>>> from nvitop import take_snapshots, Device
|
||||
>>> import os
|
||||
>>> os.environ['CUDA_VISIBLE_DEVICES'] = '1,0'
|
||||
|
|
@ -136,10 +138,6 @@ def take_snapshots(
|
|||
]
|
||||
)
|
||||
""" # pylint: disable=line-too-long
|
||||
|
||||
def unique(iterable: Iterable[Hashable]) -> List[Hashable]:
|
||||
return list(OrderedDict.fromkeys(iterable).keys())
|
||||
|
||||
if isinstance(devices, Device):
|
||||
devices = [devices]
|
||||
if isinstance(gpu_processes, GpuProcess):
|
||||
|
|
@ -148,7 +146,7 @@ def take_snapshots(
|
|||
if gpu_processes is not None:
|
||||
if gpu_processes: # is not False or is a non-empty list/tuple
|
||||
gpu_processes = list(gpu_processes)
|
||||
process_devices = unique(process.device for process in gpu_processes)
|
||||
process_devices = _unique(process.device for process in gpu_processes)
|
||||
for device in process_devices:
|
||||
device.processes() # update GPU status for requested GPU processes
|
||||
if devices is None:
|
||||
|
|
@ -193,57 +191,55 @@ def collect_in_background(
|
|||
tag: str = 'metrics-daemon',
|
||||
start: bool = True,
|
||||
) -> threading.Thread:
|
||||
"""Starts a background daemon thread that collect and call the callback function periodically.
|
||||
"""Start a background daemon thread that collects metrics and calls the callback function periodically.
|
||||
|
||||
See also :func:`ResourceMetricCollector.daemonize`.
|
||||
|
||||
Args:
|
||||
on_collect: (Callable[[Dict[str, float]], bool])
|
||||
on_collect (Callable[[Dict[str, float]], bool]):
|
||||
A callback function that will be called periodically. It takes a dictionary containing
|
||||
the resource metrics and returns a boolean indicating whether to continue monitoring.
|
||||
collector: (Optional[ResourceMetricCollector])
|
||||
collector (Optional[ResourceMetricCollector]):
|
||||
A :class:`ResourceMetricCollector` instance to collect metrics. If not given, it will
|
||||
collect metrics for all GPUs and subprocesses of the current process.
|
||||
interval: (Optional[float])
|
||||
interval (Optional[float]):
|
||||
The collect interval. If not given, use ``collector.interval``.
|
||||
on_start: (Optional[Callable[['ResourceMetricCollector'], None]])
|
||||
on_start (Optional[Callable[[ResourceMetricCollector], None]]):
|
||||
A function to initialize the daemon thread and collector.
|
||||
on_stop: (Optional[Callable[['ResourceMetricCollector'], None]])
|
||||
on_stop (Optional[Callable[[ResourceMetricCollector], None]]):
|
||||
A function that does some necessary cleanup after the daemon thread is stopped.
|
||||
tag: (str)
|
||||
tag (str):
|
||||
The tag prefix used for metrics results.
|
||||
start: (bool)
|
||||
start (bool):
|
||||
Whether to start the daemon thread on return.
|
||||
|
||||
Returns: threading.Thread
|
||||
A daemon thread object.
|
||||
|
||||
Examples:
|
||||
.. code-block:: python
|
||||
|
||||
.. code-block:: python
|
||||
logger = ...
|
||||
|
||||
logger = ...
|
||||
def on_collect(metrics): # will be called periodically
|
||||
if logger.is_closed(): # closed manually by user
|
||||
return False
|
||||
logger.log(metrics)
|
||||
return True
|
||||
|
||||
def on_collect(metrics): # will be called periodically
|
||||
if logger.is_closed(): # closed manually by user
|
||||
return False
|
||||
logger.log(metrics)
|
||||
return True
|
||||
def on_stop(collector): # will be called only once at stop
|
||||
if not logger.is_closed():
|
||||
logger.close() # cleanup
|
||||
|
||||
def on_stop(collector): # will be called only once at stop
|
||||
if not logger.is_closed():
|
||||
logger.close() # cleanup
|
||||
|
||||
# Record metrics to the logger in background every 5 seconds.
|
||||
# It will collect 5-second mean/min/max for each metric.
|
||||
collect_in_background(
|
||||
on_collect,
|
||||
ResourceMetricCollector(Device.cuda.all()),
|
||||
interval=5.0,
|
||||
on_stop=on_stop,
|
||||
)
|
||||
# Record metrics to the logger in background every 5 seconds.
|
||||
# It will collect 5-second mean/min/max for each metric.
|
||||
collect_in_background(
|
||||
on_collect,
|
||||
ResourceMetricCollector(Device.cuda.all()),
|
||||
interval=5.0,
|
||||
on_stop=on_stop,
|
||||
)
|
||||
"""
|
||||
|
||||
if collector is None:
|
||||
collector = ResourceMetricCollector()
|
||||
if isinstance(interval, (int, float)) and interval > 0:
|
||||
|
|
@ -282,13 +278,13 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
|
|||
|
||||
Args:
|
||||
devices (Iterable[Device]):
|
||||
Set of Device instances for logging. If not given, all physical
|
||||
devices on board will be used.
|
||||
Set of Device instances for logging. If not given, all physical devices on board will be
|
||||
used.
|
||||
root_pids (Set[int]):
|
||||
A set of PIDs, only the status of the descendant processes on the
|
||||
GPUs will be collected. If not given, the PID of the current process
|
||||
will be used.
|
||||
interval (float): The snapshot interval for background daemon thread.
|
||||
A set of PIDs, only the status of the descendant processes on the GPUs will be collected.
|
||||
If not given, the PID of the current process will be used.
|
||||
interval (float):
|
||||
The snapshot interval for background daemon thread.
|
||||
|
||||
Core methods:
|
||||
|
||||
|
|
@ -305,7 +301,6 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
|
|||
collector.daemonize(on_collect_fn)
|
||||
|
||||
Examples:
|
||||
|
||||
>>> import os
|
||||
>>> os.environ['CUDA_VISIBLE_DEVICES'] = '3,2,1,0'
|
||||
|
||||
|
|
@ -398,6 +393,7 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
|
|||
root_pids: Optional[Iterable[int]] = None,
|
||||
interval: Union[int, float] = 1.0,
|
||||
) -> None:
|
||||
"""Initialize the resource metric collector."""
|
||||
if isinstance(interval, (int, float)) and interval > 0:
|
||||
interval = float(interval)
|
||||
else:
|
||||
|
|
@ -440,15 +436,14 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
|
|||
self._daemon_running = threading.Event()
|
||||
|
||||
def activate(self, tag: str) -> 'ResourceMetricCollector':
|
||||
"""Starts a new metric collection with the given tag.
|
||||
"""Start a new metric collection with the given tag.
|
||||
|
||||
Args:
|
||||
tag (str):
|
||||
The name of the new metric collection. The tag will be used to
|
||||
identify the metric collection. It must be a unique string.
|
||||
The name of the new metric collection. The tag will be used to identify the metric
|
||||
collection. It must be a unique string.
|
||||
|
||||
Examples:
|
||||
|
||||
>>> collector = ResourceMetricCollector()
|
||||
|
||||
>>> collector.activate(tag='train') # key prefix -> 'train'
|
||||
|
|
@ -457,7 +452,6 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
|
|||
>>> collector.deactivate() # the collector has been stopped
|
||||
>>> collector.activate(tag='test') # key prefix -> 'test'
|
||||
"""
|
||||
|
||||
with self._lock:
|
||||
if self._metric_buffer is None or tag not in self._tags:
|
||||
self._tags.add(tag)
|
||||
|
|
@ -477,11 +471,15 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
|
|||
start = activate
|
||||
|
||||
def deactivate(self, tag: Optional[str] = None) -> 'ResourceMetricCollector':
|
||||
"""Stops the current collection with the given tag and remove all sub-tags.
|
||||
If the tag is not specified, deactivate the current active collection.
|
||||
For nested collections, the sub-collections will be deactivated as well.
|
||||
"""
|
||||
"""Stop the current collection with the given tag and remove all sub-tags.
|
||||
|
||||
If the tag is not specified, deactivate the current active collection. For nested
|
||||
collections, the sub-collections will be deactivated as well.
|
||||
|
||||
Args:
|
||||
tag (Optional[str]):
|
||||
The tag to deactivate. If :data:`None`, the current active collection will be used.
|
||||
"""
|
||||
with self._lock:
|
||||
if self._metric_buffer is None:
|
||||
if tag is not None:
|
||||
|
|
@ -516,18 +514,16 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
|
|||
|
||||
Args:
|
||||
tag (str):
|
||||
The name of the new metric collection. The tag will be used to
|
||||
identify the metric collection. It must be a unique string.
|
||||
The name of the new metric collection. The tag will be used to identify the metric
|
||||
collection. It must be a unique string.
|
||||
|
||||
Examples:
|
||||
|
||||
>>> collector = ResourceMetricCollector()
|
||||
|
||||
>>> with collector.context(tag='train'): # key prefix -> 'train'
|
||||
... # Do something
|
||||
... collector.collect() # -> Dict[str, float]
|
||||
"""
|
||||
|
||||
try:
|
||||
self.activate(tag=tag)
|
||||
yield self
|
||||
|
|
@ -537,17 +533,16 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
|
|||
__call__ = context # alias for `with collector(tag='<tag>')`
|
||||
|
||||
def clear(self, tag: Optional[str] = None) -> None:
|
||||
"""Resets the metric collection with the given tag. If the tag is not
|
||||
specified, reset the current active collection. For nested collections,
|
||||
"""Reset the metric collection with the given tag.
|
||||
|
||||
If the tag is not specified, reset the current active collection. For nested collections,
|
||||
the sub-collections will be reset as well.
|
||||
|
||||
Args:
|
||||
tag (Optional[str]):
|
||||
The tag to reset. If None, the current active collection
|
||||
will be reset.
|
||||
The tag to reset. If :data:`None`, the current active collection will be reset.
|
||||
|
||||
Examples:
|
||||
|
||||
>>> collector = ResourceMetricCollector()
|
||||
|
||||
>>> with collector(tag='train'): # key prefix -> 'train'
|
||||
|
|
@ -564,7 +559,6 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
|
|||
... with collector(tag='batch'): # key prefix -> 'train/batch'
|
||||
... collector.reset(tag='train') # reset both 'train' and 'train/batch'
|
||||
"""
|
||||
|
||||
with self._lock:
|
||||
if self._metric_buffer is None:
|
||||
if tag is not None:
|
||||
|
|
@ -586,8 +580,7 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
|
|||
buffer = buffer.prev
|
||||
|
||||
def collect(self) -> Dict[str, float]:
|
||||
"""Gets the average resource consumption during collection."""
|
||||
|
||||
"""Get the average resource consumption during collection."""
|
||||
with self._lock:
|
||||
if self._metric_buffer is None:
|
||||
raise RuntimeError('Resource metric collector has not been not started yet.')
|
||||
|
|
@ -607,52 +600,51 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
|
|||
tag: str = 'metrics-daemon',
|
||||
start: bool = True,
|
||||
) -> threading.Thread:
|
||||
"""Starts a background daemon thread that collect and call the callback function periodically.
|
||||
"""Start a background daemon thread that collects metrics and calls the callback function periodically.
|
||||
|
||||
See also :func:`collect_in_background`.
|
||||
|
||||
Args:
|
||||
on_collect: (Callable[[Dict[str, float]], bool])
|
||||
on_collect (Callable[[Dict[str, float]], bool]):
|
||||
A callback function that will be called periodically. It takes a dictionary containing
|
||||
the resource metrics and returns a boolean indicating whether to continue monitoring.
|
||||
interval: (Optional[float])
|
||||
interval (Optional[float]):
|
||||
The collect interval. If not given, use ``collector.interval``.
|
||||
on_start: (Optional[Callable[['ResourceMetricCollector'], None]])
|
||||
on_start (Optional[Callable[[ResourceMetricCollector], None]]):
|
||||
A function to initialize the daemon thread and collector.
|
||||
on_stop: (Optional[Callable[['ResourceMetricCollector'], None]])
|
||||
on_stop (Optional[Callable[[ResourceMetricCollector], None]]):
|
||||
A function that does some necessary cleanup after the daemon thread is stopped.
|
||||
tag: (str)
|
||||
tag (str):
|
||||
The tag prefix used for metrics results.
|
||||
start: (bool)
|
||||
start (bool):
|
||||
Whether to start the daemon thread on return.
|
||||
|
||||
Returns: threading.Thread
|
||||
A daemon thread object.
|
||||
|
||||
Examples:
|
||||
.. code-block:: python
|
||||
|
||||
.. code-block:: python
|
||||
logger = ...
|
||||
|
||||
logger = ...
|
||||
def on_collect(metrics): # will be called periodically
|
||||
if logger.is_closed(): # closed manually by user
|
||||
return False
|
||||
logger.log(metrics)
|
||||
return True
|
||||
|
||||
def on_collect(metrics): # will be called periodically
|
||||
if logger.is_closed(): # closed manually by user
|
||||
return False
|
||||
logger.log(metrics)
|
||||
return True
|
||||
def on_stop(collector): # will be called only once at stop
|
||||
if not logger.is_closed():
|
||||
logger.close() # cleanup
|
||||
|
||||
def on_stop(collector): # will be called only once at stop
|
||||
if not logger.is_closed():
|
||||
logger.close() # cleanup
|
||||
|
||||
# Record metrics to the logger in background every 5 seconds.
|
||||
# It will collect 5-second mean/min/max for each metric.
|
||||
ResourceMetricCollector(Device.cuda.all()).daemonize(
|
||||
on_collect,
|
||||
ResourceMetricCollector(Device.cuda.all()),
|
||||
interval=5.0,
|
||||
on_stop=on_stop,
|
||||
)
|
||||
# Record metrics to the logger in background every 5 seconds.
|
||||
# It will collect 5-second mean/min/max for each metric.
|
||||
ResourceMetricCollector(Device.cuda.all()).daemonize(
|
||||
on_collect,
|
||||
ResourceMetricCollector(Device.cuda.all()),
|
||||
interval=5.0,
|
||||
on_stop=on_stop,
|
||||
)
|
||||
"""
|
||||
return collect_in_background(
|
||||
on_collect,
|
||||
|
|
@ -665,10 +657,12 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes
|
|||
)
|
||||
|
||||
def __del__(self) -> None:
|
||||
"""Clean up the daemon thread on destruction."""
|
||||
self._daemon_running.clear()
|
||||
|
||||
# pylint: disable-next=missing-function-docstring,too-many-branches,too-many-locals,too-many-statements
|
||||
# pylint: disable-next=too-many-branches,too-many-locals,too-many-statements
|
||||
def take_snapshots(self) -> SnapshotResult:
|
||||
"""Take snapshots of the current resource metrics and update the metric buffer."""
|
||||
if len(self.root_pids) > 0:
|
||||
all_gpu_processes = []
|
||||
for device in self.leaf_devices:
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -16,8 +16,8 @@
|
|||
# ==============================================================================
|
||||
"""Shortcuts for package ``psutil``.
|
||||
|
||||
psutil is a cross-platform library for retrieving information on running processes
|
||||
and system utilization (CPU, memory, disks, network, sensors) in Python.
|
||||
``psutil`` is a cross-platform library for retrieving information on running processes and system
|
||||
utilization (CPU, memory, disks, network, sensors) in Python.
|
||||
"""
|
||||
|
||||
import os as _os
|
||||
|
|
@ -50,31 +50,30 @@ swap_memory = _ttl_cache(ttl=0.25)(_psutil.swap_memory)
|
|||
|
||||
try:
|
||||
load_average = _ttl_cache(ttl=2.0)(_psutil.getloadavg)
|
||||
load_average.__doc__ = """Get the system load average."""
|
||||
except AttributeError:
|
||||
|
||||
def load_average(): # pylint: disable=missing-function-docstring
|
||||
def load_average():
|
||||
"""Get the system load average."""
|
||||
return None
|
||||
|
||||
|
||||
def memory_percent():
|
||||
"""The percentage usage of virtual memory, calculated as (total - available) / total * 100."""
|
||||
|
||||
"""The percentage usage of virtual memory, calculated as ``(total - available) / total * 100``."""
|
||||
return virtual_memory().percent
|
||||
|
||||
|
||||
def swap_percent():
|
||||
"""The percentage usage of virtual memory, calculated as used / total * 100."""
|
||||
|
||||
"""The percentage usage of swap memory, calculated as ``used / total * 100``."""
|
||||
return swap_memory().percent
|
||||
|
||||
|
||||
ppid_map = _psutil._ppid_map # pylint: disable=protected-access
|
||||
"""Obtains a ``{pid: ppid, ...}`` dict for all running processes in one shot."""
|
||||
"""Obtain a ``{pid: ppid, ...}`` dict for all running processes in one shot."""
|
||||
|
||||
|
||||
def reverse_ppid_map(): # pylint: disable=function-redefined
|
||||
"""Obtains a ``{ppid: [pid, ...], ...}`` dict for all running processes in one shot."""
|
||||
|
||||
"""Obtain a ``{ppid: [pid, ...], ...}`` dict for all running processes in one shot."""
|
||||
from collections import defaultdict # pylint: disable=import-outside-toplevel
|
||||
|
||||
tree = defaultdict(list)
|
||||
|
|
|
|||
|
|
@ -29,11 +29,11 @@ from typing import Type as _Type
|
|||
|
||||
|
||||
# pylint: disable-next=missing-class-docstring,too-few-public-methods
|
||||
class struct_c_CUdevice_t(_ctypes.Structure):
|
||||
class _struct_c_CUdevice_t(_ctypes.Structure):
|
||||
pass # opaque handle
|
||||
|
||||
|
||||
c_CUdevice_t = _ctypes.POINTER(struct_c_CUdevice_t)
|
||||
_c_CUdevice_t = _ctypes.POINTER(_struct_c_CUdevice_t)
|
||||
|
||||
_CUresult_t = _ctypes.c_uint
|
||||
|
||||
|
|
@ -229,8 +229,7 @@ class CUDAError(Exception):
|
|||
_errcode_to_name = {}
|
||||
|
||||
def __new__(cls, value: int) -> 'CUDAError':
|
||||
"""Maps value to a proper subclass of :class:`CUDAError`."""
|
||||
|
||||
"""Map value to a proper subclass of :class:`CUDAError`."""
|
||||
if cls is CUDAError:
|
||||
# pylint: disable-next=self-cls-assignment
|
||||
cls = CUDAError._value_class_mapping.get(value, cls)
|
||||
|
|
@ -239,6 +238,7 @@ class CUDAError(Exception):
|
|||
return obj
|
||||
|
||||
def __str__(self) -> str:
|
||||
"""Return a string representation of the error."""
|
||||
# pylint: disable=no-member
|
||||
try:
|
||||
if self.value not in CUDAError._errcode_to_string:
|
||||
|
|
@ -255,30 +255,32 @@ class CUDAError(Exception):
|
|||
except CUDAError:
|
||||
return f'CUDA Error with code {self.value}.'
|
||||
|
||||
__repr__ = __str__
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
"""Test equality to other object."""
|
||||
if not isinstance(other, CUDAError):
|
||||
return NotImplemented
|
||||
return self.value == other.value # pylint: disable=no-member
|
||||
|
||||
def __reduce__(self) -> _Tuple[_Type['CUDAError'], _Tuple[int]]:
|
||||
"""Return state information for pickling."""
|
||||
return CUDAError, (self.value,) # pylint: disable=no-member
|
||||
|
||||
|
||||
def cudaExceptionClass(cudaErrorCode: int) -> _Type[CUDAError]:
|
||||
"""Maps value to a proper subclass of :class:`CUDAError`.
|
||||
"""Map value to a proper subclass of :class:`CUDAError`.
|
||||
|
||||
Raises:
|
||||
ValueError: If the error code is not valid.
|
||||
"""
|
||||
|
||||
# pylint: disable=protected-access
|
||||
if cudaErrorCode not in CUDAError._value_class_mapping:
|
||||
if cudaErrorCode not in CUDAError._value_class_mapping: # pylint: disable=protected-access
|
||||
raise ValueError(f'cudaErrorCode {cudaErrorCode} is not valid.')
|
||||
return CUDAError._value_class_mapping[cudaErrorCode]
|
||||
return CUDAError._value_class_mapping[cudaErrorCode] # pylint: disable=protected-access
|
||||
|
||||
|
||||
def _extract_cuda_errors_as_classes() -> None:
|
||||
"""Generates a hierarchy of classes on top of :class:`CUDAError` class.
|
||||
"""Generate a hierarchy of classes on top of :class:`CUDAError` class.
|
||||
|
||||
Each CUDA Error gets a new :class:`CUDAError` subclass. This way try-except blocks can filter
|
||||
appropriate exceptions more easily.
|
||||
|
|
@ -286,7 +288,6 @@ def _extract_cuda_errors_as_classes() -> None:
|
|||
:class:`CUDAError` is a parent class. Each ``CUDA_ERROR_*`` gets its own subclass.
|
||||
e.g. :data:`CUDA_ERROR_INVALID_VALUE` will be turned into :class:`CUDAError_InvalidValue`.
|
||||
"""
|
||||
|
||||
this_module = _sys.modules[__name__]
|
||||
cuda_error_names = [x for x in dir(this_module) if x.startswith('CUDA_ERROR_')]
|
||||
for err_name in cuda_error_names:
|
||||
|
|
@ -339,8 +340,7 @@ __cudaGetFunctionPointer_cache = {}
|
|||
|
||||
|
||||
def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr:
|
||||
"""
|
||||
Get the function pointer from the CUDA driver library.
|
||||
"""Get the function pointer from the CUDA driver library.
|
||||
|
||||
Raises:
|
||||
CUDAError_NotInitialized:
|
||||
|
|
@ -348,7 +348,6 @@ def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr:
|
|||
CUDAError_NotFound:
|
||||
If the function pointer cannot be found.
|
||||
"""
|
||||
|
||||
if name in __cudaGetFunctionPointer_cache:
|
||||
return __cudaGetFunctionPointer_cache[name]
|
||||
|
||||
|
|
@ -364,14 +363,12 @@ def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr:
|
|||
|
||||
|
||||
def __LoadCudaLibrary() -> None:
|
||||
"""
|
||||
Load the library if it isn't loaded already.
|
||||
"""Load the library if it isn't loaded already.
|
||||
|
||||
Raises:
|
||||
CUDAError_NotInitialized:
|
||||
If the CUDA driver library cannot be found.
|
||||
"""
|
||||
|
||||
global __cudaLib # pylint: disable=global-statement
|
||||
|
||||
if __cudaLib is None:
|
||||
|
|
@ -409,7 +406,7 @@ def __LoadCudaLibrary() -> None:
|
|||
def cuInit(flags: int = 0) -> None:
|
||||
"""Initialize the CUDA driver API.
|
||||
|
||||
Initializes the driver API and must be called before any other function from the driver API.
|
||||
Initialize the driver API. This must be called before any other function from the driver API.
|
||||
Currently, the ``flags`` parameter must be :data:`0`. If :func:`cuInit` has not been called,
|
||||
any function from the driver API will return :data:`CUDA_ERROR_NOT_INITIALIZED`.
|
||||
|
||||
|
|
@ -429,7 +426,6 @@ def cuInit(flags: int = 0) -> None:
|
|||
CUDAError_NotInitialized:
|
||||
If the CUDA driver library cannot be found.
|
||||
"""
|
||||
|
||||
global __initialized # pylint: disable=global-statement
|
||||
|
||||
__LoadCudaLibrary()
|
||||
|
|
@ -447,7 +443,7 @@ def cuInit(flags: int = 0) -> None:
|
|||
|
||||
|
||||
def cuGetErrorName(error: int) -> str:
|
||||
"""Gets the string representation of an error code enum name.
|
||||
"""Get the string representation of an error code enum name.
|
||||
|
||||
Raises:
|
||||
CUDAError_InvalidValue:
|
||||
|
|
@ -455,7 +451,6 @@ def cuGetErrorName(error: int) -> str:
|
|||
CUDAError_NotInitialized:
|
||||
If the CUDA driver API is not initialized.
|
||||
"""
|
||||
|
||||
fn = __cudaGetFunctionPointer('cuGetErrorName')
|
||||
|
||||
p_name = _ctypes.POINTER(_ctypes.c_char_p)()
|
||||
|
|
@ -466,7 +461,7 @@ def cuGetErrorName(error: int) -> str:
|
|||
|
||||
|
||||
def cuGetErrorString(error: int) -> str:
|
||||
"""Gets the string description of an error code.
|
||||
"""Get the string description of an error code.
|
||||
|
||||
Raises:
|
||||
CUDAError_InvalidValue:
|
||||
|
|
@ -474,7 +469,6 @@ def cuGetErrorString(error: int) -> str:
|
|||
CUDAError_NotInitialized:
|
||||
If the CUDA driver API is not initialized.
|
||||
"""
|
||||
|
||||
fn = __cudaGetFunctionPointer('cuGetErrorString')
|
||||
|
||||
p_name = _ctypes.POINTER(_ctypes.c_char_p)()
|
||||
|
|
@ -485,7 +479,7 @@ def cuGetErrorString(error: int) -> str:
|
|||
|
||||
|
||||
def cuDriverGetVersion() -> str:
|
||||
"""Returns the latest CUDA version supported by driver.
|
||||
"""Get the latest CUDA version supported by driver.
|
||||
|
||||
Returns:
|
||||
A string of the form :data:`'<major>.<minor>'`.
|
||||
|
|
@ -496,7 +490,6 @@ def cuDriverGetVersion() -> str:
|
|||
CUDAError_NotInitialized:
|
||||
If the CUDA driver API is not initialized.
|
||||
"""
|
||||
|
||||
fn = __cudaGetFunctionPointer('cuDriverGetVersion')
|
||||
|
||||
driver_version = _ctypes.c_int()
|
||||
|
|
@ -508,7 +501,7 @@ def cuDriverGetVersion() -> str:
|
|||
|
||||
|
||||
def cuDeviceGetCount() -> int:
|
||||
"""Returns the number of compute-capable devices.
|
||||
"""Get the number of compute-capable devices.
|
||||
|
||||
Returns: int
|
||||
The number of devices with compute capability greater than or equal to 2.0 that are available
|
||||
|
|
@ -524,7 +517,6 @@ def cuDeviceGetCount() -> int:
|
|||
CUDAError_NotInitialized:
|
||||
If the CUDA driver API is not initialized.
|
||||
"""
|
||||
|
||||
fn = __cudaGetFunctionPointer('cuDeviceGetCount')
|
||||
|
||||
count = _ctypes.c_int(0)
|
||||
|
|
@ -533,8 +525,8 @@ def cuDeviceGetCount() -> int:
|
|||
return count.value
|
||||
|
||||
|
||||
def cuDeviceGet(ordinal: int) -> c_CUdevice_t:
|
||||
"""Returns a handle to a compute device.
|
||||
def cuDeviceGet(ordinal: int) -> _c_CUdevice_t:
|
||||
"""Get a handle to a compute device.
|
||||
|
||||
Returns:
|
||||
A device handle given an ordinal in the range :code:`[0, ..., cuDeviceGetCount() - 1]`.
|
||||
|
|
@ -552,20 +544,19 @@ def cuDeviceGet(ordinal: int) -> c_CUdevice_t:
|
|||
CUDAError_NotInitialized:
|
||||
If the CUDA driver API is not initialized.
|
||||
"""
|
||||
|
||||
fn = __cudaGetFunctionPointer('cuDeviceGet')
|
||||
|
||||
device = c_CUdevice_t()
|
||||
device = _c_CUdevice_t()
|
||||
ret = fn(_ctypes.byref(device), _ctypes.c_int(ordinal))
|
||||
_cudaCheckReturn(ret)
|
||||
return device
|
||||
|
||||
|
||||
def cuDeviceGetByPCIBusId(pciBusId: str) -> c_CUdevice_t:
|
||||
"""Returns a handle to a compute device.
|
||||
def cuDeviceGetByPCIBusId(pciBusId: str) -> _c_CUdevice_t:
|
||||
"""Get a handle to a compute device.
|
||||
|
||||
Args:
|
||||
pciBusId: str
|
||||
pciBusId (str):
|
||||
String in one of the following forms: ``[domain]:[bus]:[device].[function]``,
|
||||
``[domain]:[bus]:[device]``, ``[bus]:[device].[function]`` where ``domain``, ``bus``,
|
||||
``device``, and ``function`` are all hexadecimal values.
|
||||
|
|
@ -584,17 +575,16 @@ def cuDeviceGetByPCIBusId(pciBusId: str) -> c_CUdevice_t:
|
|||
CUDAError_NotInitialized:
|
||||
If the CUDA driver API is not initialized.
|
||||
"""
|
||||
|
||||
fn = __cudaGetFunctionPointer('cuDeviceGetByPCIBusId')
|
||||
|
||||
device = c_CUdevice_t()
|
||||
device = _c_CUdevice_t()
|
||||
ret = fn(_ctypes.byref(device), _ctypes.c_char_p(pciBusId.encode('UTF-8')))
|
||||
_cudaCheckReturn(ret)
|
||||
return device
|
||||
|
||||
|
||||
def cuDeviceGetPCIBusId(device: c_CUdevice_t) -> str:
|
||||
"""Returns a PCI Bus Id string for the device.
|
||||
def cuDeviceGetPCIBusId(device: _c_CUdevice_t) -> str:
|
||||
"""Get a PCI Bus Id string for the device.
|
||||
|
||||
Returns: str
|
||||
An identifier string for the device in the following format ``[domain]:[bus]:[device].[function]``
|
||||
|
|
@ -611,7 +601,6 @@ def cuDeviceGetPCIBusId(device: c_CUdevice_t) -> str:
|
|||
CUDAError_NotInitialized:
|
||||
If the CUDA driver API is not initialized.
|
||||
"""
|
||||
|
||||
fn = __cudaGetFunctionPointer('cuDeviceGetPCIBusId')
|
||||
|
||||
pciBusId = _ctypes.create_string_buffer(256)
|
||||
|
|
@ -620,8 +609,8 @@ def cuDeviceGetPCIBusId(device: c_CUdevice_t) -> str:
|
|||
return pciBusId.value.decode('UTF-8', errors='replace')
|
||||
|
||||
|
||||
def cuDeviceGetName(device: c_CUdevice_t) -> str:
|
||||
"""Returns an identifier string for the device.
|
||||
def cuDeviceGetName(device: _c_CUdevice_t) -> str:
|
||||
"""Get an identifier string for the device.
|
||||
|
||||
Returns: str
|
||||
An ASCII string identifying the device.
|
||||
|
|
@ -639,7 +628,6 @@ def cuDeviceGetName(device: c_CUdevice_t) -> str:
|
|||
CUDAError_NotInitialized:
|
||||
If the CUDA driver API is not initialized.
|
||||
"""
|
||||
|
||||
fn = __cudaGetFunctionPointer('cuDeviceGetName')
|
||||
|
||||
name = _ctypes.create_string_buffer(256)
|
||||
|
|
@ -648,8 +636,8 @@ def cuDeviceGetName(device: c_CUdevice_t) -> str:
|
|||
return name.value.decode('UTF-8', errors='replace')
|
||||
|
||||
|
||||
def cuDeviceGetUuid(device: c_CUdevice_t) -> str:
|
||||
"""Returns a UUID for the device.
|
||||
def cuDeviceGetUuid(device: _c_CUdevice_t) -> str:
|
||||
"""Get a UUID for the device.
|
||||
|
||||
Raises:
|
||||
CUDAError_InvalidDevice:
|
||||
|
|
@ -662,7 +650,6 @@ def cuDeviceGetUuid(device: c_CUdevice_t) -> str:
|
|||
CUDAError_NotInitialized:
|
||||
If the CUDA driver API is not initialized.
|
||||
"""
|
||||
|
||||
try:
|
||||
fn = __cudaGetFunctionPointer('cuDeviceGetUuid_v2')
|
||||
except AttributeError:
|
||||
|
|
@ -676,8 +663,8 @@ def cuDeviceGetUuid(device: c_CUdevice_t) -> str:
|
|||
return '-'.join((uuid[:8], uuid[8:12], uuid[12:16], uuid[16:20], uuid[20:32]))
|
||||
|
||||
|
||||
def cuDeviceGetUuid_v2(device: c_CUdevice_t) -> str:
|
||||
"""Returns a UUID for the device (CUDA 11.4+).
|
||||
def cuDeviceGetUuid_v2(device: _c_CUdevice_t) -> str:
|
||||
"""Get a UUID for the device (CUDA 11.4+).
|
||||
|
||||
Raises:
|
||||
CUDAError_InvalidDevice:
|
||||
|
|
@ -690,7 +677,6 @@ def cuDeviceGetUuid_v2(device: c_CUdevice_t) -> str:
|
|||
CUDAError_NotInitialized:
|
||||
If the CUDA driver API is not initialized.
|
||||
"""
|
||||
|
||||
fn = __cudaGetFunctionPointer('cuDeviceGetUuid_v2')
|
||||
|
||||
ubyte_array = _ctypes.c_ubyte * 16
|
||||
|
|
@ -701,8 +687,8 @@ def cuDeviceGetUuid_v2(device: c_CUdevice_t) -> str:
|
|||
return '-'.join((uuid[:8], uuid[8:12], uuid[12:16], uuid[16:20], uuid[20:32]))
|
||||
|
||||
|
||||
def cuDeviceTotalMem(device: c_CUdevice_t) -> int:
|
||||
"""Returns the total amount of memory on the device (in bytes).
|
||||
def cuDeviceTotalMem(device: _c_CUdevice_t) -> int:
|
||||
"""Get the total amount of memory on the device (in bytes).
|
||||
|
||||
Raises:
|
||||
CUDAError_InvalidContext:
|
||||
|
|
@ -717,7 +703,6 @@ def cuDeviceTotalMem(device: c_CUdevice_t) -> int:
|
|||
CUDAError_NotInitialized:
|
||||
If the CUDA driver API is not initialized.
|
||||
"""
|
||||
|
||||
fn = __cudaGetFunctionPointer('cuDeviceTotalMem')
|
||||
|
||||
bytes = _ctypes.c_size_t() # pylint: disable=redefined-builtin
|
||||
|
|
@ -727,8 +712,7 @@ def cuDeviceTotalMem(device: c_CUdevice_t) -> int:
|
|||
|
||||
|
||||
def is_available() -> bool:
|
||||
"""Whether there are any CUDA visible devices."""
|
||||
|
||||
"""Test whether there are any CUDA visible devices."""
|
||||
try:
|
||||
return cuDeviceGetCount() > 0
|
||||
except CUDAError:
|
||||
|
|
|
|||
|
|
@ -281,8 +281,7 @@ class cudaError(Exception):
|
|||
_errcode_to_name = {}
|
||||
|
||||
def __new__(cls, value: int) -> 'cudaError':
|
||||
"""Maps value to a proper subclass of :class:`cudaError`."""
|
||||
|
||||
"""Map value to a proper subclass of :class:`cudaError`."""
|
||||
if cls is cudaError:
|
||||
# pylint: disable-next=self-cls-assignment
|
||||
cls = cudaError._value_class_mapping.get(value, cls)
|
||||
|
|
@ -291,6 +290,7 @@ class cudaError(Exception):
|
|||
return obj
|
||||
|
||||
def __str__(self) -> str:
|
||||
"""Return a string representation of the error."""
|
||||
# pylint: disable=no-member
|
||||
try:
|
||||
if self.value not in cudaError._errcode_to_string:
|
||||
|
|
@ -307,30 +307,32 @@ class cudaError(Exception):
|
|||
except cudaError:
|
||||
return f'CUDA Error with code {self.value}.'
|
||||
|
||||
__repr__ = __str__
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
"""Test equality to other object."""
|
||||
if not isinstance(other, cudaError):
|
||||
return NotImplemented
|
||||
return self.value == other.value # pylint: disable=no-member
|
||||
|
||||
def __reduce__(self) -> _Tuple[_Type['cudaError'], _Tuple[int]]:
|
||||
"""Return state information for pickling."""
|
||||
return cudaError, (self.value,) # pylint: disable=no-member
|
||||
|
||||
|
||||
def cudaExceptionClass(cudaErrorCode: int) -> _Type[cudaError]:
|
||||
"""Maps value to a proper subclass of :class:`cudaError`.
|
||||
"""Map value to a proper subclass of :class:`cudaError`.
|
||||
|
||||
Raises:
|
||||
ValueError: If the error code is not valid.
|
||||
"""
|
||||
|
||||
# pylint: disable=protected-access
|
||||
if cudaErrorCode not in cudaError._value_class_mapping:
|
||||
if cudaErrorCode not in cudaError._value_class_mapping: # pylint: disable=protected-access
|
||||
raise ValueError(f'cudaErrorCode {cudaErrorCode} is not valid.')
|
||||
return cudaError._value_class_mapping[cudaErrorCode]
|
||||
return cudaError._value_class_mapping[cudaErrorCode] # pylint: disable=protected-access
|
||||
|
||||
|
||||
def _extract_cuda_errors_as_classes() -> None:
|
||||
"""Generates a hierarchy of classes on top of :class:`cudaError` class.
|
||||
"""Generate a hierarchy of classes on top of :class:`cudaError` class.
|
||||
|
||||
Each CUDA Error gets a new :class:`cudaError` subclass. This way try-except blocks can filter
|
||||
appropriate exceptions more easily.
|
||||
|
|
@ -338,7 +340,6 @@ def _extract_cuda_errors_as_classes() -> None:
|
|||
:class:`cudaError` is a parent class. Each ``cudaError*`` gets its own subclass.
|
||||
e.g. :data:`cudaErrorInvalidValue` will be turned into :class:`cudaError_InvalidValue`.
|
||||
"""
|
||||
|
||||
this_module = _sys.modules[__name__]
|
||||
cuda_error_names = [
|
||||
x
|
||||
|
|
@ -393,8 +394,7 @@ __cudaGetFunctionPointer_cache = {}
|
|||
|
||||
|
||||
def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr:
|
||||
"""
|
||||
Get the function pointer from the CUDA Runtime library.
|
||||
"""Get the function pointer from the CUDA Runtime library.
|
||||
|
||||
Raises:
|
||||
cudaError_InitializationError:
|
||||
|
|
@ -402,7 +402,6 @@ def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr:
|
|||
cudaError_SymbolNotFound:
|
||||
If the function pointer cannot be found.
|
||||
"""
|
||||
|
||||
if name in __cudaGetFunctionPointer_cache:
|
||||
return __cudaGetFunctionPointer_cache[name]
|
||||
|
||||
|
|
@ -418,14 +417,12 @@ def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr:
|
|||
|
||||
|
||||
def __LoadCudaLibrary() -> None: # pylint: disable=too-many-branches
|
||||
"""
|
||||
Load the library if it isn't loaded already.
|
||||
"""Load the library if it isn't loaded already.
|
||||
|
||||
Raises:
|
||||
cudaError_InitializationError:
|
||||
If the CUDA Runtime library cannot be found.
|
||||
"""
|
||||
|
||||
global __cudaLib # pylint: disable=global-statement
|
||||
|
||||
if __cudaLib is None:
|
||||
|
|
@ -498,7 +495,7 @@ def __LoadCudaLibrary() -> None: # pylint: disable=too-many-branches
|
|||
|
||||
|
||||
def cudaGetErrorName(error: int) -> str:
|
||||
"""Returns the string representation of an error code enum name.
|
||||
"""Get the string representation of an error code enum name.
|
||||
|
||||
Returns: str
|
||||
A string containing the name of an error code in the enum. If the error code is not
|
||||
|
|
@ -508,7 +505,6 @@ def cudaGetErrorName(error: int) -> str:
|
|||
cudaError_InitializationError:
|
||||
If the CUDA Runtime library cannot be found.
|
||||
"""
|
||||
|
||||
fn = __cudaGetFunctionPointer('cudaGetErrorName')
|
||||
|
||||
fn.restype = _ctypes.c_char_p # otherwise return is an int
|
||||
|
|
@ -518,7 +514,7 @@ def cudaGetErrorName(error: int) -> str:
|
|||
|
||||
|
||||
def cuGetErrorString(error: int) -> str:
|
||||
"""Returns the description string for an error code.
|
||||
"""Get the description string for an error code.
|
||||
|
||||
Returns: str
|
||||
The description string for an error code. If the error code is not recognized, "unrecognized
|
||||
|
|
@ -528,7 +524,6 @@ def cuGetErrorString(error: int) -> str:
|
|||
cudaError_InitializationError:
|
||||
If the CUDA Runtime library cannot be found.
|
||||
"""
|
||||
|
||||
fn = __cudaGetFunctionPointer('cudaGetErrorString')
|
||||
|
||||
fn.restype = _ctypes.c_char_p # otherwise return is an int
|
||||
|
|
@ -538,7 +533,7 @@ def cuGetErrorString(error: int) -> str:
|
|||
|
||||
|
||||
def cudaGetLastError() -> int:
|
||||
"""Returns the last error from a runtime call.
|
||||
"""Get the last error from a runtime call.
|
||||
|
||||
Returns: int
|
||||
The last error that has been produced by any of the runtime calls in the same instance of
|
||||
|
|
@ -552,13 +547,12 @@ def cudaGetLastError() -> int:
|
|||
cudaError_NoDevice:
|
||||
If no CUDA-capable devices were detected by the installed CUDA driver.
|
||||
"""
|
||||
|
||||
fn = __cudaGetFunctionPointer('cudaGetLastError')
|
||||
return fn()
|
||||
|
||||
|
||||
def cudaPeekAtLastError() -> int:
|
||||
"""Returns the last error from a runtime call.
|
||||
"""Get the last error from a runtime call.
|
||||
|
||||
Returns: int
|
||||
The last error that has been produced by any of the runtime calls in the same instance of
|
||||
|
|
@ -573,13 +567,12 @@ def cudaPeekAtLastError() -> int:
|
|||
cudaError_NoDevice:
|
||||
If no CUDA-capable devices were detected by the installed CUDA driver.
|
||||
"""
|
||||
|
||||
fn = __cudaGetFunctionPointer('cudaPeekAtLastError')
|
||||
return fn()
|
||||
|
||||
|
||||
def cudaDriverGetVersion() -> str:
|
||||
"""Returns the latest CUDA version supported by driver.
|
||||
"""Get the latest CUDA version supported by driver.
|
||||
|
||||
Returns: str
|
||||
The latest version of CUDA supported by the driver of the form :data:`'<major>.<minor>'`.
|
||||
|
|
@ -592,7 +585,6 @@ def cudaDriverGetVersion() -> str:
|
|||
cudaError_NoDevice:
|
||||
If no CUDA-capable devices were detected by the installed CUDA driver.
|
||||
"""
|
||||
|
||||
fn = __cudaGetFunctionPointer('cudaDriverGetVersion')
|
||||
|
||||
driver_version = _ctypes.c_int()
|
||||
|
|
@ -604,7 +596,7 @@ def cudaDriverGetVersion() -> str:
|
|||
|
||||
|
||||
def cudaRuntimeGetVersion() -> str:
|
||||
"""Returns the CUDA Runtime version.
|
||||
"""Get the CUDA Runtime version.
|
||||
|
||||
Returns: str
|
||||
The version number of the current CUDA Runtime instance of the form :data:`'<major>.<minor>'`.
|
||||
|
|
@ -617,7 +609,6 @@ def cudaRuntimeGetVersion() -> str:
|
|||
cudaError_NoDevice:
|
||||
If no CUDA-capable devices were detected by the installed CUDA driver.
|
||||
"""
|
||||
|
||||
fn = __cudaGetFunctionPointer('cudaRuntimeGetVersion')
|
||||
|
||||
runtime_version = _ctypes.c_int()
|
||||
|
|
@ -629,7 +620,7 @@ def cudaRuntimeGetVersion() -> str:
|
|||
|
||||
|
||||
def cudaGetDeviceCount() -> int:
|
||||
"""Returns the number of compute-capable devices.
|
||||
"""Get the number of compute-capable devices.
|
||||
|
||||
Returns: int
|
||||
The number of devices with compute capability greater or equal to 2.0 that are available for
|
||||
|
|
@ -643,7 +634,6 @@ def cudaGetDeviceCount() -> int:
|
|||
cudaError_NoDevice:
|
||||
If no CUDA-capable devices were detected by the installed CUDA driver.
|
||||
"""
|
||||
|
||||
fn = __cudaGetFunctionPointer('cudaGetDeviceCount')
|
||||
|
||||
count = _ctypes.c_int(0)
|
||||
|
|
@ -653,10 +643,10 @@ def cudaGetDeviceCount() -> int:
|
|||
|
||||
|
||||
def cudaDeviceGetByPCIBusId(pciBusId: str) -> int:
|
||||
"""Returns a handle to a compute device.
|
||||
"""Get a handle to a compute device.
|
||||
|
||||
Args:
|
||||
pciBusId: str
|
||||
pciBusId (str):
|
||||
String in one of the following forms: ``[domain]:[bus]:[device].[function]``,
|
||||
``[domain]:[bus]:[device]``, ``[bus]:[device].[function]`` where ``domain``, ``bus``,
|
||||
``device``, and ``function`` are all hexadecimal values.
|
||||
|
|
@ -676,7 +666,6 @@ def cudaDeviceGetByPCIBusId(pciBusId: str) -> int:
|
|||
cudaError_InvalidDevice:
|
||||
If the device ordinal supplied by the user does not correspond to a valid CUDA device.
|
||||
"""
|
||||
|
||||
fn = __cudaGetFunctionPointer('cudaDeviceGetByPCIBusId')
|
||||
|
||||
device = _ctypes.c_int()
|
||||
|
|
@ -686,7 +675,7 @@ def cudaDeviceGetByPCIBusId(pciBusId: str) -> int:
|
|||
|
||||
|
||||
def cudaDeviceGetPCIBusId(device: int) -> str:
|
||||
"""Returns a PCI Bus Id string for the device.
|
||||
"""Get a PCI Bus Id string for the device.
|
||||
|
||||
Returns: str
|
||||
An ASCII string identifying the device.
|
||||
|
|
@ -703,7 +692,6 @@ def cudaDeviceGetPCIBusId(device: int) -> str:
|
|||
cudaError_InvalidDevice:
|
||||
If the device ordinal supplied by the user does not correspond to a valid CUDA device.
|
||||
"""
|
||||
|
||||
fn = __cudaGetFunctionPointer('cudaDeviceGetPCIBusId')
|
||||
|
||||
pciBusId = _ctypes.create_string_buffer(256)
|
||||
|
|
@ -713,8 +701,7 @@ def cudaDeviceGetPCIBusId(device: int) -> str:
|
|||
|
||||
|
||||
def is_available() -> bool:
|
||||
"""Whether there are any CUDA visible devices."""
|
||||
|
||||
"""Test whether there are any CUDA visible devices."""
|
||||
try:
|
||||
return cudaGetDeviceCount() > 0
|
||||
except cudaError:
|
||||
|
|
|
|||
|
|
@ -67,9 +67,9 @@ if not callable(getattr(_pynvml, 'nvmlInitWithFlags', None)):
|
|||
|
||||
NVMLError = _pynvml.NVMLError
|
||||
NVMLError.__doc__ = """Base exception class for NVML query errors."""
|
||||
NVMLError.__new__.__doc__ = """Maps value to a proper subclass of :class:`NVMLError`."""
|
||||
NVMLError.__new__.__doc__ = """Map value to a proper subclass of :class:`NVMLError`."""
|
||||
nvmlExceptionClass = _pynvml.nvmlExceptionClass
|
||||
nvmlExceptionClass.__doc__ = """Maps value to a proper subclass of :class:`NVMLError`."""
|
||||
nvmlExceptionClass.__doc__ = """Map value to a proper subclass of :class:`NVMLError`."""
|
||||
|
||||
# Load members from module `pynvml` and register them in `__all__` and globals.
|
||||
_vars_pynvml = vars(_pynvml)
|
||||
|
|
@ -143,7 +143,7 @@ Functions and Exceptions
|
|||
|
||||
.. function:: __exit__(*args, **kwargs) -> None
|
||||
|
||||
Shutdowns the NVML context in the context manager for ``with`` statement.
|
||||
Shutdown the NVML context in the context manager for ``with`` statement.
|
||||
|
||||
""".format('\n\n'.join(_data_docs)) # fmt: skip
|
||||
|
||||
|
|
@ -203,7 +203,7 @@ VERSIONED_PATTERN = _re.compile(r'^(?P<name>\w+)(?P<suffix>_v(\d)+)$')
|
|||
|
||||
|
||||
def _lazy_init() -> None:
|
||||
"""Lazily initializes the NVML context.
|
||||
"""Lazily initialize the NVML context.
|
||||
|
||||
Raises:
|
||||
NVMLError_LibraryNotFound:
|
||||
|
|
@ -217,7 +217,6 @@ def _lazy_init() -> None:
|
|||
If cannot find function :func:`pynvml.nvmlInitWithFlags`, usually the :mod:`pynvml` module
|
||||
is overridden by other modules. Need to reinstall package ``nvidia-ml-py``.
|
||||
"""
|
||||
|
||||
with __lock:
|
||||
if __initialized:
|
||||
return
|
||||
|
|
@ -225,7 +224,7 @@ def _lazy_init() -> None:
|
|||
|
||||
|
||||
def nvmlInit() -> None: # pylint: disable=function-redefined
|
||||
"""Initializes the NVML context with default flag (0).
|
||||
"""Initialize the NVML context with default flag (0).
|
||||
|
||||
Raises:
|
||||
NVMLError_LibraryNotFound:
|
||||
|
|
@ -239,12 +238,11 @@ def nvmlInit() -> None: # pylint: disable=function-redefined
|
|||
If cannot find function :func:`pynvml.nvmlInitWithFlags`, usually the :mod:`pynvml` module
|
||||
is overridden by other modules. Need to reinstall package ``nvidia-ml-py``.
|
||||
"""
|
||||
|
||||
nvmlInitWithFlags(0)
|
||||
|
||||
|
||||
def nvmlInitWithFlags(flags: int) -> None: # pylint: disable=function-redefined
|
||||
"""Initializes the NVML context with the given flags.
|
||||
"""Initialize the NVML context with the given flags.
|
||||
|
||||
Raises:
|
||||
NVMLError_LibraryNotFound:
|
||||
|
|
@ -258,7 +256,6 @@ def nvmlInitWithFlags(flags: int) -> None: # pylint: disable=function-redefined
|
|||
If cannot find function :func:`pynvml.nvmlInitWithFlags`, usually the :mod:`pynvml` module
|
||||
is overridden by other modules. Need to reinstall package ``nvidia-ml-py``.
|
||||
"""
|
||||
|
||||
global __flags, __initialized # pylint: disable=global-statement,global-variable-not-assigned
|
||||
|
||||
with __lock:
|
||||
|
|
@ -312,7 +309,7 @@ def nvmlInitWithFlags(flags: int) -> None: # pylint: disable=function-redefined
|
|||
|
||||
|
||||
def nvmlShutdown() -> None: # pylint: disable=function-redefined
|
||||
"""Shutdowns the NVML context.
|
||||
"""Shutdown the NVML context.
|
||||
|
||||
Raises:
|
||||
NVMLError_LibraryNotFound:
|
||||
|
|
@ -325,7 +322,6 @@ def nvmlShutdown() -> None: # pylint: disable=function-redefined
|
|||
NVMLError_Uninitialized:
|
||||
If NVML was not first initialized with :func:`nvmlInit`.
|
||||
"""
|
||||
|
||||
global __flags, __initialized # pylint: disable=global-statement,global-variable-not-assigned
|
||||
|
||||
_pynvml.nvmlShutdown()
|
||||
|
|
@ -345,8 +341,9 @@ def nvmlQuery(
|
|||
ignore_function_not_found: bool = False,
|
||||
**kwargs,
|
||||
) -> _Any:
|
||||
"""Calls a function with the given arguments from NVML. The NVML context will be automatically
|
||||
initialized.
|
||||
"""Call a function with the given arguments from NVML.
|
||||
|
||||
The NVML context will be automatically initialized.
|
||||
|
||||
Args:
|
||||
func (Union[Callable[..., Any], str]):
|
||||
|
|
@ -380,7 +377,6 @@ def nvmlQuery(
|
|||
NVMLError_InvalidArgument:
|
||||
If passed with an invalid argument.
|
||||
"""
|
||||
|
||||
global UNKNOWN_FUNCTIONS # pylint: disable=global-statement,global-variable-not-assigned
|
||||
|
||||
_lazy_init()
|
||||
|
|
@ -429,8 +425,7 @@ def nvmlQuery(
|
|||
def nvmlCheckReturn(
|
||||
retval: _Any, types: _Optional[_Union[_Type, _Tuple[_Type, ...]]] = None
|
||||
) -> bool:
|
||||
"""Checks the return value is not :const:`nvitop.NA` and is one of the given types."""
|
||||
|
||||
"""Check whether the return value is not :const:`nvitop.NA` and is one of the given types."""
|
||||
if types is None:
|
||||
return retval != NA
|
||||
return retval != NA and isinstance(retval, types)
|
||||
|
|
@ -474,8 +469,6 @@ def __patch_backward_compatibility_layers() -> None:
|
|||
)
|
||||
|
||||
def patch_function_pointers_when_fail(names, callback):
|
||||
"""Patches the function pointers of the NVML library."""
|
||||
|
||||
def wrapper(nvmlGetFunctionPointer):
|
||||
@_functools.wraps(nvmlGetFunctionPointer)
|
||||
def wrapped(name):
|
||||
|
|
@ -586,7 +579,7 @@ _driver_get_memory_info_v2_available = None if not _pynvml_installation_corrupte
|
|||
|
||||
|
||||
def nvmlDeviceGetMemoryInfo(handle): # pylint: disable=function-redefined,too-many-branches
|
||||
"""Retrieves the amount of used, free, reserved and total memory available on the device, in bytes.
|
||||
"""Retrieve the amount of used, free, reserved and total memory available on the device, in bytes.
|
||||
|
||||
Note:
|
||||
- The version 2 API adds additional memory information. The reserved amount is supported on
|
||||
|
|
@ -607,7 +600,6 @@ def nvmlDeviceGetMemoryInfo(handle): # pylint: disable=function-redefined,too-m
|
|||
NVMLError_Unknown:
|
||||
On any unexpected error.
|
||||
"""
|
||||
|
||||
global _pynvml_get_memory_info_v2_available, _driver_get_memory_info_v2_available # pylint: disable=global-statement
|
||||
|
||||
_lazy_init()
|
||||
|
|
@ -702,8 +694,7 @@ class _CustomModule(_ModuleType):
|
|||
"""
|
||||
|
||||
def __getattribute__(self, name: str) -> _Union[_Any, _Callable[..., _Any]]:
|
||||
"""Gets a member from the current module. Fallback to the original package if missing."""
|
||||
|
||||
"""Get a member from the current module. Fallback to the original package if missing."""
|
||||
try:
|
||||
return super().__getattribute__(name)
|
||||
except AttributeError:
|
||||
|
|
@ -711,18 +702,15 @@ class _CustomModule(_ModuleType):
|
|||
|
||||
def __enter__(self) -> '_CustomModule':
|
||||
"""Entry of the context manager for ``with`` statement."""
|
||||
|
||||
_lazy_init()
|
||||
return self
|
||||
|
||||
def __exit__(self, *args, **kwargs) -> None:
|
||||
"""Shutdowns the NVML context in the context manager for ``with`` statement."""
|
||||
|
||||
"""Shutdown the NVML context in the context manager for ``with`` statement."""
|
||||
self.__del__()
|
||||
|
||||
def __del__(self) -> None:
|
||||
"""Automatically shutdowns the NVML context on destruction."""
|
||||
|
||||
"""Automatically shutdown the NVML context on destruction."""
|
||||
try:
|
||||
nvmlShutdown()
|
||||
except NVMLError:
|
||||
|
|
|
|||
|
|
@ -49,8 +49,7 @@ __all__ = ['HostProcess', 'GpuProcess', 'command_join']
|
|||
if host.POSIX:
|
||||
|
||||
def add_quotes(s: str) -> str:
|
||||
"""Returns a shell-escaped version of the string."""
|
||||
|
||||
"""Return a shell-escaped version of the string."""
|
||||
if s == '':
|
||||
return '""'
|
||||
if '$' not in s and '\\' not in s and '\n' not in s:
|
||||
|
|
@ -67,8 +66,7 @@ if host.POSIX:
|
|||
elif host.WINDOWS:
|
||||
|
||||
def add_quotes(s: str) -> str:
|
||||
"""Returns a shell-escaped version of the string."""
|
||||
|
||||
"""Return a shell-escaped version of the string."""
|
||||
if s == '':
|
||||
return '""'
|
||||
if '%' not in s and '^' not in s and '\n' not in s:
|
||||
|
|
@ -83,14 +81,12 @@ elif host.WINDOWS:
|
|||
else:
|
||||
|
||||
def add_quotes(s: str) -> str:
|
||||
"""Returns a shell-escaped version of the string."""
|
||||
|
||||
"""Return a shell-escaped version of the string."""
|
||||
return '"{}"'.format(s.replace('\n', r'\n'))
|
||||
|
||||
|
||||
def command_join(cmdline: List[str]) -> str:
|
||||
"""Returns a shell-escaped string from command line arguments."""
|
||||
|
||||
"""Return a shell-escaped string from command line arguments."""
|
||||
if len(cmdline) == 1 and not (
|
||||
# May be modified by `setproctitle`
|
||||
os.path.isfile(cmdline[0])
|
||||
|
|
@ -105,10 +101,10 @@ _USE_FALLBACK_WHEN_RAISE = threading.local() # see also `GpuProcess.failsafe`
|
|||
|
||||
|
||||
def auto_garbage_clean(fallback=_RAISE):
|
||||
"""Removes the object references in the instance cache if the method call fails (the process is gone).
|
||||
"""Remove the object references in the instance cache if the method call fails (the process is gone).
|
||||
|
||||
The fallback value will be used with `:meth:`GpuProcess.failsafe`` context manager, otherwise raises an
|
||||
exception when falls.
|
||||
The fallback value will be used with the :meth:`GpuProcess.failsafe` context manager, otherwise
|
||||
raises an exception when the call fails.
|
||||
"""
|
||||
|
||||
def wrapper(func: Callable[..., Any]) -> Callable[..., Any]:
|
||||
|
|
@ -143,12 +139,12 @@ def auto_garbage_clean(fallback=_RAISE):
|
|||
|
||||
|
||||
class HostProcess(host.Process, metaclass=ABCMeta):
|
||||
"""Represents an OS process with the given PID.
|
||||
If PID is omitted current process PID (:func:`os.getpid`) is used.
|
||||
The instance will be cache during the lifetime of the process.
|
||||
"""Represent an OS process with the given PID.
|
||||
|
||||
If PID is omitted current process PID (:func:`os.getpid`) is used. The instance will be cached
|
||||
during the lifetime of the process.
|
||||
|
||||
Examples:
|
||||
|
||||
>>> HostProcess() # the current process
|
||||
HostProcess(pid=12345, name='python3', status='running', started='00:55:43')
|
||||
|
||||
|
|
@ -186,8 +182,7 @@ class HostProcess(host.Process, metaclass=ABCMeta):
|
|||
INSTANCES = WeakValueDictionary()
|
||||
|
||||
def __new__(cls, pid: Optional[int] = None) -> 'HostProcess':
|
||||
"""Returns the cached instance of :class:`HostProcess`."""
|
||||
|
||||
"""Return the cached instance of :class:`HostProcess`."""
|
||||
if pid is None:
|
||||
pid = os.getpid()
|
||||
|
||||
|
|
@ -215,7 +210,7 @@ class HostProcess(host.Process, metaclass=ABCMeta):
|
|||
|
||||
# pylint: disable-next=unused-argument,super-init-not-called
|
||||
def __init__(self, pid: Optional[int] = None) -> None:
|
||||
pass
|
||||
"""Initialize the instance."""
|
||||
|
||||
@property
|
||||
def _gone(self) -> bool:
|
||||
|
|
@ -232,17 +227,20 @@ class HostProcess(host.Process, metaclass=ABCMeta):
|
|||
self._super_gone = value
|
||||
|
||||
def __str__(self) -> str:
|
||||
"""Return a string representation of the process."""
|
||||
return super().__str__().replace(self.__class__.__module__ + '.', '', 1)
|
||||
|
||||
__repr__ = __str__
|
||||
|
||||
def __reduce__(self) -> Tuple[Type['HostProcess'], Tuple[int]]:
|
||||
"""Return state information for pickling."""
|
||||
return self.__class__, (self.pid,)
|
||||
|
||||
if host.WINDOWS:
|
||||
|
||||
def username(self) -> str:
|
||||
"""The name of the user that owns the process.
|
||||
|
||||
On Windows, the domain name will be removed if it is present.
|
||||
|
||||
Raises:
|
||||
|
|
@ -251,7 +249,6 @@ class HostProcess(host.Process, metaclass=ABCMeta):
|
|||
host.AccessDenied:
|
||||
If the user does not have read privilege to the process' status file.
|
||||
"""
|
||||
|
||||
if self._username is None: # pylint: disable=access-member-before-definition
|
||||
self._username = ( # pylint: disable=attribute-defined-outside-init
|
||||
super().username().split('\\')[-1]
|
||||
|
|
@ -262,6 +259,7 @@ class HostProcess(host.Process, metaclass=ABCMeta):
|
|||
|
||||
def username(self) -> str:
|
||||
"""The name of the user that owns the process.
|
||||
|
||||
On UNIX this is calculated by using *real* process uid.
|
||||
|
||||
Raises:
|
||||
|
|
@ -270,7 +268,6 @@ class HostProcess(host.Process, metaclass=ABCMeta):
|
|||
host.AccessDenied:
|
||||
If the user does not have read privilege to the process' status file.
|
||||
"""
|
||||
|
||||
if self._username is None: # pylint: disable=access-member-before-definition
|
||||
self._username = ( # pylint: disable=attribute-defined-outside-init
|
||||
super().username()
|
||||
|
|
@ -287,14 +284,13 @@ class HostProcess(host.Process, metaclass=ABCMeta):
|
|||
host.AccessDenied:
|
||||
If the user does not have read privilege to the process' status file.
|
||||
"""
|
||||
|
||||
cmdline = super().cmdline()
|
||||
if len(cmdline) > 1:
|
||||
cmdline = '\0'.join(cmdline).rstrip('\0').split('\0')
|
||||
return cmdline
|
||||
|
||||
def command(self) -> str:
|
||||
"""Returns a shell-escaped string from command line arguments.
|
||||
"""Return a shell-escaped string from command line arguments.
|
||||
|
||||
Raises:
|
||||
host.NoSuchProcess:
|
||||
|
|
@ -302,7 +298,6 @@ class HostProcess(host.Process, metaclass=ABCMeta):
|
|||
host.AccessDenied:
|
||||
If the user does not have read privilege to the process' status file.
|
||||
"""
|
||||
|
||||
return command_join(self.cmdline())
|
||||
|
||||
@memoize_when_activated
|
||||
|
|
@ -315,7 +310,6 @@ class HostProcess(host.Process, metaclass=ABCMeta):
|
|||
host.AccessDenied:
|
||||
If the user does not have read privilege to the process' status file.
|
||||
"""
|
||||
|
||||
return datetime.datetime.now() - datetime.datetime.fromtimestamp(self.create_time())
|
||||
|
||||
def running_time_human(self) -> str:
|
||||
|
|
@ -327,7 +321,6 @@ class HostProcess(host.Process, metaclass=ABCMeta):
|
|||
host.AccessDenied:
|
||||
If the user does not have read privilege to the process' status file.
|
||||
"""
|
||||
|
||||
return timedelta2human(self.running_time())
|
||||
|
||||
def running_time_in_seconds(self) -> float: # in seconds
|
||||
|
|
@ -339,7 +332,6 @@ class HostProcess(host.Process, metaclass=ABCMeta):
|
|||
host.AccessDenied:
|
||||
If the user does not have read privilege to the process' status file.
|
||||
"""
|
||||
|
||||
return self.running_time().total_seconds()
|
||||
|
||||
elapsed_time = running_time
|
||||
|
|
@ -355,11 +347,10 @@ class HostProcess(host.Process, metaclass=ABCMeta):
|
|||
host.AccessDenied:
|
||||
If the user does not have read privilege to the process' status file.
|
||||
"""
|
||||
|
||||
return self.memory_info().rss
|
||||
|
||||
def parent(self) -> Union['HostProcess', None]:
|
||||
"""Returns the parent process as a :class:`HostProcess` instance. Returns :data:`None` if there is no parent.
|
||||
"""Return the parent process as a :class:`HostProcess` instance or :data:`None` if there is no parent.
|
||||
|
||||
Raises:
|
||||
host.NoSuchProcess:
|
||||
|
|
@ -367,7 +358,6 @@ class HostProcess(host.Process, metaclass=ABCMeta):
|
|||
host.AccessDenied:
|
||||
If the user does not have read privilege to the process' status file.
|
||||
"""
|
||||
|
||||
parent = super().parent()
|
||||
if parent is not None:
|
||||
return HostProcess(parent.pid)
|
||||
|
|
@ -375,6 +365,7 @@ class HostProcess(host.Process, metaclass=ABCMeta):
|
|||
|
||||
def children(self, recursive: bool = False) -> List['HostProcess']:
|
||||
"""Return the children of this process as a list of :class:`HostProcess` instances.
|
||||
|
||||
If *recursive* is :data:`True` return all the descendants.
|
||||
|
||||
Raises:
|
||||
|
|
@ -383,13 +374,11 @@ class HostProcess(host.Process, metaclass=ABCMeta):
|
|||
host.AccessDenied:
|
||||
If the user does not have read privilege to the process' status file.
|
||||
"""
|
||||
|
||||
return [HostProcess(child.pid) for child in super().children(recursive)]
|
||||
|
||||
@contextlib.contextmanager
|
||||
def oneshot(self):
|
||||
"""Utility context manager which considerably speeds up the retrieval of multiple process
|
||||
information at the same time.
|
||||
"""A utility context manager which considerably speeds up the retrieval of multiple process information at the same time.
|
||||
|
||||
Internally different process info (e.g. name, ppid, uids, gids, ...) may be fetched by using
|
||||
the same routine, but only one information is returned and the others are discarded. When
|
||||
|
|
@ -400,7 +389,6 @@ class HostProcess(host.Process, metaclass=ABCMeta):
|
|||
time you retrieve more than one information about the process.
|
||||
|
||||
Examples:
|
||||
|
||||
>>> from nvitop import HostProcess
|
||||
>>> p = HostProcess()
|
||||
>>> with p.oneshot():
|
||||
|
|
@ -408,8 +396,7 @@ class HostProcess(host.Process, metaclass=ABCMeta):
|
|||
... p.cpu_times() # return cached value
|
||||
... p.cpu_percent() # return cached value
|
||||
... p.create_time() # return cached value
|
||||
"""
|
||||
|
||||
""" # pylint: disable=line-too-long
|
||||
with self._lock:
|
||||
if hasattr(self, '_cache'):
|
||||
yield
|
||||
|
|
@ -427,8 +414,7 @@ class HostProcess(host.Process, metaclass=ABCMeta):
|
|||
def as_snapshot(
|
||||
self, attrs: Optional[Iterable[str]] = None, ad_value: Optional[Any] = None
|
||||
) -> Snapshot:
|
||||
"""Returns a onetime snapshot of the process."""
|
||||
|
||||
"""Return a onetime snapshot of the process."""
|
||||
with self.oneshot():
|
||||
attributes = self.as_dict(attrs=attrs, ad_value=ad_value)
|
||||
|
||||
|
|
@ -444,11 +430,12 @@ class HostProcess(host.Process, metaclass=ABCMeta):
|
|||
|
||||
@HostProcess.register
|
||||
class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-public-methods
|
||||
"""Represents a process with the given PID running on the given GPU device.
|
||||
"""Represent a process with the given PID running on the given GPU device.
|
||||
|
||||
The instance will be cached during the lifetime of the process.
|
||||
|
||||
The same host process can use multiple GPU devices. The :class:`GpuProcess` instances representing the
|
||||
same PID on the host but different GPU devices are different.
|
||||
The same host process can use multiple GPU devices. The :class:`GpuProcess` instances
|
||||
representing the same PID on the host but different GPU devices are different.
|
||||
"""
|
||||
|
||||
INSTANCE_LOCK = threading.RLock()
|
||||
|
|
@ -466,8 +453,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
type: Optional[Union[str, NaType]] = None, # pylint: disable=redefined-builtin
|
||||
# pylint: enable=unused-argument
|
||||
) -> 'GpuProcess':
|
||||
"""Returns the cached instance of :class:`GpuProcess`."""
|
||||
|
||||
"""Return the cached instance of :class:`GpuProcess`."""
|
||||
if pid is None:
|
||||
pid = os.getpid()
|
||||
|
||||
|
|
@ -503,8 +489,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
compute_instance_id: Optional[Union[int, NaType]] = None,
|
||||
type: Optional[Union[str, NaType]] = None, # pylint: disable=redefined-builtin
|
||||
) -> None:
|
||||
"""Initializes the instance returned by :meth:`__new__()`."""
|
||||
|
||||
"""Initialize the instance returned by :meth:`__new__()`."""
|
||||
if gpu_memory is None and not hasattr(self, '_gpu_memory'):
|
||||
gpu_memory = NA
|
||||
if gpu_memory is not None:
|
||||
|
|
@ -531,6 +516,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
setattr(self, f'_gpu_{util}_utilization', NA)
|
||||
|
||||
def __str__(self) -> str:
|
||||
"""Return a string representation of the GPU process."""
|
||||
return '{}(pid={}, gpu_memory={}, type={}, device={}, host={})'.format(
|
||||
self.__class__.__name__,
|
||||
self.pid,
|
||||
|
|
@ -543,20 +529,19 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
__repr__ = __str__
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
"""Test equality to other object."""
|
||||
if not isinstance(other, (GpuProcess, host.Process)):
|
||||
return NotImplemented
|
||||
return self._ident == other._ident
|
||||
|
||||
def __ne__(self, other: object) -> bool:
|
||||
return not self == other
|
||||
|
||||
def __hash__(self) -> int:
|
||||
"""Return a hash value of the GPU process."""
|
||||
if self._hash is None: # pylint: disable=access-member-before-definition
|
||||
self._hash = hash(self._ident) # pylint: disable=attribute-defined-outside-init
|
||||
return self._hash
|
||||
|
||||
def __getattr__(self, name: str) -> Union[Any, Callable[..., Any]]:
|
||||
"""Gets a member from the instance. Fallback to the host process instance if missing.
|
||||
"""Get a member from the instance or fallback to the host process instance if missing.
|
||||
|
||||
Raises:
|
||||
AttributeError:
|
||||
|
|
@ -566,7 +551,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
host.AccessDenied:
|
||||
If the user does not have read privilege to the process' status file.
|
||||
"""
|
||||
|
||||
try:
|
||||
return super().__getattr__(name)
|
||||
except AttributeError:
|
||||
|
|
@ -582,74 +566,60 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
@property
|
||||
def pid(self) -> int:
|
||||
"""The process PID."""
|
||||
|
||||
return self._pid
|
||||
|
||||
@property
|
||||
def host(self) -> HostProcess:
|
||||
"""The process instance running on the host."""
|
||||
|
||||
return self._host
|
||||
|
||||
@property
|
||||
def device(self) -> 'Device':
|
||||
"""The GPU device the process is running on.
|
||||
|
||||
The same host process can use multiple GPU devices.
|
||||
The :class:`GpuProcess` instances representing the same PID on the host
|
||||
but different GPU devices are different.
|
||||
The same host process can use multiple GPU devices. The :class:`GpuProcess` instances
|
||||
representing the same PID on the host but different GPU devices are different.
|
||||
"""
|
||||
|
||||
return self._device
|
||||
|
||||
def gpu_instance_id(self) -> Union[int, NaType]:
|
||||
"""The GPU instance ID of the MIG device, or :const:`nvitop.NA` if not applicable."""
|
||||
|
||||
return self._gpu_instance_id
|
||||
|
||||
def compute_instance_id(self) -> Union[int, NaType]:
|
||||
"""The compute instance ID of the MIG device, or :const:`nvitop.NA` if not applicable."""
|
||||
|
||||
return self._compute_instance_id
|
||||
|
||||
def gpu_memory(self) -> Union[int, NaType]: # in bytes
|
||||
"""The used GPU memory in bytes, or :const:`nvitop.NA` if not applicable."""
|
||||
|
||||
return self._gpu_memory
|
||||
|
||||
def gpu_memory_human(self) -> Union[str, NaType]: # in human readable
|
||||
"""The used GPU memory in human readable format, or :const:`nvitop.NA` if not applicable."""
|
||||
|
||||
return self._gpu_memory_human
|
||||
|
||||
def gpu_memory_percent(self) -> Union[float, NaType]: # in percentage
|
||||
"""The percentage of used GPU memory by the process, or :const:`nvitop.NA` if not applicable."""
|
||||
|
||||
return self._gpu_memory_percent
|
||||
|
||||
def gpu_sm_utilization(self) -> Union[int, NaType]: # in percentage
|
||||
"""The utilization rate of SM (Streaming Multiprocessor), or :const:`nvitop.NA` if not applicable."""
|
||||
|
||||
return self._gpu_sm_utilization
|
||||
|
||||
def gpu_memory_utilization(self) -> Union[int, NaType]: # in percentage
|
||||
"""The utilization rate of GPU memory bandwidth, or :const:`nvitop.NA` if not applicable."""
|
||||
|
||||
return self._gpu_memory_utilization
|
||||
|
||||
def gpu_encoder_utilization(self) -> Union[int, NaType]: # in percentage
|
||||
"""The utilization rate of the encoder, or :const:`nvitop.NA` if not applicable."""
|
||||
|
||||
return self._gpu_encoder_utilization
|
||||
|
||||
def gpu_decoder_utilization(self) -> Union[int, NaType]: # in percentage
|
||||
"""The utilization rate of the decoder, or :const:`nvitop.NA` if not applicable."""
|
||||
|
||||
return self._gpu_decoder_utilization
|
||||
|
||||
def set_gpu_memory(self, value: Union[int, NaType]) -> None:
|
||||
"""Sets the used GPU memory in bytes."""
|
||||
|
||||
"""Set the used GPU memory in bytes."""
|
||||
# pylint: disable=attribute-defined-outside-init
|
||||
self._gpu_memory = memory_used = value
|
||||
self._gpu_memory_human = bytes2human(self.gpu_memory())
|
||||
|
|
@ -666,8 +636,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
gpu_encoder_utilization: Optional[int] = None,
|
||||
gpu_decoder_utilization: Optional[int] = None,
|
||||
) -> None:
|
||||
"""Sets the GPU utilization rates."""
|
||||
|
||||
"""Set the GPU utilization rates."""
|
||||
# pylint: disable=attribute-defined-outside-init
|
||||
if gpu_sm_utilization is not None:
|
||||
self._gpu_sm_utilization = gpu_sm_utilization
|
||||
|
|
@ -679,8 +648,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
self._gpu_decoder_utilization = gpu_decoder_utilization
|
||||
|
||||
def update_gpu_status(self) -> Union[int, NaType]:
|
||||
"""Updates the GPU consumption status from a new NVML query."""
|
||||
|
||||
"""Update the GPU consumption status from a new NVML query."""
|
||||
self.set_gpu_memory(NA)
|
||||
self.set_gpu_utilization(NA, NA, NA, NA)
|
||||
self.device.processes.cache_clear()
|
||||
|
|
@ -697,7 +665,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
- :data:`'C+G'`: both compute context and graphics context
|
||||
- :data:`'N/A'`: not applicable
|
||||
"""
|
||||
|
||||
return self._type
|
||||
|
||||
@type.setter
|
||||
|
|
@ -713,8 +680,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
|
||||
@auto_garbage_clean(fallback=False)
|
||||
def is_running(self) -> bool:
|
||||
"""Returns whether this process is running."""
|
||||
|
||||
"""Return whether this process is running."""
|
||||
return self.host.is_running()
|
||||
|
||||
@auto_garbage_clean(fallback='terminated')
|
||||
|
|
@ -731,7 +697,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
To return the fallback value rather than raise an exception, please use the context
|
||||
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
|
||||
"""
|
||||
|
||||
return self.host.status()
|
||||
|
||||
@auto_garbage_clean(fallback=NA)
|
||||
|
|
@ -748,7 +713,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
To return the fallback value rather than raise an exception, please use the context
|
||||
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
|
||||
"""
|
||||
|
||||
return self.host.create_time()
|
||||
|
||||
@auto_garbage_clean(fallback=NA)
|
||||
|
|
@ -765,7 +729,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
To return the fallback value rather than raise an exception, please use the context
|
||||
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
|
||||
"""
|
||||
|
||||
return self.host.running_time()
|
||||
|
||||
def running_time_human(self) -> Union[str, NaType]:
|
||||
|
|
@ -781,7 +744,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
To return the fallback value rather than raise an exception, please use the context
|
||||
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
|
||||
"""
|
||||
|
||||
return timedelta2human(self.running_time())
|
||||
|
||||
def running_time_in_seconds(self) -> Union[float, NaType]:
|
||||
|
|
@ -797,7 +759,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
To return the fallback value rather than raise an exception, please use the context
|
||||
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
|
||||
"""
|
||||
|
||||
running_time = self.running_time()
|
||||
if running_time is NA:
|
||||
return NA
|
||||
|
|
@ -821,7 +782,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
To return the fallback value rather than raise an exception, please use the context
|
||||
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
|
||||
"""
|
||||
|
||||
if self._username is None: # pylint: disable=access-member-before-definition
|
||||
self._username = self.host.username() # pylint: disable=attribute-defined-outside-init
|
||||
return self._username
|
||||
|
|
@ -840,12 +800,11 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
To return the fallback value rather than raise an exception, please use the context
|
||||
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
|
||||
"""
|
||||
|
||||
return self.host.name()
|
||||
|
||||
@auto_garbage_clean(fallback=NA)
|
||||
def cpu_percent(self) -> Union[float, NaType]: # in percentage
|
||||
"""Returns a float representing the current process CPU utilization as a percentage.
|
||||
"""Return a float representing the current process CPU utilization as a percentage.
|
||||
|
||||
Raises:
|
||||
host.NoSuchProcess:
|
||||
|
|
@ -857,13 +816,11 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
To return the fallback value rather than raise an exception, please use the context
|
||||
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
|
||||
"""
|
||||
|
||||
return self.host.cpu_percent()
|
||||
|
||||
@auto_garbage_clean(fallback=NA)
|
||||
def memory_percent(self) -> Union[float, NaType]: # in percentage
|
||||
"""Compares process RSS memory to total physical system memory
|
||||
and calculate process memory utilization as a percentage.
|
||||
"""Compare process RSS memory to total physical system memory and calculate process memory utilization as a percentage.
|
||||
|
||||
Raises:
|
||||
host.NoSuchProcess:
|
||||
|
|
@ -874,8 +831,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
Note:
|
||||
To return the fallback value rather than raise an exception, please use the context
|
||||
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
|
||||
"""
|
||||
|
||||
""" # pylint: disable=line-too-long
|
||||
return self.host.memory_percent()
|
||||
|
||||
host_memory_percent = memory_percent # in percentage
|
||||
|
|
@ -894,7 +850,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
To return the fallback value rather than raise an exception, please use the context
|
||||
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
|
||||
"""
|
||||
|
||||
return self.host.rss_memory()
|
||||
|
||||
def host_memory_human(self) -> Union[str, NaType]:
|
||||
|
|
@ -910,7 +865,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
To return the fallback value rather than raise an exception, please use the context
|
||||
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
|
||||
"""
|
||||
|
||||
return bytes2human(self.host_memory())
|
||||
|
||||
rss_memory = host_memory # in bytes
|
||||
|
|
@ -930,14 +884,13 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
To return the fallback value rather than raise an exception, please use the context
|
||||
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
|
||||
"""
|
||||
|
||||
cmdline = self.host.cmdline()
|
||||
if len(cmdline) == 0 and not self._gone:
|
||||
cmdline = ['Zombie Process']
|
||||
return cmdline
|
||||
|
||||
def command(self) -> str:
|
||||
"""Returns a shell-escaped string from command line arguments.
|
||||
"""Return a shell-escaped string from command line arguments.
|
||||
|
||||
Raises:
|
||||
host.NoSuchProcess:
|
||||
|
|
@ -949,13 +902,11 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
To return the fallback value rather than raise an exception, please use the context
|
||||
manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`.
|
||||
"""
|
||||
|
||||
return command_join(self.cmdline())
|
||||
|
||||
@auto_garbage_clean(fallback=_RAISE)
|
||||
def host_snapshot(self) -> Snapshot:
|
||||
"""Returns a onetime snapshot of the host process."""
|
||||
|
||||
"""Return a onetime snapshot of the host process."""
|
||||
with self.host.oneshot():
|
||||
host_snapshot = Snapshot(
|
||||
real=self.host,
|
||||
|
|
@ -980,7 +931,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
def as_snapshot(
|
||||
self, *, host_process_snapshot_cache: Optional[Dict[int, Snapshot]] = None
|
||||
) -> Snapshot:
|
||||
"""Returns a onetime snapshot of the process on the GPU device.
|
||||
"""Return a onetime snapshot of the process on the GPU device.
|
||||
|
||||
Note:
|
||||
To return the fallback value rather than raise an exception, please use the context
|
||||
|
|
@ -988,7 +939,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
snapshots with :meth:`GpuProcess.take_snapshots`, which caches the results and reduces
|
||||
redundant queries. See also :meth:`take_snapshots` and :meth:`failsafe`.
|
||||
"""
|
||||
|
||||
host_process_snapshot_cache = host_process_snapshot_cache or {}
|
||||
try:
|
||||
host_snapshot = host_process_snapshot_cache[self.pid]
|
||||
|
|
@ -1031,12 +981,11 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
def take_snapshots( # batched version of `as_snapshot`
|
||||
cls, gpu_processes: Iterable['GpuProcess'], *, failsafe=False
|
||||
) -> List[Snapshot]:
|
||||
"""Takes snapshots for a list of :class:`GpuProcess` instances.
|
||||
"""Take snapshots for a list of :class:`GpuProcess` instances.
|
||||
|
||||
If *failsafe* is :data:`True`, then if any method fails, the fallback value in
|
||||
:func:`auto_garbage_clean` will be used.
|
||||
"""
|
||||
|
||||
cache = {}
|
||||
context = cls.failsafe if failsafe else contextlib.nullcontext
|
||||
with context():
|
||||
|
|
@ -1052,7 +1001,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
"""A context manager that enables fallback values for methods that fail.
|
||||
|
||||
Examples:
|
||||
|
||||
>>> p = GpuProcess(pid=10000, device=Device(0)) # process does not exist
|
||||
>>> p
|
||||
GpuProcess(pid=10000, gpu_memory=N/A, type=N/A, device=PhysicalDevice(index=0, name="NVIDIA GeForce RTX 3070", total_memory=8192MiB), host=HostProcess(pid=10000, status='terminated'))
|
||||
|
|
@ -1070,7 +1018,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi
|
|||
fallback (float cast): nan
|
||||
fallback (int cast): 0
|
||||
""" # pylint: disable=line-too-long
|
||||
|
||||
global _USE_FALLBACK_WHEN_RAISE # pylint: disable=global-statement,global-variable-not-assigned
|
||||
|
||||
prev_value = getattr(_USE_FALLBACK_WHEN_RAISE, 'value', False)
|
||||
|
|
|
|||
|
|
@ -76,8 +76,7 @@ COLOR = sys.stdout.isatty()
|
|||
|
||||
|
||||
def set_color(value: bool) -> None:
|
||||
"""Force enables text coloring."""
|
||||
|
||||
"""Force enable text coloring."""
|
||||
global COLOR # pylint: disable=global-statement
|
||||
COLOR = bool(value)
|
||||
|
||||
|
|
@ -88,7 +87,7 @@ def colored(
|
|||
on_color: Optional[str] = None,
|
||||
attrs: Iterable[str] = None,
|
||||
) -> str:
|
||||
"""Colorizes text.
|
||||
"""Colorize text with ANSI color escape codes.
|
||||
|
||||
Available text colors:
|
||||
red, green, yellow, blue, magenta, cyan, white.
|
||||
|
|
@ -100,11 +99,9 @@ def colored(
|
|||
bold, dark, underline, blink, reverse, concealed.
|
||||
|
||||
Examples:
|
||||
|
||||
>>> colored('Hello, World!', 'red', 'on_grey', ['blue', 'blink'])
|
||||
>>> colored('Hello, World!', 'green')
|
||||
"""
|
||||
|
||||
if COLOR:
|
||||
return _colored(text, color=color, on_color=on_color, attrs=attrs)
|
||||
return text
|
||||
|
|
@ -114,11 +111,10 @@ class NaType(str):
|
|||
"""A singleton (:const:`str: 'N/A'`) class that represents a not applicable value.
|
||||
|
||||
The :const:`NA` instance behaves like a :class:`str` instance (:const:`'N/A'`) when doing string
|
||||
manipulation (e.g. concatenation). For arithmetic operations, for example :code:`NA / 1024 / 1024`,
|
||||
manipulation (e.g. concatenation). For arithmetic operations, for example ``NA / 1024 / 1024``,
|
||||
it acts like the :data:`math.nan`.
|
||||
|
||||
Examples:
|
||||
|
||||
>>> NA
|
||||
'N/A'
|
||||
|
||||
|
|
@ -142,45 +138,42 @@ class NaType(str):
|
|||
"""
|
||||
|
||||
def __new__(cls) -> 'NaType':
|
||||
"""Gets the singleton instance (:const:`nvitop.NA`)."""
|
||||
|
||||
"""Get the singleton instance (:const:`nvitop.NA`)."""
|
||||
if not hasattr(cls, '_instance'):
|
||||
cls._instance = super().__new__(cls, 'N/A')
|
||||
return cls._instance
|
||||
|
||||
def __bool__(self) -> bool:
|
||||
"""Converts :const:`NA` to :class:`bool`. Returns :data:`False`.
|
||||
"""Convert :const:`NA` to :class:`bool` and return :data:`False`.
|
||||
|
||||
>>> bool(NA)
|
||||
False
|
||||
"""
|
||||
|
||||
return False
|
||||
|
||||
def __int__(self) -> int:
|
||||
"""Converts :const:`NA` to :class:`int`. Returns :const:`0`.
|
||||
"""Convert :const:`NA` to :class:`int` and return :const:`0`.
|
||||
|
||||
>>> int(NA)
|
||||
0
|
||||
"""
|
||||
|
||||
return 0
|
||||
|
||||
def __float__(self) -> float:
|
||||
"""Converts :const:`NA` to :class:`float`. Returns :data:`math.nan`.
|
||||
"""Convert :const:`NA` to :class:`float` and return :data:`math.nan`.
|
||||
|
||||
>>> float(NA)
|
||||
nan
|
||||
>>> float(NA) is math.nan
|
||||
True
|
||||
"""
|
||||
|
||||
return math.nan
|
||||
|
||||
def __add__(self, other: object) -> Union[str, float]:
|
||||
""":const:`nvitop.NA` + other: Returns :data:`math.nan` if the operand is a number or uses
|
||||
string concatenation if the operand is a string. A special case is when the operand is
|
||||
:const:`nvitop.NA` itself, the result is :data:`math.nan` instead of :const:`'N/AN/A'`.
|
||||
"""Return :data:`math.nan` if the operand is a number or uses string concatenation if the operand is a string (``NA + other``).
|
||||
|
||||
A special case is when the operand is :const:`nvitop.NA` itself, the result is
|
||||
:data:`math.nan` instead of :const:`'N/AN/A'`.
|
||||
|
||||
>>> NA + ' str'
|
||||
'N/A str'
|
||||
|
|
@ -190,14 +183,13 @@ class NaType(str):
|
|||
nan
|
||||
>>> NA + 1.0
|
||||
nan
|
||||
"""
|
||||
|
||||
""" # pylint: disable=line-too-long
|
||||
if isinstance(other, (int, float)) or other is NA:
|
||||
return float(self) + other
|
||||
return super().__add__(other)
|
||||
|
||||
def __radd__(self, other: object) -> Union[str, float]:
|
||||
"""other + :const:`nvitop.NA`: Returns :data:`math.nan` if the operand is a number.
|
||||
"""Return :data:`math.nan` if the operand is a number or uses string concatenation if the operand is a string (``other + NA``).
|
||||
|
||||
>>> 'str' + NA
|
||||
'strN/A'
|
||||
|
|
@ -205,14 +197,13 @@ class NaType(str):
|
|||
nan
|
||||
>>> 1.0 + NA
|
||||
nan
|
||||
"""
|
||||
|
||||
""" # pylint: disable=line-too-long
|
||||
if isinstance(other, (int, float)):
|
||||
return other + float(self)
|
||||
return NotImplemented
|
||||
|
||||
def __sub__(self, other: object) -> float:
|
||||
""":const:`nvitop.NA` - other: Returns :data:`math.nan` if the operand is a number.
|
||||
"""Return :data:`math.nan` if the operand is a number (``NA - other``).
|
||||
|
||||
>>> NA - 'str'
|
||||
TypeError: unsupported operand type(s) for -: 'NaType' and 'str'
|
||||
|
|
@ -223,13 +214,12 @@ class NaType(str):
|
|||
>>> NA + 1.0
|
||||
nan
|
||||
"""
|
||||
|
||||
if isinstance(other, (int, float)) or other is NA:
|
||||
return float(self) - other
|
||||
return NotImplemented
|
||||
|
||||
def __rsub__(self, other: object) -> float:
|
||||
"""other - :const:`nvitop.NA`: Returns :data:`math.nan` if the operand is a number.
|
||||
"""Return :data:`math.nan` if the operand is a number (``other - NA``).
|
||||
|
||||
>>> 'str' - NA
|
||||
TypeError: unsupported operand type(s) for -: 'str' and 'NaType'
|
||||
|
|
@ -238,14 +228,14 @@ class NaType(str):
|
|||
>>> 1.0 - NA
|
||||
nan
|
||||
"""
|
||||
|
||||
if isinstance(other, (int, float)):
|
||||
return other - float(self)
|
||||
return NotImplemented
|
||||
|
||||
def __mul__(self, other: object) -> float:
|
||||
""":const:`nvitop.NA` * other: Returns :data:`math.nan` if the operand is a number. A special
|
||||
case is when the operand is :const:`nvitop.NA` itself, the result is also :data:`math.nan`.
|
||||
"""Return :data:`math.nan` if the operand is a number (``NA * other``).
|
||||
|
||||
A special case is when the operand is :const:`nvitop.NA` itself, the result is also :data:`math.nan`.
|
||||
|
||||
>>> NA * 1024
|
||||
nan
|
||||
|
|
@ -254,26 +244,24 @@ class NaType(str):
|
|||
>>> NA * NA
|
||||
nan
|
||||
"""
|
||||
|
||||
if isinstance(other, (int, float)) or other is NA:
|
||||
return float(self) * other
|
||||
return NotImplemented
|
||||
|
||||
def __rmul__(self, other: object) -> float:
|
||||
"""other * :const:`nvitop.NA`: Returns :data:`math.nan` if the operand is a number.
|
||||
"""Return :data:`math.nan` if the operand is a number (``other * NA``).
|
||||
|
||||
>>> 1024 * NA
|
||||
nan
|
||||
>>> 1024.0 * NA
|
||||
nan
|
||||
"""
|
||||
|
||||
if isinstance(other, (int, float)):
|
||||
return other * float(self)
|
||||
return NotImplemented
|
||||
|
||||
def __truediv__(self, other: object) -> float:
|
||||
""":const:`nvitop.NA` / other: Returns :data:`math.nan` if the operand is a number.
|
||||
"""Return :data:`math.nan` if the operand is a number (``NA / other``).
|
||||
|
||||
>>> NA / 1024
|
||||
nan
|
||||
|
|
@ -284,26 +272,24 @@ class NaType(str):
|
|||
>>> NA / 0.0
|
||||
ZeroDivisionError: float division by zero
|
||||
"""
|
||||
|
||||
if isinstance(other, (int, float)):
|
||||
return float(self) / other
|
||||
return NotImplemented
|
||||
|
||||
def __rtruediv__(self, other: object) -> float:
|
||||
"""other / :const:`nvitop.NA`: Returns :data:`math.nan` if the operand is a number.
|
||||
"""Return :data:`math.nan` if the operand is a number (``other / NA``).
|
||||
|
||||
>>> 1024 / NA
|
||||
nan
|
||||
>>> 1024.0 / NA
|
||||
nan
|
||||
"""
|
||||
|
||||
if isinstance(other, (int, float)):
|
||||
return other / float(self)
|
||||
return NotImplemented
|
||||
|
||||
def __floordiv__(self, other: object) -> float:
|
||||
""":const:`nvitop.NA` // other: Returns :data:`math.nan` if the operand is a number.
|
||||
"""Return :data:`math.nan` if the operand is a number (``NA // other``).
|
||||
|
||||
>>> NA // 1024
|
||||
nan
|
||||
|
|
@ -314,26 +300,24 @@ class NaType(str):
|
|||
>>> NA / 0.0
|
||||
ZeroDivisionError: float division by zero
|
||||
"""
|
||||
|
||||
if isinstance(other, (int, float)):
|
||||
return float(self) // other
|
||||
return NotImplemented
|
||||
|
||||
def __rfloordiv__(self, other: object) -> float:
|
||||
"""other // :const:`nvitop.NA`: Returns :data:`math.nan` if the operand is a number.
|
||||
"""Return :data:`math.nan` if the operand is a number (``other // NA``).
|
||||
|
||||
>>> 1024 // NA
|
||||
nan
|
||||
>>> 1024.0 // NA
|
||||
nan
|
||||
"""
|
||||
|
||||
if isinstance(other, (int, float)):
|
||||
return other // float(self)
|
||||
return NotImplemented
|
||||
|
||||
def __mod__(self, other: object) -> float:
|
||||
""":const:`nvitop.NA` % other: Returns :data:`math.nan` if the operand is a number.
|
||||
"""Return :data:`math.nan` if the operand is a number (``NA % other``).
|
||||
|
||||
>>> NA % 1024
|
||||
nan
|
||||
|
|
@ -344,26 +328,24 @@ class NaType(str):
|
|||
>>> NA % 0.0
|
||||
ZeroDivisionError: float modulo
|
||||
"""
|
||||
|
||||
if isinstance(other, (int, float)):
|
||||
return float(self) % other
|
||||
return NotImplemented
|
||||
|
||||
def __rmod__(self, other: object) -> float:
|
||||
"""other % :const:`nvitop.NA`: Returns :data:`math.nan` if the operand is a number.
|
||||
"""Return :data:`math.nan` if the operand is a number (``other % NA``).
|
||||
|
||||
>>> 1024 % NA
|
||||
nan
|
||||
>>> 1024.0 % NA
|
||||
nan
|
||||
"""
|
||||
|
||||
if isinstance(other, (int, float)):
|
||||
return other % float(self)
|
||||
return NotImplemented
|
||||
|
||||
def __divmod__(self, other: object) -> Tuple[float, float]:
|
||||
"""divmod(:const:`nvitop.NA`, other): The pair (:const:`nvitop.NA` // other, :const:`nvitop.NA` % other).
|
||||
"""The pair ``(NA // other, NA % other)`` (``divmod(NA, other)``).
|
||||
|
||||
>>> divmod(NA, 1024)
|
||||
(nan, nan)
|
||||
|
|
@ -374,49 +356,44 @@ class NaType(str):
|
|||
>>> divmod(NA, 0.0)
|
||||
ZeroDivisionError: float floor division by zero
|
||||
"""
|
||||
|
||||
return (self // other, self % other)
|
||||
|
||||
def __rdivmod__(self, other: object) -> Tuple[float, float]:
|
||||
"""divmod(other, :const:`nvitop.NA`): The pair (other // :const:`nvitop.NA`, other % :const:`nvitop.NA`).
|
||||
"""The pair ``(other // NA, other % NA)`` (``divmod(other, NA)``).
|
||||
|
||||
>>> divmod(1024, NA)
|
||||
(nan, nan)
|
||||
>>> divmod(1024.0, NA)
|
||||
(nan, nan)
|
||||
"""
|
||||
|
||||
return (other // self, other % self)
|
||||
|
||||
def __pos__(self) -> float:
|
||||
"""+:const:`nvitop.NA`: Returns :data:`math.nan`.
|
||||
"""Return :data:`math.nan` (``+NA``).
|
||||
|
||||
>>> +NA
|
||||
nan
|
||||
"""
|
||||
|
||||
return +float(self)
|
||||
|
||||
def __neg__(self) -> float:
|
||||
"""+:const:`nvitop.NA`: Returns :data:`math.nan`.
|
||||
"""Return :data:`math.nan` (``-NA``).
|
||||
|
||||
>>> -NA
|
||||
nan
|
||||
"""
|
||||
|
||||
return -float(self)
|
||||
|
||||
def __abs__(self) -> float:
|
||||
"""abs(NA): Returns :data:`math.nan`.
|
||||
"""Return :data:`math.nan` (``abs(NA)``).
|
||||
|
||||
>>> abs(NA)
|
||||
nan
|
||||
"""
|
||||
|
||||
return abs(float(self))
|
||||
|
||||
def __round__(self, ndigits: Optional[int] = None) -> Union[int, float]:
|
||||
"""Rounds :const:`nvitop.NA` to ``ndigits`` decimal places, defaulting to :const:`0`.
|
||||
"""Round :const:`nvitop.NA` to ``ndigits`` decimal places, defaulting to :const:`0`.
|
||||
|
||||
If ``ndigits`` is omitted or :data:`None`, returns :const:`0`, otherwise returns :data:`math.nan`.
|
||||
|
||||
|
|
@ -427,40 +404,36 @@ class NaType(str):
|
|||
>>> round(NA, 1)
|
||||
nan
|
||||
"""
|
||||
|
||||
if ndigits is None:
|
||||
return int(self)
|
||||
return round(float(self), ndigits)
|
||||
|
||||
def __lt__(self, x: object) -> bool:
|
||||
"""The :const:`nvitop.NA` is always greater than any number. Use the dictionary order for string."""
|
||||
|
||||
"""The :const:`nvitop.NA` is always greater than any number, or uses the dictionary order for strings."""
|
||||
if isinstance(x, (int, float)):
|
||||
return False
|
||||
return super().__lt__(x)
|
||||
|
||||
def __le__(self, x: object) -> bool:
|
||||
"""The :const:`nvitop.NA` is always greater than any number. Use the dictionary order for string."""
|
||||
|
||||
"""The :const:`nvitop.NA` is always greater than any number, or uses the dictionary order for strings."""
|
||||
if isinstance(x, (int, float)):
|
||||
return False
|
||||
return super().__le__(x)
|
||||
|
||||
def __gt__(self, x: object) -> bool:
|
||||
"""The :const:`nvitop.NA` is always greater than any number. Use the dictionary order for string."""
|
||||
|
||||
"""The :const:`nvitop.NA` is always greater than any number, or uses the dictionary order for strings."""
|
||||
if isinstance(x, (int, float)):
|
||||
return True
|
||||
return super().__gt__(x)
|
||||
|
||||
def __ge__(self, x: object) -> bool:
|
||||
"""The :const:`nvitop.NA` is always greater than any number. Use the dictionary order for string."""
|
||||
|
||||
"""The :const:`nvitop.NA` is always greater than any number, or uses the dictionary order for strings."""
|
||||
if isinstance(x, (int, float)):
|
||||
return True
|
||||
return super().__ge__(x)
|
||||
|
||||
def __format__(self, format_spec: str) -> str:
|
||||
"""Format :const:`nvitop.NA` according to ``format_spec``."""
|
||||
try:
|
||||
return super().__format__(format_spec)
|
||||
except ValueError:
|
||||
|
|
@ -515,8 +488,7 @@ SIZE_PATTERN = re.compile(
|
|||
|
||||
|
||||
def bytes2human(b: Union[int, float, NaType]) -> str: # pylint: disable=too-many-return-statements
|
||||
"""Converts bytes to a human readable string."""
|
||||
|
||||
"""Convert bytes to a human readable string."""
|
||||
if b == NA:
|
||||
return NA
|
||||
|
||||
|
|
@ -546,14 +518,13 @@ def bytes2human(b: Union[int, float, NaType]) -> str: # pylint: disable=too-man
|
|||
|
||||
|
||||
def human2bytes(s: Union[int, str]) -> int:
|
||||
"""Converts a human readable size string (*case insensitive*) to bytes.
|
||||
"""Convert a human readable size string (*case insensitive*) to bytes.
|
||||
|
||||
Raises:
|
||||
ValueError:
|
||||
If the given size string cannot be converted.
|
||||
|
||||
Examples:
|
||||
|
||||
>>> human2bytes('500B')
|
||||
500
|
||||
>>> human2bytes('10k')
|
||||
|
|
@ -567,7 +538,6 @@ def human2bytes(s: Union[int, str]) -> int:
|
|||
>>> human2bytes('1.5GiB')
|
||||
1610612736
|
||||
"""
|
||||
|
||||
if isinstance(s, int):
|
||||
if s >= 0:
|
||||
return s
|
||||
|
|
@ -582,8 +552,7 @@ def human2bytes(s: Union[int, str]) -> int:
|
|||
|
||||
|
||||
def timedelta2human(dt: Union[int, float, datetime.timedelta, NaType]) -> str:
|
||||
"""Converts a number in seconds or a :class:`datetime.timedelta` instance to a human readable string."""
|
||||
|
||||
"""Convert a number in seconds or a :class:`datetime.timedelta` instance to a human readable string."""
|
||||
if isinstance(dt, (int, float)):
|
||||
dt = datetime.timedelta(seconds=dt)
|
||||
|
||||
|
|
@ -600,8 +569,7 @@ def timedelta2human(dt: Union[int, float, datetime.timedelta, NaType]) -> str:
|
|||
|
||||
|
||||
def utilization2string(utilization: Union[int, float, NaType]) -> str:
|
||||
"""Converts a utilization rate to string."""
|
||||
|
||||
"""Convert a utilization rate to string."""
|
||||
if utilization != NA:
|
||||
if isinstance(utilization, int):
|
||||
return f'{utilization}%'
|
||||
|
|
@ -611,8 +579,7 @@ def utilization2string(utilization: Union[int, float, NaType]) -> str:
|
|||
|
||||
|
||||
def boolify(string: str, default: Any = None) -> bool:
|
||||
"""Converts the given value, usually a string, to boolean."""
|
||||
|
||||
"""Convert the given value, usually a string, to boolean."""
|
||||
if string.lower() in ('true', 'yes', 'on', 'enabled', '1'):
|
||||
return True
|
||||
if string.lower() in ('false', 'no', 'off', 'disabled', '0'):
|
||||
|
|
@ -624,6 +591,7 @@ def boolify(string: str, default: Any = None) -> bool:
|
|||
|
||||
class Snapshot:
|
||||
"""A dict-like object that holds the snapshot values.
|
||||
|
||||
The value can be accessed by ``snapshot.name`` or ``snapshot['name']`` syntax.
|
||||
The Snapshot can also be converted to a dictionary by ``dict(snapshot)`` or ``{**snapshot}``.
|
||||
|
||||
|
|
@ -631,12 +599,14 @@ class Snapshot:
|
|||
"""
|
||||
|
||||
def __init__(self, real: Any, **items) -> None:
|
||||
"""Initialize a new :class:`Snapshot` object with the given attributes."""
|
||||
self.real = real
|
||||
self.timestamp = time.time()
|
||||
for key, value in items.items():
|
||||
setattr(self, key, value)
|
||||
|
||||
def __str__(self) -> str:
|
||||
"""Return a string representation of the snapshot."""
|
||||
keys = set(self.__dict__.keys()).difference({'real', 'timestamp'})
|
||||
keys = ['real', *sorted(keys)]
|
||||
keyvals = []
|
||||
|
|
@ -653,13 +623,14 @@ class Snapshot:
|
|||
__repr__ = __str__
|
||||
|
||||
def __hash__(self) -> int:
|
||||
"""Return a hash value of the snapshot."""
|
||||
return hash((self.real, self.timestamp))
|
||||
|
||||
def __getattr__(self, name: str) -> Any:
|
||||
"""Gets a member from the instance.
|
||||
"""Get a member from the instance.
|
||||
|
||||
If the attribute is not defined, fetches from the original object and makes a function call.
|
||||
"""
|
||||
|
||||
try:
|
||||
return super().__getattr__(name)
|
||||
except AttributeError:
|
||||
|
|
@ -671,20 +642,18 @@ class Snapshot:
|
|||
return attribute
|
||||
|
||||
def __getitem__(self, name: str) -> Any:
|
||||
"""Supports ``snapshot['name']`` syntax."""
|
||||
|
||||
"""Support ``snapshot['name']`` syntax."""
|
||||
try:
|
||||
return getattr(self, name)
|
||||
except AttributeError as ex:
|
||||
raise KeyError(name) from ex
|
||||
|
||||
def __setitem__(self, name: str, value: Any) -> None:
|
||||
"""Supports ``snapshot['name'] = value`` syntax."""
|
||||
|
||||
"""Support ``snapshot['name'] = value`` syntax."""
|
||||
setattr(self, name, value)
|
||||
|
||||
def __iter__(self) -> Iterable[str]:
|
||||
"""Supports ``for name in snapshot`` syntax and ``*`` tuple unpack ``[*snapshot]`` syntax."""
|
||||
"""Support ``for name in snapshot`` syntax and ``*`` tuple unpack ``[*snapshot]`` syntax."""
|
||||
|
||||
def gen() -> str:
|
||||
for name in self.__dict__:
|
||||
|
|
@ -694,18 +663,17 @@ class Snapshot:
|
|||
return gen()
|
||||
|
||||
def keys(self) -> Iterable[str]:
|
||||
"""Supports `**`` dictionary unpack ``{**snapshot}`` / ``dict(**snapshot)`` syntax and
|
||||
``dict(snapshot)`` dictionary conversion.
|
||||
"""
|
||||
|
||||
# pylint: disable-next=line-too-long
|
||||
"""Support ``**`` dictionary unpack ``{**snapshot}`` / ``dict(**snapshot)`` syntax and ``dict(snapshot)`` dictionary conversion."""
|
||||
return iter(self)
|
||||
|
||||
|
||||
# Modified from psutil (https://github.com/giampaolo/psutil)
|
||||
def memoize_when_activated(method: Callable[[Any], Any]) -> Callable[[Any], Any]:
|
||||
"""A memoize decorator which is disabled by default. It can be activated and
|
||||
deactivated on request. For efficiency reasons it can be used only against
|
||||
class methods accepting no arguments.
|
||||
"""A memoize decorator which is disabled by default.
|
||||
|
||||
It can be activated and deactivated on request. For efficiency reasons it can be used only
|
||||
against class methods accepting no arguments.
|
||||
"""
|
||||
|
||||
@functools.wraps(method)
|
||||
|
|
@ -729,10 +697,10 @@ def memoize_when_activated(method: Callable[[Any], Any]) -> Callable[[Any], Any]
|
|||
return ret
|
||||
|
||||
def cache_activate(self):
|
||||
"""Activate cache. Expects a Process instance. Cache will be stored as
|
||||
a "_cache" instance attribute.
|
||||
"""
|
||||
"""Activate cache.
|
||||
|
||||
Expects an instance. Cache will be stored as a "_cache" instance attribute.
|
||||
"""
|
||||
if not hasattr(self, '_cache'):
|
||||
setattr(self, '_cache', {})
|
||||
|
||||
|
|
|
|||
|
|
@ -54,8 +54,6 @@ Python API:
|
|||
)
|
||||
""" # pylint: disable=line-too-long
|
||||
|
||||
# pylint: disable=missing-function-docstring
|
||||
|
||||
import argparse
|
||||
import getpass
|
||||
import math
|
||||
|
|
@ -95,15 +93,13 @@ def select_devices(
|
|||
sort: bool = True,
|
||||
**kwargs, # fmt: skip # pylint: disable=unused-argument
|
||||
) -> Union[List[int], List[Tuple[int, int]], List[str]]:
|
||||
"""Selected a subset of devices satisfying the specified criteria. Returns a list of the device
|
||||
identifiers.
|
||||
"""Select a subset of devices satisfying the specified criteria.
|
||||
|
||||
Note:
|
||||
The *min count* constraint may not be satisfied if there are not enough devices available. This
|
||||
constraint is only enforced when there are both MIG and non-MIG devices present.
|
||||
|
||||
Examples:
|
||||
|
||||
Put the following lines to the top of your script:
|
||||
|
||||
.. code-block:: python
|
||||
|
|
@ -144,8 +140,10 @@ def select_devices(
|
|||
A list of accounts whose used GPU memory needs to be considered as free memory.
|
||||
sort (bool):
|
||||
If :data:`True`, sort the selected devices by memory usage and GPU utilization.
|
||||
"""
|
||||
|
||||
Returns:
|
||||
A list of the device identifiers.
|
||||
"""
|
||||
assert format in ('index', 'uuid', 'device')
|
||||
assert tolerance >= 0
|
||||
tolerance = tolerance / 100.0
|
||||
|
|
@ -274,6 +272,8 @@ def select_devices(
|
|||
|
||||
|
||||
def parse_arguments(): # pylint: disable=too-many-branches,too-many-statements
|
||||
"""Parse command-line arguments for ``nvisel``."""
|
||||
|
||||
def non_negint(argstring):
|
||||
num = int(argstring)
|
||||
if num < 0:
|
||||
|
|
@ -490,6 +490,7 @@ def parse_arguments(): # pylint: disable=too-many-branches,too-many-statements
|
|||
|
||||
|
||||
def main():
|
||||
"""Main function for ``nvisel`` CLI."""
|
||||
args = parse_arguments()
|
||||
|
||||
try:
|
||||
|
|
|
|||
2
setup.py
2
setup.py
|
|
@ -9,7 +9,7 @@
|
|||
# pip install 'nvitop[pynvml-xx.yyy.zz]'
|
||||
#
|
||||
|
||||
# pylint: disable=missing-module-docstring
|
||||
"""Setup script for ``nvitop``."""
|
||||
|
||||
import pathlib
|
||||
import re
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue