From cb84fa8197bcfdb20a82bcc25d4e356b0219baf6 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Tue, 3 Jan 2023 10:20:27 +0000 Subject: [PATCH] docs(core): rephrase docstrings Signed-off-by: Xuehai Pan --- docs/source/spelling_wordlist.txt | 1 + nvitop/__main__.py | 2 +- nvitop/callbacks/keras.py | 43 ++-- nvitop/callbacks/pytorch_lightning.py | 42 +-- nvitop/callbacks/tensorboard.py | 5 +- nvitop/callbacks/utils.py | 3 +- nvitop/cli.py | 5 +- nvitop/core/collector.py | 190 +++++++------- nvitop/core/device.py | 353 +++++++++++--------------- nvitop/core/host.py | 19 +- nvitop/core/libcuda.py | 90 +++---- nvitop/core/libcudart.py | 59 ++--- nvitop/core/libnvml.py | 42 ++- nvitop/core/process.py | 145 ++++------- nvitop/core/utils.py | 152 +++++------ nvitop/select.py | 13 +- setup.py | 2 +- 17 files changed, 486 insertions(+), 680 deletions(-) diff --git a/docs/source/spelling_wordlist.txt b/docs/source/spelling_wordlist.txt index cd60164..178d405 100644 --- a/docs/source/spelling_wordlist.txt +++ b/docs/source/spelling_wordlist.txt @@ -132,3 +132,4 @@ lol xx yyy zz +CLI diff --git a/nvitop/__main__.py b/nvitop/__main__.py index 924cc9f..f46c109 100644 --- a/nvitop/__main__.py +++ b/nvitop/__main__.py @@ -1,7 +1,7 @@ # This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3. -# pylint: disable=missing-module-docstring +"""The interactive NVIDIA-GPU process viewer.""" import sys diff --git a/nvitop/callbacks/keras.py b/nvitop/callbacks/keras.py index f4c3fad..c543df2 100644 --- a/nvitop/callbacks/keras.py +++ b/nvitop/callbacks/keras.py @@ -32,28 +32,25 @@ from nvitop.core import libnvml # Ported version of .pytorch_lightning.GpuStatsLogger for Keras class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes - r""" - Automatically log GPU stats during training stage. 
:class:`GpuStatsLogger` is a - callback and in order to use it you need to assign a TensorBoard callback or - a CSVLogger callback to the model. + """Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a callback and + in order to use it you need to assign a TensorBoard callback or a CSVLogger callback to the model. Args: memory_utilization (bool): - Set to :data:`True` to log used, free and the percentage of memory - utilization at the start and end of each step. Default: :data:`True`. + Set to :data:`True` to log used, free and the percentage of memory utilization at the + start and end of each step. Default: :data:`True`. gpu_utilization (bool): - Set to :data:`True` to log the percentage of GPU utilization - at the start and end of each step. Default: :data:`True`. + Set to :data:`True` to log the percentage of GPU utilization at the start and end of + each step. Default: :data:`True`. intra_step_time (bool): Set to :data:`True` to log the time of each step. Default: :data:`False`. inter_step_time (bool): - Set to :data:`True` to log the time between the end of one step - and the start of the next step. Default: :data:`False`. + Set to :data:`True` to log the time between the end of one step and the start of the + next step. Default: :data:`False`. fan_speed (bool): Set to :data:`True` to log percentage of fan speed. Default: :data:`False`. temperature (bool): - Set to :data:`True` to log the gpu temperature in degree Celsius. - Default: :data:`False`. + Set to :data:`True` to log the gpu temperature in degree Celsius. Default: :data:`False`. Raises: ValueError: @@ -77,16 +74,19 @@ class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes GPU stats are mainly based on NVML queries. The description of the queries is as follows: - - **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan is currently - intended to run at. It ranges from 0 to 100 %. 
Note: The reported speed is the intended fan speed. - If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. - Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure. + - **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan is + currently intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the + intended fan speed. If the fan is physically blocked and unable to spin, this output will not + match the actual fan speed. Many parts do not report fan speeds because they rely on cooling + via fans in the surrounding enclosure. - **memory.used** - Total memory allocated by active contexts, in MiBs. - **memory.free** - Total free memory, in MiBs. - - **utilization.gpu** - Percent of time over the past sample period during which one or more kernels was - executing on the GPU. The sample period may be between 1 second and 1/6 second depending on the product. - - **utilization.memory** - Percent of time over the past sample period during which global (device) memory was - being read or written. The sample period may be between 1 second and 1/6 second depending on the product. + - **utilization.gpu** - Percent of time over the past sample period during which one or more + kernels was executing on the GPU. The sample period may be between 1 second and 1/6 second + depending on the product. + - **utilization.memory** - Percent of time over the past sample period during which global + (device) memory was being read or written. The sample period may be between 1 second and 1/6 + second depending on the product. - **temperature** - Core GPU temperature, in degrees C. 
""" @@ -167,8 +167,7 @@ class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes ) def _get_gpu_stats(self) -> Dict[str, float]: - """Get the gpu status from NVML queries""" - + """Get the gpu status from NVML queries.""" return get_gpu_stats( devices=self._devices, memory_utilization=self._memory_utilization, diff --git a/nvitop/callbacks/pytorch_lightning.py b/nvitop/callbacks/pytorch_lightning.py index b42036e..44df450 100644 --- a/nvitop/callbacks/pytorch_lightning.py +++ b/nvitop/callbacks/pytorch_lightning.py @@ -33,27 +33,25 @@ from nvitop.core import libnvml # Modified from pytorch_lightning.callbacks.GPUStatsMonitor class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes - r""" - Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a - callback and in order to use it you need to assign a logger in the ``Trainer``. + """Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a callback and + in order to use it you need to assign a logger in the ``Trainer``. Args: memory_utilization (bool): - Set to :data:`True` to log used, free and the percentage of memory - utilization at the start and end of each step. Default: :data:`True`. + Set to :data:`True` to log used, free and the percentage of memory utilization at the + start and end of each step. Default: :data:`True`. gpu_utilization (bool): - Set to :data:`True` to log the percentage of GPU utilization - at the start and end of each step. Default: :data:`True`. + Set to :data:`True` to log the percentage of GPU utilization at the start and end of + each step. Default: :data:`True`. intra_step_time (bool): Set to :data:`True` to log the time of each step. Default: :data:`False`. inter_step_time (bool): - Set to :data:`True` to log the time between the end of one step - and the start of the next step. Default: :data:`False`. 
+ Set to :data:`True` to log the time between the end of one step and the start of the + next step. Default: :data:`False`. fan_speed (bool): Set to :data:`True` to log percentage of fan speed. Default: :data:`False`. temperature (bool): - Set to :data:`True` to log the gpu temperature in degree Celsius. - Default: :data:`False`. + Set to :data:`True` to log the gpu temperature in degree Celsius. Default: :data:`False`. Raises: MisconfigurationException: @@ -68,16 +66,19 @@ class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes GPU stats are mainly based on NVML queries. The description of the queries is as follows: - - **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan is currently - intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the intended fan speed. - If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. - Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure. + - **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan is + currently intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the + intended fan speed. If the fan is physically blocked and unable to spin, this output will not + match the actual fan speed. Many parts do not report fan speeds because they rely on cooling + via fans in the surrounding enclosure. - **memory.used** - Total memory allocated by active contexts, in MiBs. - **memory.free** - Total free memory, in MiBs. - - **utilization.gpu** - Percent of time over the past sample period during which one or more kernels was - executing on the GPU. The sample period may be between 1 second and 1/6 second depending on the product. - - **utilization.memory** - Percent of time over the past sample period during which global (device) memory was - being read or written. 
The sample period may be between 1 second and 1/6 second depending on the product. + - **utilization.gpu** - Percent of time over the past sample period during which one or more + kernels was executing on the GPU. The sample period may be between 1 second and 1/6 second + depending on the product. + - **utilization.memory** - Percent of time over the past sample period during which global + (device) memory was being read or written. The sample period may be between 1 second and 1/6 + second depending on the product. - **temperature** - Core GPU temperature, in degrees C. """ @@ -161,8 +162,7 @@ class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes trainer.logger.log_metrics(logs, step=trainer.global_step) def _get_gpu_stats(self) -> Dict[str, float]: - """Get the gpu status from NVML queries""" - + """Get the gpu status from NVML queries.""" return get_gpu_stats( devices=self._devices, memory_utilization=self._memory_utilization, diff --git a/nvitop/callbacks/tensorboard.py b/nvitop/callbacks/tensorboard.py index a109049..9e90d28 100644 --- a/nvitop/callbacks/tensorboard.py +++ b/nvitop/callbacks/tensorboard.py @@ -19,6 +19,9 @@ def add_scalar_dict(writer, main_tag, tag_scalar_dict, global_step=None, walltime=None): - """Batched version of `writer.add_scalar`""" + """Add a batch of scalars to the writer. + + Batched version of ``writer.add_scalar``. 
+ """ for tag, scalar in tag_scalar_dict.items(): writer.add_scalar(f'{main_tag}/{tag}', scalar, global_step=global_step, walltime=walltime) diff --git a/nvitop/callbacks/utils.py b/nvitop/callbacks/utils.py index 4814e4b..dea74ce 100644 --- a/nvitop/callbacks/utils.py +++ b/nvitop/callbacks/utils.py @@ -43,8 +43,7 @@ def get_gpu_stats( fan_speed: bool = False, temperature: bool = False, ) -> Dict[str, float]: - """Get the GPU status from NVML queries""" - + """Get the GPU status from NVML queries.""" stats = {} for device in devices: prefix = f'gpu_id: {device.cuda_index}' diff --git a/nvitop/cli.py b/nvitop/cli.py index e2e756a..c01f8da 100644 --- a/nvitop/cli.py +++ b/nvitop/cli.py @@ -1,7 +1,7 @@ # This file is part of nvitop, the interactive NVIDIA-GPU process viewer. # License: GNU GPL version 3. -# pylint: disable=missing-module-docstring,missing-function-docstring +"""The interactive NVIDIA-GPU process viewer.""" import argparse import curses @@ -20,7 +20,7 @@ NVITOP_MONITOR_MODE = set( def parse_arguments(): # pylint: disable=too-many-branches,too-many-statements - + """Parse command-line arguments for ``nvitop``.""" coloring_rules = '{} < th1 %% <= {} < th2 %% <= {}'.format( colored('light', 'green'), colored('moderate', 'yellow'), colored('heavy', 'red') ) @@ -252,6 +252,7 @@ def parse_arguments(): # pylint: disable=too-many-branches,too-many-statements def main(): # pylint: disable=too-many-branches,too-many-statements,too-many-locals + """Main function for ``nvitop`` CLI.""" args = parse_arguments() if args.force_color: diff --git a/nvitop/core/collector.py b/nvitop/core/collector.py index 41db881..b928993 100644 --- a/nvitop/core/collector.py +++ b/nvitop/core/collector.py @@ -15,7 +15,7 @@ # limitations under the License.
# ============================================================================== -# pylint: disable=missing-module-docstring +"""Resource metrics collectors.""" import contextlib import itertools @@ -44,24 +44,27 @@ class SnapshotResult(NamedTuple): # pylint: disable=missing-class-docstring timer = time.monotonic +def _unique(iterable: Iterable[Hashable]) -> List[Hashable]: + return list(OrderedDict.fromkeys(iterable).keys()) + + # pylint: disable-next=too-many-branches def take_snapshots( devices: Optional[Union[Device, Iterable[Device]]] = None, *, gpu_processes: Optional[Union[bool, GpuProcess, Iterable[GpuProcess]]] = None, ) -> SnapshotResult: - """Retrieves status of demanded devices and GPU processes. + """Retrieve status of demanded devices and GPU processes. Args: devices (Optional[Union[Device, Iterable[Device]]]): - Requested devices for snapshots. If not given, the devices will be - determined from GPU processes: - - All devices (no GPU processes are given) - - Devices that used by given GPU processes + Requested devices for snapshots. If not given, the devices will be determined from GPU + processes: **(1)** All devices (no GPU processes are given); **(2)** Devices that used + by given GPU processes. gpu_processes (Optional[Union[bool, GpuProcess, Iterable[GpuProcess]]]): - Requested GPU processes snapshots. If not given, all GPU processes - running on the requested device will be returned. The GPU process - snapshots can be suppressed by specifying ``gpu_processes=False``. + Requested GPU processes snapshots. If not given, all GPU processes running on the + requested device will be returned. The GPU process snapshots can be suppressed by + specifying ``gpu_processes=False``. Returns: SnapshotResult A named tuple containing two lists of snapshots. @@ -71,7 +74,6 @@ def take_snapshots( be returned. 
Examples: - >>> from nvitop import take_snapshots, Device >>> import os >>> os.environ['CUDA_VISIBLE_DEVICES'] = '1,0' @@ -136,10 +138,6 @@ def take_snapshots( ] ) """ # pylint: disable=line-too-long - - def unique(iterable: Iterable[Hashable]) -> List[Hashable]: - return list(OrderedDict.fromkeys(iterable).keys()) - if isinstance(devices, Device): devices = [devices] if isinstance(gpu_processes, GpuProcess): @@ -148,7 +146,7 @@ def take_snapshots( if gpu_processes is not None: if gpu_processes: # is not False or is a non-empty list/tuple gpu_processes = list(gpu_processes) - process_devices = unique(process.device for process in gpu_processes) + process_devices = _unique(process.device for process in gpu_processes) for device in process_devices: device.processes() # update GPU status for requested GPU processes if devices is None: @@ -193,57 +191,55 @@ def collect_in_background( tag: str = 'metrics-daemon', start: bool = True, ) -> threading.Thread: - """Starts a background daemon thread that collect and call the callback function periodically. + """Start a background daemon thread that collect and call the callback function periodically. See also :func:`ResourceMetricCollector.daemonize`. Args: - on_collect: (Callable[[Dict[str, float]], bool]) + on_collect (Callable[[Dict[str, float]], bool]): A callback function that will be called periodically. It takes a dictionary containing the resource metrics and returns a boolean indicating whether to continue monitoring. - collector: (Optional[ResourceMetricCollector]) + collector (Optional[ResourceMetricCollector]): A :class:`ResourceMetricCollector` instance to collect metrics. If not given, it will collect metrics for all GPUs and subprocess of the current process. - interval: (Optional[float]) + interval (Optional[float]): The collect interval. If not given, use ``collector.interval``. 
- on_start: (Optional[Callable[['ResourceMetricCollector'], None]]) + on_start (Optional[Callable[[ResourceMetricCollector], None]]): A function to initialize the daemon thread and collector. - on_stop: (Optional[Callable[['ResourceMetricCollector'], None]]) + on_stop (Optional[Callable[[ResourceMetricCollector], None]]): A function that do some necessary cleanup after the daemon thread is stopped. - tag: (str) + tag (str): The tag prefix used for metrics results. - start: (bool) + start (bool): Whether to start the daemon thread on return. Returns: threading.Thread A daemon thread object. Examples: + .. code-block:: python - .. code-block:: python + logger = ... - logger = ... + def on_collect(metrics): # will be called periodically + if logger.is_closed(): # closed manually by user + return False + logger.log(metrics) + return True - def on_collect(metrics): # will be called periodically - if logger.is_closed(): # closed manually by user - return False - logger.log(metrics) - return True + def on_stop(collector): # will be called only once at stop + if not logger.is_closed(): + logger.close() # cleanup - def on_stop(collector): # will be called only once at stop - if not logger.is_closed(): - logger.close() # cleanup - - # Record metrics to the logger in background every 5 seconds. - # It will collect 5-second mean/min/max for each metric. - collect_in_background( - on_collect, - ResourceMetricCollector(Device.cuda.all()), - interval=5.0, - on_stop=on_stop, - ) + # Record metrics to the logger in background every 5 seconds. + # It will collect 5-second mean/min/max for each metric. 
+ collect_in_background( + on_collect, + ResourceMetricCollector(Device.cuda.all()), + interval=5.0, + on_stop=on_stop, + ) """ - if collector is None: collector = ResourceMetricCollector() if isinstance(interval, (int, float)) and interval > 0: @@ -282,13 +278,13 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes Args: devices (Iterable[Device]): - Set of Device instances for logging. If not given, all physical - devices on board will be used. + Set of Device instances for logging. If not given, all physical devices on board will be + used. root_pids (Set[int]): - A set of PIDs, only the status of the descendant processes on the - GPUs will be collected. If not given, the PID of the current process - will be used. - interval (float): The snapshot interval for background daemon thread. + A set of PIDs, only the status of the descendant processes on the GPUs will be collected. + If not given, the PID of the current process will be used. + interval (float): + The snapshot interval for background daemon thread. Core methods: @@ -305,7 +301,6 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes collector.daemonize(on_collect_fn) Examples: - >>> import os >>> os.environ['CUDA_VISIBLE_DEVICES'] = '3,2,1,0' @@ -398,6 +393,7 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes root_pids: Optional[Iterable[int]] = None, interval: Union[int, float] = 1.0, ) -> None: + """Initialize the resource metric collector.""" if isinstance(interval, (int, float)) and interval > 0: interval = float(interval) else: @@ -440,15 +436,14 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes self._daemon_running = threading.Event() def activate(self, tag: str) -> 'ResourceMetricCollector': - """Starts a new metric collection with the given tag. + """Start a new metric collection with the given tag. Args: tag (str): - The name of the new metric collection. 
The tag will be used to - identify the metric collection. It must be a unique string. + The name of the new metric collection. The tag will be used to identify the metric + collection. It must be a unique string. Examples: - >>> collector = ResourceMetricCollector() >>> collector.activate(tag='train') # key prefix -> 'train' @@ -457,7 +452,6 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes >>> collector.deactivate() # the collector has been stopped >>> collector.activate(tag='test') # key prefix -> 'test' """ - with self._lock: if self._metric_buffer is None or tag not in self._tags: self._tags.add(tag) @@ -477,11 +471,15 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes start = activate def deactivate(self, tag: Optional[str] = None) -> 'ResourceMetricCollector': - """Stops the current collection with the given tag and remove all sub-tags. - If the tag is not specified, deactivate the current active collection. - For nested collections, the sub-collections will be deactivated as well. - """ + """Stop the current collection with the given tag and remove all sub-tags. + If the tag is not specified, deactivate the current active collection. For nested + collections, the sub-collections will be deactivated as well. + + Args: + tag (Optional[str]): + The tag to deactivate. If :data:`None`, the current active collection will be used. + """ with self._lock: if self._metric_buffer is None: if tag is not None: @@ -516,18 +514,16 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes Args: tag (str): - The name of the new metric collection. The tag will be used to - identify the metric collection. It must be a unique string. + The name of the new metric collection. The tag will be used to identify the metric + collection. It must be a unique string. Examples: - >>> collector = ResourceMetricCollector() >>> with collector.context(tag='train'): # key prefix -> 'train' ... 
# Do something ... collector.collect() # -> Dict[str, float] """ - try: self.activate(tag=tag) yield self @@ -537,17 +533,16 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes __call__ = context # alias for `with collector(tag='')` def clear(self, tag: Optional[str] = None) -> None: - """Resets the metric collection with the given tag. If the tag is not - specified, reset the current active collection. For nested collections, + """Reset the metric collection with the given tag. + + If the tag is not specified, reset the current active collection. For nested collections, the sub-collections will be reset as well. Args: tag (Optional[str]): - The tag to reset. If None, the current active collection - will be reset. + The tag to reset. If :data:`None`, the current active collection will be reset. Examples: - >>> collector = ResourceMetricCollector() >>> with collector(tag='train'): # key prefix -> 'train' @@ -564,7 +559,6 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes ... with collector(tag='batch'): # key prefix -> 'train/batch' ... collector.reset(tag='train') # reset both 'train' and 'train/batch' """ - with self._lock: if self._metric_buffer is None: if tag is not None: @@ -586,8 +580,7 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes buffer = buffer.prev def collect(self) -> Dict[str, float]: - """Gets the average resource consumption during collection.""" - + """Get the average resource consumption during collection.""" with self._lock: if self._metric_buffer is None: raise RuntimeError('Resource metric collector has not been not started yet.') @@ -607,52 +600,51 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes tag: str = 'metrics-daemon', start: bool = True, ) -> threading.Thread: - """Starts a background daemon thread that collect and call the callback function periodically. 
+ """Start a background daemon thread that collect and call the callback function periodically. See also :func:`collect_in_background`. Args: - on_collect: (Callable[[Dict[str, float]], bool]) + on_collect (Callable[[Dict[str, float]], bool]): A callback function that will be called periodically. It takes a dictionary containing the resource metrics and returns a boolean indicating whether to continue monitoring. - interval: (Optional[float]) + interval (Optional[float]): The collect interval. If not given, use ``collector.interval``. - on_start: (Optional[Callable[['ResourceMetricCollector'], None]]) + on_start (Optional[Callable[[ResourceMetricCollector], None]]): A function to initialize the daemon thread and collector. - on_stop: (Optional[Callable[['ResourceMetricCollector'], None]]) + on_stop (Optional[Callable[[ResourceMetricCollector], None]]): A function that do some necessary cleanup after the daemon thread is stopped. - tag: (str) + tag (str): The tag prefix used for metrics results. - start: (bool) + start (bool): Whether to start the daemon thread on return. Returns: threading.Thread A daemon thread object. Examples: + .. code-block:: python - .. code-block:: python + logger = ... - logger = ... + def on_collect(metrics): # will be called periodically + if logger.is_closed(): # closed manually by user + return False + logger.log(metrics) + return True - def on_collect(metrics): # will be called periodically - if logger.is_closed(): # closed manually by user - return False - logger.log(metrics) - return True + def on_stop(collector): # will be called only once at stop + if not logger.is_closed(): + logger.close() # cleanup - def on_stop(collector): # will be called only once at stop - if not logger.is_closed(): - logger.close() # cleanup - - # Record metrics to the logger in background every 5 seconds. - # It will collect 5-second mean/min/max for each metric. 
- ResourceMetricCollector(Device.cuda.all()).daemonize( - on_collect, - ResourceMetricCollector(Device.cuda.all()), - interval=5.0, - on_stop=on_stop, - ) + # Record metrics to the logger in background every 5 seconds. + # It will collect 5-second mean/min/max for each metric. + ResourceMetricCollector(Device.cuda.all()).daemonize( + on_collect, + ResourceMetricCollector(Device.cuda.all()), + interval=5.0, + on_stop=on_stop, + ) """ return collect_in_background( on_collect, @@ -665,10 +657,12 @@ class ResourceMetricCollector: # pylint: disable=too-many-instance-attributes ) def __del__(self) -> None: + """Clean up the daemon thread on destruction.""" self._daemon_running.clear() - # pylint: disable-next=missing-function-docstring,too-many-branches,too-many-locals,too-many-statements + # pylint: disable-next=too-many-branches,too-many-locals,too-many-statements def take_snapshots(self) -> SnapshotResult: + """Take snapshots of the current resource metrics and update the metric buffer.""" if len(self.root_pids) > 0: all_gpu_processes = [] for device in self.leaf_devices: diff --git a/nvitop/core/device.py b/nvitop/core/device.py index 39fb0ac..ef1a068 100644 --- a/nvitop/core/device.py +++ b/nvitop/core/device.py @@ -38,7 +38,6 @@ The type of returned instance created by ``Class(args)`` is depending on the giv - (nvml_index: (int, int)) -> CudaMigDevice Examples: - >>> from nvitop import Device, CudaDevice >>> Device.driver_version() # version of the installed NVIDIA display driver '470.129.06' @@ -171,7 +170,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me - (bus_id: str) -> PhysicalDevice Examples: - >>> Device.driver_version() # version of the installed NVIDIA display driver '470.129.06' @@ -247,8 +245,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me @classmethod def is_available(cls) -> bool: - """Returns whether there are any devices and the NVML library is successfully loaded.""" - + """Test
whether there are any devices and the NVML library is successfully loaded.""" try: return cls.count() > 0 except libnvml.NVMLError: @@ -273,7 +270,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. """ - return libnvml.nvmlQuery('nvmlSystemGetDriverVersion') @staticmethod @@ -294,7 +290,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. """ - cuda_driver_version = libnvml.nvmlQuery('nvmlSystemGetCudaDriverVersion') if libnvml.nvmlCheckReturn(cuda_driver_version, int): major = cuda_driver_version // 1000 @@ -317,7 +312,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me The CUDA Runtime version, or :const:`nvitop.NA` when no CUDA Runtime is available or no CUDA-capable devices are present. """ - try: return libcudart.cudaRuntimeGetVersion() except libcudart.cudaError: @@ -344,20 +338,18 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. """ - return libnvml.nvmlQuery('nvmlDeviceGetCount', default=0) @classmethod def all(cls) -> List['PhysicalDevice']: - """Returns a list of all physical devices in the system.""" - + """Return a list of all physical devices in the system.""" return cls.from_indices() @classmethod def from_indices( cls, indices: Optional[Union[int, Iterable[Union[int, Tuple[int, int]]]]] = None ) -> List[Union['PhysicalDevice', 'MigDevice']]: - """Returns a list of devices of the given indices. + """Return a list of devices of the given indices. 
Args: indices (Iterable[Union[int, Tuple[int, int]]]): @@ -382,7 +374,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me libnvml.NVMLError_InvalidArgument: If the device index is out of range. """ - if indices is None: try: indices = range(cls.count()) @@ -396,7 +387,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me @staticmethod def from_cuda_visible_devices() -> List['CudaDevice']: - """Returns a list of all CUDA visible devices. + """Return a list of all CUDA visible devices. + The CUDA ordinal will be enumerate from the ``CUDA_VISIBLE_DEVICES`` environment variable. Note: @@ -409,7 +401,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: List[CudaDevice] A list of :class:`CudaDevice` instances. """ # pylint: disable=line-too-long - visible_device_indices = Device.parse_cuda_visible_devices() cuda_devices = [] @@ -418,13 +409,12 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me return cuda_devices - cuda_all = from_cuda_visible_devices - @staticmethod def from_cuda_indices( cuda_indices: Optional[Union[int, Iterable[int]]] = None ) -> List['CudaDevice']: - """Returns a list of CUDA devices of the given CUDA indices. + """Return a list of CUDA devices of the given CUDA indices. + The CUDA ordinal will be enumerate from the ``CUDA_VISIBLE_DEVICES`` environment variable. See also for CUDA Device Enumeration: @@ -449,7 +439,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me RuntimeError: If the index is out of range for the given ``CUDA_VISIBLE_DEVICES`` environment variable. 
""" # pylint: disable=line-too-long - cuda_devices = Device.from_cuda_visible_devices() if cuda_indices is None: return cuda_devices @@ -473,8 +462,9 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me def parse_cuda_visible_devices( cuda_visible_devices: Optional[str] = _VALUE_OMITTED, ) -> Union[List[int], List[Tuple[int, int]]]: - """Parses the given ``CUDA_VISIBLE_DEVICES`` value into a list of NVML device indices. - Alias of :func:`parse_cuda_visible_devices`. + """Parse the given ``CUDA_VISIBLE_DEVICES`` value into a list of NVML device indices. + + This is a alias of :func:`parse_cuda_visible_devices`. Note: The result could be empty if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid. @@ -493,13 +483,13 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me A list of int (physical device) or a list of tuple of two ints (MIG device) for the corresponding real device indices. """ # pylint: disable=line-too-long - return parse_cuda_visible_devices(cuda_visible_devices) @staticmethod def normalize_cuda_visible_devices(cuda_visible_devices: Optional[str] = _VALUE_OMITTED) -> str: - """Parses the given ``CUDA_VISIBLE_DEVICES`` value and convert it into a comma-separated string of UUIDs. - Alias of :func:`normalize_cuda_visible_devices`. + """Parse the given ``CUDA_VISIBLE_DEVICES`` value and convert it into a comma-separated string of UUIDs. + + This is an alias of :func:`normalize_cuda_visible_devices`. Note: The result could be empty string if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid. @@ -526,7 +516,9 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me uuid: Optional[str] = None, bus_id: Optional[str] = None, ) -> 'Device': - """Creates a new instance of Device. The type of the result is determined by the given argument. + """Create a new instance of Device. + + The type of the result is determined by the given argument. .. 
code-block:: python @@ -546,7 +538,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me TypeError: If the given index is a tuple but is not consist of two integers. """ - if (index, uuid, bus_id).count(None) != 2: raise TypeError( 'Device(index=None, uuid=None, bus_id=None) takes 1 non-None arguments ' @@ -589,7 +580,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me uuid: Optional[str] = None, bus_id: Optional[str] = None, ) -> None: - """Initializes the instance created by :meth:`__new__()`. + """Initialize the instance created by :meth:`__new__()`. Raises: libnvml.NVMLError_LibraryNotFound: @@ -604,7 +595,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me libnvml.NVMLError_InvalidArgument: If the device index is out of range. """ - if isinstance(index, str) and self.UUID_PATTERN.match(index) is not None: # passed by UUID index, uuid = None, index @@ -662,6 +652,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me self._hash = None def __str__(self) -> str: + """Return a string representation of the device.""" return '{}(index={}, name="{}", total_memory={})'.format( self.__class__.__name__, self.index, self.name(), self.memory_total_human() ) @@ -669,14 +660,13 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me __repr__ = __str__ def __eq__(self, other: object) -> bool: + """Test equality to other object.""" if not isinstance(other, Device): return NotImplemented return self._ident == other._ident - def __ne__(self, other: object) -> bool: - return not self == other - def __hash__(self) -> int: + """Return a hash value of the device.""" if self._hash is None: self._hash = hash(self._ident) return self._hash @@ -692,7 +682,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me If the attribute is not defined in ``pynvml.py``. 
Examples: - >>> device = Device(0) >>> # Method `cuda_compute_capability` is not implemented in the class definition @@ -706,7 +695,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me >>> device.cuda_compute_capability() (8, 6) """ # pylint: disable=line-too-long - try: return super().__getattr__(name) except AttributeError: @@ -746,6 +734,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me return attribute def __reduce__(self) -> Tuple[Type['Device'], Tuple[Union[int, Tuple[int, int]]]]: + """Return state information for pickling.""" return self.__class__, (self._nvml_index,) @property @@ -755,7 +744,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: Union[int, Tuple[int, int]] Returns an int for physical device and tuple of two integers for MIG device. """ - return self._nvml_index @property @@ -765,7 +753,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: Union[int, Tuple[int, int]] Returns an int for physical device and tuple of two integers for MIG device. """ - return self._nvml_index @property @@ -776,25 +763,24 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me An int for the physical device index. For MIG devices, returns the index of the parent physical device. """ - return self._nvml_index # will be overridden in MigDevice @property def handle(self) -> libnvml.c_nvmlDevice_t: """The NVML device handle.""" - return self._handle @property def cuda_index(self) -> int: - """The CUDA device index. The value will be evaluated on the first call. + """The CUDA device index. + + The value will be evaluated on the first call. Raises: RuntimeError: If the current device is not visible to CUDA applications (i.e. not listed in the ``CUDA_VISIBLE_DEVICES`` environment variable or the environment variable is invalid). 
""" - if self._cuda_index is None: visible_device_indices = self.parse_cuda_visible_devices() try: @@ -822,14 +808,14 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=name """ - if self._name is NA: self._name = libnvml.nvmlQuery('nvmlDeviceGetName', self.handle) return self._name def uuid(self) -> Union[str, NaType]: - """This value is the globally unique immutable alphanumeric identifier of the GPU. It does - not correspond to any physical label on the board. + """This value is the globally unique immutable alphanumeric identifier of the GPU. + + It does not correspond to any physical label on the board. Returns: Union[str, NaType] The UUID of the device, or :const:`nvitop.NA` when not applicable. @@ -840,7 +826,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=name """ - if self._uuid is NA: self._uuid = libnvml.nvmlQuery('nvmlDeviceGetUUID', self.handle) return self._uuid @@ -857,7 +842,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=pci.bus_id """ - if self._bus_id is NA: self._bus_id = libnvml.nvmlQuery( lambda handle: libnvml.nvmlDeviceGetPciInfo(handle).busId, self.handle @@ -865,8 +849,9 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me return self._bus_id def serial(self) -> Union[str, NaType]: - """This number matches the serial number physically printed on each board. It is a globally - unique immutable alphanumeric value. + """This number matches the serial number physically printed on each board. + + It is a globally unique immutable alphanumeric value. Returns: Union[str, NaType] The serial number of the device, or :const:`nvitop.NA` when not applicable. 
@@ -877,18 +862,16 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=serial """ - return libnvml.nvmlQuery('nvmlDeviceGetSerial', self.handle) @memoize_when_activated @ttl_cache(ttl=1.0) def memory_info(self) -> MemoryInfo: # in bytes - """Returns a named tuple with memory information (in bytes) for the device. + """Return a named tuple with memory information (in bytes) for the device. Returns: MemoryInfo(total, free, used) A named tuple with memory information, the item could be :const:`nvitop.NA` when not applicable. """ - memory_info = libnvml.nvmlQuery('nvmlDeviceGetMemoryInfo', self.handle) if libnvml.nvmlCheckReturn(memory_info): return MemoryInfo(total=memory_info.total, free=memory_info.free, used=memory_info.used) @@ -906,7 +889,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=memory.total """ - if self._memory_total is NA: self._memory_total = self.memory_info().total return self._memory_total @@ -923,7 +905,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=memory.used """ - return self.memory_info().used def memory_free(self) -> Union[int, NaType]: # in bytes @@ -938,7 +919,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=memory.free """ - return self.memory_info().free def memory_total_human(self) -> Union[str, NaType]: # in human readable @@ -947,7 +927,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: Union[str, NaType] Total installed GPU memory in human readable format, or :const:`nvitop.NA` when not applicable. 
""" - if self._memory_total_human is NA: self._memory_total_human = bytes2human(self.memory_total()) return self._memory_total_human @@ -958,7 +937,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: Union[int, NaType] Total memory allocated by active contexts in human readable format, or :const:`nvitop.NA` when not applicable. """ # pylint: disable=line-too-long - return bytes2human(self.memory_used()) def memory_free_human(self) -> Union[str, NaType]: # in human readable @@ -967,16 +945,14 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: Union[int, NaType] Total free memory in human readable format, or :const:`nvitop.NA` when not applicable. """ - return bytes2human(self.memory_free()) def memory_percent(self) -> Union[float, NaType]: # in percentage - """The percentage of used memory over total memory (0 <= p <= 100). + """The percentage of used memory over total memory (``0 <= p <= 100``). Returns: Union[float, NaType] The percentage of used memory over total memory, or :const:`nvitop.NA` when not applicable. """ - memory_info = self.memory_info() if libnvml.nvmlCheckReturn(memory_info.used, int) and libnvml.nvmlCheckReturn( memory_info.total, int @@ -990,18 +966,16 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: str The used memory over total memory in human readable format, or :const:`'N/A / N/A'` when not applicable. """ # pylint: disable=line-too-long - return f'{self.memory_used_human()} / {self.memory_total_human()}' @memoize_when_activated @ttl_cache(ttl=1.0) def bar1_memory_info(self) -> MemoryInfo: # in bytes - """Returns a named tuple with BAR1 memory information (in bytes) for the device. + """Return a named tuple with BAR1 memory information (in bytes) for the device. Returns: MemoryInfo(total, free, used) A named tuple with BAR1 memory information, the item could be :const:`nvitop.NA` when not applicable. 
""" # pylint: disable=line-too-long - memory_info = libnvml.nvmlQuery('nvmlDeviceGetBAR1MemoryInfo', self.handle) if libnvml.nvmlCheckReturn(memory_info): return MemoryInfo( @@ -1015,7 +989,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: Union[int, NaType] Total BAR1 memory in bytes, or :const:`nvitop.NA` when not applicable. """ - return self.bar1_memory_info().total def bar1_memory_used(self) -> Union[int, NaType]: # in bytes @@ -1024,7 +997,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: Union[int, NaType] Total used BAR1 memory in bytes, or :const:`nvitop.NA` when not applicable. """ - return self.bar1_memory_info().used def bar1_memory_free(self) -> Union[int, NaType]: # in bytes @@ -1033,7 +1005,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: Union[int, NaType] Total free BAR1 memory in bytes, or :const:`nvitop.NA` when not applicable. """ - return self.bar1_memory_info().free def bar1_memory_total_human(self) -> Union[str, NaType]: # in human readable @@ -1042,7 +1013,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: Union[int, NaType] Total BAR1 memory in human readable format, or :const:`nvitop.NA` when not applicable. """ - return bytes2human(self.bar1_memory_total()) def bar1_memory_used_human(self) -> Union[str, NaType]: # in human readable @@ -1051,7 +1021,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: Union[int, NaType] Total used BAR1 memory in human readable format, or :const:`nvitop.NA` when not applicable. 
""" - return bytes2human(self.bar1_memory_used()) def bar1_memory_free_human(self) -> Union[str, NaType]: # in human readable @@ -1060,7 +1029,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: Union[int, NaType] Total free BAR1 memory in human readable format, or :const:`nvitop.NA` when not applicable. """ - return bytes2human(self.bar1_memory_free()) def bar1_memory_percent(self) -> Union[float, NaType]: # in percentage @@ -1069,7 +1037,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: Union[float, NaType] The percentage of used BAR1 memory over total BAR1 memory, or :const:`nvitop.NA` when not applicable. """ # pylint: disable=line-too-long - memory_info = self.bar1_memory_info() if libnvml.nvmlCheckReturn(memory_info.used, int) and libnvml.nvmlCheckReturn( memory_info.total, int @@ -1083,18 +1050,16 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: str The used BAR1 memory over total BAR1 memory in human readable format, or :const:`'N/A / N/A'` when not applicable. """ # pylint: disable=line-too-long - return f'{self.bar1_memory_used_human()} / {self.bar1_memory_total_human()}' @memoize_when_activated @ttl_cache(ttl=1.0) def utilization_rates(self) -> UtilizationRates: # in percentage - """Returns a named tuple with GPU utilization rates (in percentage) for the device. + """Return a named tuple with GPU utilization rates (in percentage) for the device. Returns: UtilizationRates(gpu, memory, encoder, decoder) A named tuple with GPU utilization rates (in percentage) for the device, the item could be :const:`nvitop.NA` when not applicable. 
""" # pylint: disable=line-too-long - gpu, memory, encoder, decoder = NA, NA, NA, NA utilization_rates = libnvml.nvmlQuery('nvmlDeviceGetUtilizationRates', self.handle) @@ -1113,6 +1078,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me def gpu_utilization(self) -> Union[int, NaType]: # in percentage """Percent of time over the past sample period during which one or more kernels was executing on the GPU. + The sample period may be between 1 second and 1/6 second depending on the product. Returns: Union[int, NaType] @@ -1124,13 +1090,13 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=utilization.gpu """ - return self.utilization_rates().gpu gpu_percent = gpu_utilization # in percentage def memory_utilization(self) -> Union[float, NaType]: # in percentage """Percent of time over the past sample period during which global (device) memory was being read or written. + The sample period may be between 1 second and 1/6 second depending on the product. Returns: Union[int, NaType] @@ -1142,7 +1108,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=utilization.memory """ # pylint: disable=line-too-long - return self.utilization_rates().memory def encoder_utilization(self) -> Union[float, NaType]: # in percentage @@ -1151,7 +1116,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: Union[int, NaType] The encoder utilization rate in percentage, or :const:`nvitop.NA` when not applicable. """ - return self.utilization_rates().encoder def decoder_utilization(self) -> Union[float, NaType]: # in percentage\ @@ -1160,18 +1124,16 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: Union[int, NaType] The decoder utilization rate in percentage, or :const:`nvitop.NA` when not applicable. 
""" - return self.utilization_rates().decoder @memoize_when_activated @ttl_cache(ttl=5.0) def clock_infos(self) -> ClockInfos: # in MHz - """Returns a named tuple with current clock speeds (in MHz) for the device. + """Return a named tuple with current clock speeds (in MHz) for the device. Returns: ClockInfos(graphics, sm, memory, video) A named tuple with current clock speeds (in MHz) for the device, the item could be :const:`nvitop.NA` when not applicable. """ # pylint: disable=line-too-long - return ClockInfos( graphics=libnvml.nvmlQuery( 'nvmlDeviceGetClockInfo', self.handle, libnvml.NVML_CLOCK_GRAPHICS @@ -1188,12 +1150,11 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me @memoize_when_activated @ttl_cache(ttl=5.0) def max_clock_infos(self) -> ClockInfos: # in MHz - """Returns a named tuple with maximum clock speeds (in MHz) for the device. + """Return a named tuple with maximum clock speeds (in MHz) for the device. Returns: ClockInfos(graphics, sm, memory, video) A named tuple with maximum clock speeds (in MHz) for the device, the item could be :const:`nvitop.NA` when not applicable. """ # pylint: disable=line-too-long - clock_infos = self._max_clock_infos._asdict() for name, clock in clock_infos.items(): if clock is NA: @@ -1208,12 +1169,11 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me max_clocks = max_clock_infos def clock_speed_infos(self) -> ClockSpeedInfos: # in MHz - """Returns a named tuple with the current and the maximum clock speeds (in MHz) for the device. + """Return a named tuple with the current and the maximum clock speeds (in MHz) for the device. Returns: ClockSpeedInfos(current, max) A named tuple with the current and the maximum clock speeds (in MHz) for the device. 
""" - return ClockSpeedInfos(current=self.clock_infos(), max=self.max_clock_infos()) def graphics_clock(self) -> Union[int, NaType]: # in MHz @@ -1228,7 +1188,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=clocks.current.graphics """ # pylint: disable=line-too-long - return self.clock_infos().graphics def sm_clock(self) -> Union[int, NaType]: # in MHz @@ -1243,7 +1202,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=clocks.current.sm """ # pylint: disable=line-too-long - return self.clock_infos().sm def memory_clock(self) -> Union[int, NaType]: # in MHz @@ -1258,7 +1216,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=clocks.current.memory """ - return self.clock_infos().memory def video_clock(self) -> Union[int, NaType]: # in MHz @@ -1273,7 +1230,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=clocks.current.video """ # pylint: disable=line-too-long - return self.clock_infos().video def max_graphics_clock(self) -> Union[int, NaType]: # in MHz @@ -1288,7 +1244,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=clocks.max.graphics """ # pylint: disable=line-too-long - return self.max_clock_infos().graphics def max_sm_clock(self) -> Union[int, NaType]: # in MHz @@ -1303,7 +1258,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=clocks.max.sm """ # pylint: disable=line-too-long - return self.max_clock_infos().sm def max_memory_clock(self) -> Union[int, NaType]: # in MHz @@ -1318,7 +1272,6 @@ class Device: # 
pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=clocks.max.memory """ - return self.max_clock_infos().memory def max_video_clock(self) -> Union[int, NaType]: # in MHz @@ -1333,16 +1286,16 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=clocks.max.video """ # pylint: disable=line-too-long - return self.max_clock_infos().video @ttl_cache(ttl=5.0) def fan_speed(self) -> Union[int, NaType]: # in percentage - """The fan speed value is the percent of the product's maximum noise tolerance fan speed that - the device's fan is currently intended to run at. This value may exceed 100% in certain cases. - Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable - to spin, this output will not match the actual fan speed. Many parts do not report fan speeds - because they rely on cooling via fans in the surrounding enclosure. + """The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at. + + This value may exceed 100% in certain cases. Note: The reported speed is the intended fan + speed. If the fan is physically blocked and unable to spin, this output will not match the + actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans + in the surrounding enclosure. Returns: Union[int, NaType] The fan speed value in percentage, or :const:`nvitop.NA` when not applicable. @@ -1352,13 +1305,12 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me .. code:: bash nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=fan.speed - """ - + """ # pylint: disable=line-too-long return libnvml.nvmlQuery('nvmlDeviceGetFanSpeed', self.handle) @ttl_cache(ttl=5.0) def temperature(self) -> Union[int, NaType]: # in Celsius - """Core GPU temperature. 
in degrees C. + """Core GPU temperature in degrees C. Returns: Union[int, NaType] The core GPU temperature in Celsius degrees, or :const:`nvitop.NA` when not applicable. @@ -1369,7 +1321,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=temperature.gpu """ - return libnvml.nvmlQuery( 'nvmlDeviceGetTemperature', self.handle, libnvml.NVML_TEMPERATURE_GPU ) @@ -1388,7 +1339,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me $(( "$(nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=power.draw)" * 1000 )) """ - return libnvml.nvmlQuery('nvmlDeviceGetPowerUsage', self.handle) power_draw = power_usage # in milliwatts (mW) @@ -1396,7 +1346,9 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me @memoize_when_activated @ttl_cache(ttl=60.0) def power_limit(self) -> Union[int, NaType]: # in milliwatts (mW) - """The software power limit in milliwatts. Set by software like nvidia-smi. + """The software power limit in milliwatts. + + Set by software like nvidia-smi. Returns: Union[int, NaType] The software power limit in milliwatts, or :const:`nvitop.NA` when not applicable. @@ -1407,7 +1359,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me $(( "$(nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=power.limit)" * 1000 )) """ - return libnvml.nvmlQuery('nvmlDeviceGetPowerManagementLimit', self.handle) def power_status(self) -> str: # string of power usage over power limit in watts (W) @@ -1416,7 +1367,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me Returns: str The string of power usage over power limit in watts, or :const:`'N/A / N/A'` when not applicable. 
""" # pylint: disable=line-too-long - power_usage = self.power_usage() power_limit = self.power_limit() if libnvml.nvmlCheckReturn(power_usage, int): @@ -1427,9 +1377,10 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me @ttl_cache(ttl=60.0) def display_active(self) -> Union[str, NaType]: - """A flag that indicates whether a display is initialized on the GPU's (e.g. memory is - allocated on the device for display). Display can be active even when no monitor is - physically attached. "Enabled" indicates an active display. "Disabled" indicates otherwise. + """A flag that indicates whether a display is initialized on the GPU's (e.g. memory is allocated on the device for display). + + Display can be active even when no monitor is physically attached. "Enabled" indicates an + active display. "Disabled" indicates otherwise. Returns: Union[str, NaType] - :const:`'Disabled'`: if not an active display device. @@ -1441,17 +1392,16 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me .. code:: bash nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=display_active - """ - + """ # pylint: disable=line-too-long return {0: 'Disabled', 1: 'Enabled'}.get( libnvml.nvmlQuery('nvmlDeviceGetDisplayActive', self.handle), NA ) @ttl_cache(ttl=60.0) def display_mode(self) -> Union[str, NaType]: - """A flag that indicates whether a physical display (e.g. monitor) is currently connected to - any of the GPU's connectors. "Enabled" indicates an attached display. "Disabled" indicates - otherwise. + """A flag that indicates whether a physical display (e.g. monitor) is currently connected to any of the GPU's connectors. + + "Enabled" indicates an attached display. "Disabled" indicates otherwise. Returns: Union[str, NaType] - :const:`'Disabled'`: if the display mode is disabled. @@ -1463,19 +1413,20 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me .. 
code:: bash nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=display_mode - """ - + """ # pylint: disable=line-too-long return {0: 'Disabled', 1: 'Enabled'}.get( libnvml.nvmlQuery('nvmlDeviceGetDisplayMode', self.handle), NA ) @ttl_cache(ttl=60.0) def current_driver_model(self) -> Union[str, NaType]: - """The driver model currently in use. Always "N/A" on Linux. On Windows, the TCC (WDM) - and WDDM driver models are supported. The TCC driver model is optimized for compute - applications. I.E. kernel launch times will be quicker with TCC. The WDDM driver model - is designed for graphics applications and is not recommended for compute applications. - Linux does not support multiple driver models, and will always have the value of "N/A". + """The driver model currently in use. + + Always "N/A" on Linux. On Windows, the TCC (WDM) and WDDM driver models are supported. The + TCC driver model is optimized for compute applications. I.E. kernel launch times will be + quicker with TCC. The WDDM driver model is designed for graphics applications and is not + recommended for compute applications. Linux does not support multiple driver models, and + will always have the value of "N/A". Returns: Union[str, NaType] - :const:`'WDDM'`: for WDDM driver model on Windows. @@ -1488,7 +1439,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=driver_model.current """ - return {libnvml.NVML_DRIVER_WDDM: 'WDDM', libnvml.NVML_DRIVER_WDM: 'WDM'}.get( libnvml.nvmlQuery('nvmlDeviceGetCurrentDriverModel', self.handle), NA ) @@ -1497,10 +1447,11 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me @ttl_cache(ttl=60.0) def persistence_mode(self) -> Union[str, NaType]: - """A flag that indicates whether persistence mode is enabled for the GPU. Value is either - "Enabled" or "Disabled". 
When persistence mode is enabled the NVIDIA driver remains loaded - even when no active clients, such as X11 or nvidia-smi, exist. This minimizes the driver - load latency associated with running dependent apps, such as CUDA programs. Linux only. + """A flag that indicates whether persistence mode is enabled for the GPU. Value is either "Enabled" or "Disabled". + + When persistence mode is enabled the NVIDIA driver remains loaded even when no active + clients, such as X11 or nvidia-smi, exist. This minimizes the driver load latency associated + with running dependent apps, such as CUDA programs. Linux only. Returns: Union[str, NaType] - :const:`'Disabled'`: if the persistence mode is disabled. @@ -1512,16 +1463,14 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me .. code:: bash nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=persistence_mode - """ - + """ # pylint: disable=line-too-long return {0: 'Disabled', 1: 'Enabled'}.get( libnvml.nvmlQuery('nvmlDeviceGetPersistenceMode', self.handle), NA ) @ttl_cache(ttl=5.0) def performance_state(self) -> Union[str, NaType]: - """The current performance state for the GPU. States range from P0 (maximum performance) to - P12 (minimum performance). + """The current performance state for the GPU. States range from P0 (maximum performance) to P12 (minimum performance). Returns: Union[str, NaType] The current performance state in format ``P``, or :const:`nvitop.NA` when not applicable. @@ -1531,8 +1480,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me .. 
code:: bash nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=pstate - """ - + """ # pylint: disable=line-too-long performance_state = libnvml.nvmlQuery('nvmlDeviceGetPerformanceState', self.handle) if libnvml.nvmlCheckReturn(performance_state, int): performance_state = 'P' + str(performance_state) @@ -1551,7 +1499,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=ecc.errors.uncorrected.volatile.total """ # pylint: disable=line-too-long - return libnvml.nvmlQuery( 'nvmlDeviceGetTotalEccErrors', self.handle, @@ -1561,8 +1508,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me @ttl_cache(ttl=60.0) def compute_mode(self) -> Union[str, NaType]: - """The compute mode flag indicates whether individual or multiple compute applications may - run on the GPU. + """The compute mode flag indicates whether individual or multiple compute applications may run on the GPU. Returns: Union[str, NaType] - :const:`'Default'`: means multiple contexts are allowed per device. 
@@ -1577,7 +1523,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=compute_mode """ # pylint: disable=line-too-long - return { libnvml.NVML_COMPUTEMODE_DEFAULT: 'Default', libnvml.NVML_COMPUTEMODE_EXCLUSIVE_THREAD: 'Exclusive Thread', @@ -1597,7 +1542,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=compute_cap """ - if self._cuda_compute_capability is None: self._cuda_compute_capability = libnvml.nvmlQuery( 'nvmlDeviceGetCudaComputeCapability', self.handle @@ -1605,8 +1549,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me return self._cuda_compute_capability def is_mig_device(self) -> bool: - """Returns whether or not the device is a MIG device.""" - + """Return whether or not the device is a MIG device.""" if self._is_mig_device is None: is_mig_device = libnvml.nvmlQuery( 'nvmlDeviceIsMigDeviceHandle', @@ -1632,7 +1575,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=mig.mode.current """ - if self.is_mig_device(): return NA @@ -1642,48 +1584,51 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me return {0: 'Disabled', 1: 'Enabled'}.get(mig_mode, NA) def is_mig_mode_enabled(self) -> bool: - """Returns whether the MIG mode is enabled on the device. Returns :data:`False` if MIG mode is - disabled or the device does not support MIG mode. - """ + """Test whether the MIG mode is enabled on the device. + Return :data:`False` if MIG mode is disabled or the device does not support MIG mode. + """ return boolify(self.mig_mode()) def max_mig_device_count(self) -> int: - """Returns the maximum number of MIG instances the device supports. Returns 0 if the device - does not support MIG mode. 
- """ + """Return the maximum number of MIG instances the device supports. + This method will return 0 if the device does not support MIG mode. + """ return 0 # implemented in PhysicalDevice def mig_devices(self) -> List['MigDevice']: - """Returns a list of children MIG devices of the current device. Returns an empty list if - the MIG mode is disabled or the device does not support MIG mode. - """ + """Return a list of children MIG devices of the current device. + This method will return an empty list if the MIG mode is disabled or the device does not + support MIG mode. + """ return [] # implemented in PhysicalDevice def is_leaf_device(self) -> bool: - """Returns :data:`True` if the device is a physical device with MIG mode disabled or a MIG device. - Otherwise returns :data:`False` if the device is a physical device with MIG mode enabled. - """ + """Test whether the device is a physical device with MIG mode disabled or a MIG device. + Return :data:`True` if the device is a physical device with MIG mode disabled or a MIG device. + Otherwise, return :data:`False` if the device is a physical device with MIG mode enabled. + """ return self.is_mig_device() or not self.is_mig_mode_enabled() def to_leaf_devices(self) -> List[Union['PhysicalDevice', 'MigDevice', 'CudaDevice']]: - """Returns a list of leaf devices. Note that a CUDA device is always a leaf device.""" + """Return a list of leaf devices. + Note that a CUDA device is always a leaf device. + """ if isinstance(self, CudaDevice) or self.is_leaf_device(): return [self] return self.mig_devices() @ttl_cache(ttl=2.0) def processes(self) -> Dict[int, GpuProcess]: - """Returns a dictionary of processes running on the GPU. + """Return a dictionary of processes running on the GPU. Returns: Dict[int, GpuProcess] A dictionary mapping PID to GPU process instance. 
""" - processes = {} found_na = False @@ -1725,8 +1670,10 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me return processes def as_snapshot(self) -> Snapshot: - """Returns a onetime snapshot of the device. The attributes are defined in :attr:`SNAPSHOT_KEYS`.""" + """Return a onetime snapshot of the device. + The attributes are defined in :attr:`SNAPSHOT_KEYS`. + """ with self.oneshot(): return Snapshot( real=self, @@ -1777,8 +1724,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me # Modified from psutil (https://github.com/giampaolo/psutil) @contextlib.contextmanager def oneshot(self): - """Utility context manager which considerably speeds up the retrieval of multiple device - information at the same time. + """A utility context manager which considerably speeds up the retrieval of multiple device information at the same time. Internally different device info (e.g. memory_info, utilization_rates, ...) may be fetched by using the same routine, but only one information is returned and the others are discarded. @@ -1789,7 +1735,6 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me time you retrieve more than one information about the device. Examples: - >>> from nvitop import Device >>> device = Device(0) >>> with device.oneshot(): @@ -1797,8 +1742,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me ... device.memory_used() # return cached value ... device.memory_free_human() # return cached value ... device.memory_percent() # return cached value - """ - + """ # pylint: disable=line-too-long with self._lock: # pylint: disable=no-member if hasattr(self, '_cache'): @@ -1839,7 +1783,10 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me class PhysicalDevice(Device): - """Class for physical devices. This is the real GPU installed in the system.""" + """Class for physical devices. 
+ + This is the real GPU installed in the system. + """ @property def physical_index(self) -> int: @@ -1851,37 +1798,36 @@ class PhysicalDevice(Device): nvidia-smi --id= --format=csv,noheader,nounits --query-gpu=index """ - return self._nvml_index @ttl_cache(ttl=60.0) def max_mig_device_count(self) -> int: - """Returns the maximum number of MIG instances the device supports. Returns 0 if the device - does not support MIG mode. - """ + """Return the maximum number of MIG instances the device supports. + This method will return 0 if the device does not support MIG mode. + """ return libnvml.nvmlQuery( 'nvmlDeviceGetMaxMigDeviceCount', self.handle, default=0, ignore_function_not_found=True ) @ttl_cache(ttl=60.0) def mig_device(self, mig_index: int) -> 'MigDevice': - """Returns a child MIG device of the given index. + """Return a child MIG device of the given index. Raises: libnvml.NVMLError: If the device does not support MIG mode or the given MIG device does not exist. """ - with _global_physical_device(self): return MigDevice(index=(self.index, mig_index)) @ttl_cache(ttl=60.0) def mig_devices(self) -> List['MigDevice']: - """Returns a list of children MIG devices of the current device. Returns an empty list if - the MIG mode is disabled or the device does not support MIG mode. - """ + """Return a list of children MIG devices of the current device. + This method will return an empty list if the MIG mode is disabled or the device does not + support MIG mode. + """ mig_devices = [] if self.is_mig_mode_enabled(): @@ -1903,14 +1849,12 @@ class MigDevice(Device): # pylint: disable=too-many-instance-attributes @classmethod def count(cls) -> int: - """The number of total MIG devices. Aggregated over all physical devices.""" - + """The number of total MIG devices aggregated over all physical devices.""" return len(cls.all()) @classmethod def all(cls) -> List['MigDevice']: - """Returns a list of MIG devices. 
Aggregated over all physical devices.""" - + """Return a list of MIG devices aggregated over all physical devices.""" mig_devices = [] for device in PhysicalDevice.all(): mig_devices.extend(device.mig_devices()) @@ -1920,7 +1864,7 @@ class MigDevice(Device): # pylint: disable=too-many-instance-attributes def from_indices( # pylint: disable=signature-differs cls, indices: Iterable[Tuple[int, int]] ) -> List['MigDevice']: - """Returns a list of MIG devices of the given indices. + """Return a list of MIG devices of the given indices. Args: indices (Iterable[Tuple[int, int]]): @@ -1940,14 +1884,13 @@ class MigDevice(Device): # pylint: disable=too-many-instance-attributes libnvml.NVMLError_NotFound: If the device is not found for the given NVML identifier. """ - return list(map(cls, indices)) # pylint: disable-next=super-init-not-called def __init__( self, index: Optional[Union[Tuple[int, int], str]] = None, *, uuid: Optional[str] = None ) -> None: - """Initializes the instance created by :meth:`__new__()`. + """Initialize the instance created by :meth:`__new__()`. Raises: libnvml.NVMLError_LibraryNotFound: @@ -1960,7 +1903,6 @@ class MigDevice(Device): # pylint: disable=too-many-instance-attributes libnvml.NVMLError_NotFound: If the device is not found for the given NVML identifier. """ - if isinstance(index, str) and self.UUID_PATTERN.match(index) is not None: # passed by UUID index, uuid = None, index @@ -2025,25 +1967,21 @@ class MigDevice(Device): # pylint: disable=too-many-instance-attributes @property def index(self) -> Tuple[int, int]: """The index of the MIG device. 
This is a tuple of two integers.""" - return self._nvml_index @property def physical_index(self) -> int: """The index of the parent physical device.""" - return self._nvml_index[0] @property def mig_index(self) -> int: """The index of the MIG device over the all MIG devices of the parent device.""" - return self._nvml_index[1] @property def parent(self) -> PhysicalDevice: """The parent physical device.""" - return self._parent def gpu_instance_id(self) -> Union[int, NaType]: @@ -2052,7 +1990,6 @@ class MigDevice(Device): # pylint: disable=too-many-instance-attributes Returns: Union[int, NaType] The gpu instance ID of the MIG device, or :const:`nvitop.NA` when not applicable. """ - if self._gpu_instance_id is NA: self._gpu_instance_id = libnvml.nvmlQuery( 'nvmlDeviceGetGpuInstanceId', self.handle, default=0xFFFFFFFF @@ -2067,7 +2004,6 @@ class MigDevice(Device): # pylint: disable=too-many-instance-attributes Returns: Union[int, NaType] The compute instance ID of the MIG device, or :const:`nvitop.NA` when not applicable. """ - if self._compute_instance_id is NA: self._compute_instance_id = libnvml.nvmlQuery( 'nvmlDeviceGetComputeInstanceId', self.handle, default=0xFFFFFFFF @@ -2077,8 +2013,10 @@ class MigDevice(Device): # pylint: disable=too-many-instance-attributes return self._compute_instance_id def as_snapshot(self) -> Snapshot: - """Returns a onetime snapshot of the device. The attributes are defined in :attr:`SNAPSHOT_KEYS`.""" + """Return a onetime snapshot of the device. + The attributes are defined in :attr:`SNAPSHOT_KEYS`. + """ snapshot = super().as_snapshot() snapshot.mig_index = self.mig_index @@ -2088,8 +2026,9 @@ class MigDevice(Device): # pylint: disable=too-many-instance-attributes class CudaDevice(Device): - """Class for devices enumerated over the CUDA ordinal. The order can be vary for different - ``CUDA_VISIBLE_DEVICES`` environment variable. + """Class for devices enumerated over the CUDA ordinal. 
+ + The order can be vary for different ``CUDA_VISIBLE_DEVICES`` environment variable. See also for CUDA Device Enumeration: - `CUDA Environment Variables `_ @@ -2105,7 +2044,6 @@ class CudaDevice(Device): - (nvml_index: (int, int)) -> CudaMigDevice Examples: - >>> import os >>> os.environ['CUDA_VISIBLE_DEVICES'] = '3,2,1,0' @@ -2158,14 +2096,12 @@ class CudaDevice(Device): @classmethod def is_available(cls) -> bool: - """Returns whether there are any CUDA-capable devices available.""" - + """Test whether there are any CUDA-capable devices available.""" return cls.count() > 0 @classmethod def count(cls) -> int: """The number of GPUs visible to CUDA applications.""" - try: return len(super().parse_cuda_visible_devices()) except libnvml.NVMLError: @@ -2178,14 +2114,14 @@ class CudaDevice(Device): Note: The result could be empty if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid. """ - return cls.from_indices() @classmethod def from_indices( cls, indices: Optional[Union[int, Iterable[int]]] = None ) -> List['CudaDevice']: - """Returns a list of CUDA devices of the given CUDA indices. + """Return a list of CUDA devices of the given CUDA indices. + The CUDA ordinal will be enumerate from the ``CUDA_VISIBLE_DEVICES`` environment variable. See also for CUDA Device Enumeration: @@ -2210,7 +2146,6 @@ class CudaDevice(Device): RuntimeError: If the index is out of range for the given ``CUDA_VISIBLE_DEVICES`` environment variable. """ - return super().from_cuda_indices(indices) def __new__( @@ -2220,7 +2155,9 @@ class CudaDevice(Device): nvml_index: Optional[Union[int, Tuple[int, int]]] = None, uuid: Optional[str] = None, ) -> 'Device': - """Creates a new instance of CudaDevice. The type of the result is determined by the given argument. + """Create a new instance of CudaDevice. + + The type of the result is determined by the given argument. .. 
code-block:: python @@ -2242,7 +2179,6 @@ class CudaDevice(Device): RuntimeError: If the index is out of range for the given ``CUDA_VISIBLE_DEVICES`` environment variable. """ - if cuda_index is not None and nvml_index is None and uuid is None: cuda_visible_devices = cls.parse_cuda_visible_devices() if not isinstance(cuda_index, int) or not 0 <= cuda_index < len(cuda_visible_devices): @@ -2261,7 +2197,7 @@ class CudaDevice(Device): nvml_index: Optional[Union[int, Tuple[int, int]]] = None, uuid: Optional[str] = None, ) -> None: - """Initializes the instance created by :meth:`__new__()`. + """Initialize the instance created by :meth:`__new__()`. Raises: libnvml.NVMLError_LibraryNotFound: @@ -2279,7 +2215,6 @@ class CudaDevice(Device): If the given device is not visible to CUDA applications (i.e. not listed in the ``CUDA_VISIBLE_DEVICES`` environment variable or the environment variable is invalid). """ - if cuda_index is not None and nvml_index is None and uuid is None: cuda_visible_devices = self.parse_cuda_visible_devices() if not isinstance(cuda_index, int) or not 0 <= cuda_index < len(cuda_visible_devices): @@ -2295,6 +2230,7 @@ class CudaDevice(Device): self._ident = ((self._cuda_index, self.index), self.uuid()) def __str__(self) -> str: + """Return a string representation of the CUDA device.""" return '{}(cuda_index={}, nvml_index={}, name="{}", total_memory={})'.format( self.__class__.__name__, self.cuda_index, @@ -2306,11 +2242,14 @@ class CudaDevice(Device): __repr__ = __str__ def __reduce__(self) -> Tuple[Type['CudaDevice'], Tuple[int]]: + """Return state information for pickling.""" return self.__class__, (self._cuda_index,) def as_snapshot(self) -> Snapshot: - """Returns a onetime snapshot of the device. The attributes are defined in :attr:`SNAPSHOT_KEYS`.""" + """Return a onetime snapshot of the device. + The attributes are defined in :attr:`SNAPSHOT_KEYS`. 
+ """ snapshot = super().as_snapshot() snapshot.cuda_index = self.cuda_index @@ -2326,8 +2265,7 @@ class CudaMigDevice(CudaDevice, MigDevice): def is_mig_device_uuid(uuid: Optional[str]) -> bool: - """Returns :data:`True` if the argument is a MIG device UUID, otherwise, returns :data:`False`.""" - + """Return :data:`True` if the argument is a MIG device UUID, otherwise, return :data:`False`.""" if isinstance(uuid, str): match = Device.UUID_PATTERN.match(uuid) if match is not None and match.group('MigMode') is not None: @@ -2338,8 +2276,9 @@ def is_mig_device_uuid(uuid: Optional[str]) -> bool: def parse_cuda_visible_devices( cuda_visible_devices: Optional[str] = _VALUE_OMITTED, ) -> Union[List[int], List[Tuple[int, int]]]: - """Parses the given ``CUDA_VISIBLE_DEVICES`` value into a list of NVML device indices. - Aliased by :meth:`Device.parse_cuda_visible_devices`. + """Parse the given ``CUDA_VISIBLE_DEVICES`` value into a list of NVML device indices. + + This function is aliased by :meth:`Device.parse_cuda_visible_devices`. Note: The result could be empty if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid. @@ -2359,7 +2298,6 @@ def parse_cuda_visible_devices( corresponding real device indices. Examples: - >>> import os >>> os.environ['CUDA_VISIBLE_DEVICES'] = '6,5' >>> parse_cuda_visible_devices() # parse the `CUDA_VISIBLE_DEVICES` environment variable to NVML indices @@ -2388,7 +2326,6 @@ def parse_cuda_visible_devices( >>> parse_cuda_visible_devices('16') # invalid `CUDA_VISIBLE_DEVICES` (device ordinal out of range) [] """ # pylint: disable=line-too-long - if cuda_visible_devices is _VALUE_OMITTED: cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', default=None) @@ -2396,8 +2333,9 @@ def parse_cuda_visible_devices( def normalize_cuda_visible_devices(cuda_visible_devices: Optional[str] = _VALUE_OMITTED) -> str: - """Parses the given ``CUDA_VISIBLE_DEVICES`` value and convert it into a comma-separated string of UUIDs. 
- Aliased by :meth:`Device.normalize_cuda_visible_devices`. + """Parse the given ``CUDA_VISIBLE_DEVICES`` value and convert it into a comma-separated string of UUIDs. + + This function is aliased by :meth:`Device.normalize_cuda_visible_devices`. Note: The result could be empty string if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid. @@ -2416,7 +2354,6 @@ def normalize_cuda_visible_devices(cuda_visible_devices: Optional[str] = _VALUE_ The comma-separated string (GPU UUIDs) of the ``CUDA_VISIBLE_DEVICES`` environment variable. Examples: - >>> import os >>> os.environ['CUDA_VISIBLE_DEVICES'] = '6,5' >>> normalize_cuda_visible_devices() # normalize the `CUDA_VISIBLE_DEVICES` environment variable to UUID strings @@ -2445,7 +2382,6 @@ def normalize_cuda_visible_devices(cuda_visible_devices: Optional[str] = _VALUE_ >>> normalize_cuda_visible_devices('16') # invalid `CUDA_VISIBLE_DEVICES` (device ordinal out of range) '' """ # pylint: disable=line-too-long - if cuda_visible_devices is _VALUE_OMITTED: cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', default=None) @@ -2519,7 +2455,6 @@ def _parse_cuda_visible_devices( # pylint: disable=too-many-branches,too-many-s format: str = 'index', # pylint: disable=redefined-builtin ) -> Union[List[int], List[Tuple[int, int]], List[str]]: """The underlining implementation for :meth:`parse_cuda_visible_devices`. The result will be cached.""" - assert format in ('index', 'uuid') try: @@ -2609,8 +2544,9 @@ def _parse_cuda_visible_devices_to_uuids( cuda_visible_devices: Optional[str] = _VALUE_OMITTED, verbose=True, ) -> List[str]: - """Parses the given ``CUDA_VISIBLE_DEVICES`` environment variable in a separate process and - returns a list of device UUIDs. The UUIDs do not have a prefix ``GPU-`` or ``MIG-``. + """Parse the given ``CUDA_VISIBLE_DEVICES`` environment variable in a separate process and return a list of device UUIDs. + + The UUIDs do not have a prefix ``GPU-`` or ``MIG-``. 
Args: cuda_visible_devices (Optional[str]): @@ -2626,8 +2562,7 @@ def _parse_cuda_visible_devices_to_uuids( If cannot found the CUDA driver libraries. libcuda.CUDAError: If failed to parse the ``CUDA_VISIBLE_DEVICES`` environment variable. - """ - + """ # pylint: disable=line-too-long if cuda_visible_devices is _VALUE_OMITTED: cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', default=None) diff --git a/nvitop/core/host.py b/nvitop/core/host.py index f175f1f..bd9f6cc 100644 --- a/nvitop/core/host.py +++ b/nvitop/core/host.py @@ -16,8 +16,8 @@ # ============================================================================== """Shortcuts for package ``psutil``. -psutil is a cross-platform library for retrieving information on running processes -and system utilization (CPU, memory, disks, network, sensors) in Python. +``psutil`` is a cross-platform library for retrieving information on running processes and system +utilization (CPU, memory, disks, network, sensors) in Python. """ import os as _os @@ -50,31 +50,30 @@ swap_memory = _ttl_cache(ttl=0.25)(_psutil.swap_memory) try: load_average = _ttl_cache(ttl=2.0)(_psutil.getloadavg) + load_average.__doc__ = """Get the system load average.""" except AttributeError: - def load_average(): # pylint: disable=missing-function-docstring + def load_average(): + """Get the system load average.""" return None def memory_percent(): - """The percentage usage of virtual memory, calculated as (total - available) / total * 100.""" - + """The percentage usage of virtual memory, calculated as ``(total - available) / total * 100``.""" return virtual_memory().percent def swap_percent(): - """The percentage usage of virtual memory, calculated as used / total * 100.""" - + """The percentage usage of virtual memory, calculated as ``used / total * 100``.""" return swap_memory().percent ppid_map = _psutil._ppid_map # pylint: disable=protected-access -"""Obtains a ``{pid: ppid, ...}`` dict for all running processes in one shot.""" 
+"""Obtain a ``{pid: ppid, ...}`` dict for all running processes in one shot.""" def reverse_ppid_map(): # pylint: disable=function-redefined - """Obtains a ``{ppid: [pid, ...], ...}`` dict for all running processes in one shot.""" - + """Obtain a ``{ppid: [pid, ...], ...}`` dict for all running processes in one shot.""" from collections import defaultdict # pylint: disable=import-outside-toplevel tree = defaultdict(list) diff --git a/nvitop/core/libcuda.py b/nvitop/core/libcuda.py index eb7d7ca..298d386 100644 --- a/nvitop/core/libcuda.py +++ b/nvitop/core/libcuda.py @@ -29,11 +29,11 @@ from typing import Type as _Type # pylint: disable-next=missing-class-docstring,too-few-public-methods -class struct_c_CUdevice_t(_ctypes.Structure): +class _struct_c_CUdevice_t(_ctypes.Structure): pass # opaque handle -c_CUdevice_t = _ctypes.POINTER(struct_c_CUdevice_t) +_c_CUdevice_t = _ctypes.POINTER(_struct_c_CUdevice_t) _CUresult_t = _ctypes.c_uint @@ -229,8 +229,7 @@ class CUDAError(Exception): _errcode_to_name = {} def __new__(cls, value: int) -> 'CUDAError': - """Maps value to a proper subclass of :class:`CUDAError`.""" - + """Map value to a proper subclass of :class:`CUDAError`.""" if cls is CUDAError: # pylint: disable-next=self-cls-assignment cls = CUDAError._value_class_mapping.get(value, cls) @@ -239,6 +238,7 @@ class CUDAError(Exception): return obj def __str__(self) -> str: + """Return a string representation of the error.""" # pylint: disable=no-member try: if self.value not in CUDAError._errcode_to_string: @@ -255,30 +255,32 @@ class CUDAError(Exception): except CUDAError: return f'CUDA Error with code {self.value}.' 
+ __repr__ = __str__ + def __eq__(self, other: object) -> bool: + """Test equality to other object.""" if not isinstance(other, CUDAError): return NotImplemented return self.value == other.value # pylint: disable=no-member def __reduce__(self) -> _Tuple[_Type['CUDAError'], _Tuple[int]]: + """Return state information for pickling.""" return CUDAError, (self.value,) # pylint: disable=no-member def cudaExceptionClass(cudaErrorCode: int) -> _Type[CUDAError]: - """Maps value to a proper subclass of :class:`CUDAError`. + """Map value to a proper subclass of :class:`CUDAError`. Raises: ValueError: If the error code is not valid. """ - - # pylint: disable=protected-access - if cudaErrorCode not in CUDAError._value_class_mapping: + if cudaErrorCode not in CUDAError._value_class_mapping: # pylint: disable=protected-access raise ValueError(f'cudaErrorCode {cudaErrorCode} is not valid.') - return CUDAError._value_class_mapping[cudaErrorCode] + return CUDAError._value_class_mapping[cudaErrorCode] # pylint: disable=protected-access def _extract_cuda_errors_as_classes() -> None: - """Generates a hierarchy of classes on top of :class:`CUDAError` class. + """Generate a hierarchy of classes on top of :class:`CUDAError` class. Each CUDA Error gets a new :class:`CUDAError` subclass. This way try-except blocks can filter appropriate exceptions more easily. @@ -286,7 +288,6 @@ def _extract_cuda_errors_as_classes() -> None: :class:`CUDAError` is a parent class. Each ``CUDA_ERROR_*`` gets it's own subclass. e.g. :data:`CUDA_ERROR_INVALID_VALUE` will be turned into :class:`CUDAError_InvalidValue`. """ - this_module = _sys.modules[__name__] cuda_error_names = [x for x in dir(this_module) if x.startswith('CUDA_ERROR_')] for err_name in cuda_error_names: @@ -339,8 +340,7 @@ __cudaGetFunctionPointer_cache = {} def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr: - """ - Get the function pointer from the CUDA driver library. + """Get the function pointer from the CUDA driver library. 
Raises: CUDAError_NotInitialized: @@ -348,7 +348,6 @@ def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr: CUDAError_NotFound: If cannot found the function pointer. """ - if name in __cudaGetFunctionPointer_cache: return __cudaGetFunctionPointer_cache[name] @@ -364,14 +363,12 @@ def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr: def __LoadCudaLibrary() -> None: - """ - Load the library if it isn't loaded already. + """Load the library if it isn't loaded already. Raises: CUDAError_NotInitialized: If cannot found the CUDA driver library. """ - global __cudaLib # pylint: disable=global-statement if __cudaLib is None: @@ -409,7 +406,7 @@ def __LoadCudaLibrary() -> None: def cuInit(flags: int = 0) -> None: """Initialize the CUDA driver API. - Initializes the driver API and must be called before any other function from the driver API. + Initialize the driver API and must be called before any other function from the driver API. Currently, the ``flags`` parameter must be :data:`0`. If :func:`cuInit` has not been called, any function from the driver API will return :data:`CUDA_ERROR_NOT_INITIALIZED`. @@ -429,7 +426,6 @@ def cuInit(flags: int = 0) -> None: CUDAError_NotInitialized: If cannot found the CUDA driver library. """ - global __initialized # pylint: disable=global-statement __LoadCudaLibrary() @@ -447,7 +443,7 @@ def cuInit(flags: int = 0) -> None: def cuGetErrorName(error: int) -> str: - """Gets the string representation of an error code enum name. + """Get the string representation of an error code enum name. Raises: CUDAError_InvalidValue: @@ -455,7 +451,6 @@ def cuGetErrorName(error: int) -> str: CUDAError_NotInitialized: If the CUDA driver API is not initialized. """ - fn = __cudaGetFunctionPointer('cuGetErrorName') p_name = _ctypes.POINTER(_ctypes.c_char_p)() @@ -466,7 +461,7 @@ def cuGetErrorName(error: int) -> str: def cuGetErrorString(error: int) -> str: - """Gets the string description of an error code. 
+ """Get the string description of an error code. Raises: CUDAError_InvalidValue: @@ -474,7 +469,6 @@ def cuGetErrorString(error: int) -> str: CUDAError_NotInitialized: If the CUDA driver API is not initialized. """ - fn = __cudaGetFunctionPointer('cuGetErrorString') p_name = _ctypes.POINTER(_ctypes.c_char_p)() @@ -485,7 +479,7 @@ def cuGetErrorString(error: int) -> str: def cuDriverGetVersion() -> str: - """Returns the latest CUDA version supported by driver. + """Get the latest CUDA version supported by driver. Returns: A string of the form :data:`'.'`. @@ -496,7 +490,6 @@ def cuDriverGetVersion() -> str: CUDAError_NotInitialized: If the CUDA driver API is not initialized. """ - fn = __cudaGetFunctionPointer('cuDriverGetVersion') driver_version = _ctypes.c_int() @@ -508,7 +501,7 @@ def cuDriverGetVersion() -> str: def cuDeviceGetCount() -> int: - """Returns the number of compute-capable devices. + """Get the number of compute-capable devices. Returns: int The number of devices with compute capability greater than or equal to 2.0 that are available @@ -524,7 +517,6 @@ def cuDeviceGetCount() -> int: CUDAError_NotInitialized: If the CUDA driver API is not initialized. """ - fn = __cudaGetFunctionPointer('cuDeviceGetCount') count = _ctypes.c_int(0) @@ -533,8 +525,8 @@ def cuDeviceGetCount() -> int: return count.value -def cuDeviceGet(ordinal: int) -> c_CUdevice_t: - """Returns a handle to a compute device. +def cuDeviceGet(ordinal: int) -> _c_CUdevice_t: + """Get a handle to a compute device. Returns: A device handle given an ordinal in the range :code:`[0, ..., cuDeviceGetCount() - 1]`. @@ -552,20 +544,19 @@ def cuDeviceGet(ordinal: int) -> c_CUdevice_t: CUDAError_NotInitialized: If the CUDA driver API is not initialized. 
""" - fn = __cudaGetFunctionPointer('cuDeviceGet') - device = c_CUdevice_t() + device = _c_CUdevice_t() ret = fn(_ctypes.byref(device), _ctypes.c_int(ordinal)) _cudaCheckReturn(ret) return device -def cuDeviceGetByPCIBusId(pciBusId: str) -> c_CUdevice_t: - """Returns a handle to a compute device. +def cuDeviceGetByPCIBusId(pciBusId: str) -> _c_CUdevice_t: + """Get a handle to a compute device. Args: - pciBusId: str + pciBusId (str): String in one of the following forms: ``[domain]:[bus]:[device].[function]``, ``[domain]:[bus]:[device]``, ``[bus]:[device].[function]`` where ``domain``, ``bus``, ``device``, and ``function`` are all hexadecimal values. @@ -584,17 +575,16 @@ def cuDeviceGetByPCIBusId(pciBusId: str) -> c_CUdevice_t: CUDAError_NotInitialized: If the CUDA driver API is not initialized. """ - fn = __cudaGetFunctionPointer('cuDeviceGetByPCIBusId') - device = c_CUdevice_t() + device = _c_CUdevice_t() ret = fn(_ctypes.byref(device), _ctypes.c_char_p(pciBusId.encode('UTF-8'))) _cudaCheckReturn(ret) return device -def cuDeviceGetPCIBusId(device: c_CUdevice_t) -> str: - """Returns a PCI Bus Id string for the device. +def cuDeviceGetPCIBusId(device: _c_CUdevice_t) -> str: + """Get a PCI Bus Id string for the device. Returns: str An identifier string for the device in the following format ``[domain]:[bus]:[device].[function]`` @@ -611,7 +601,6 @@ def cuDeviceGetPCIBusId(device: c_CUdevice_t) -> str: CUDAError_NotInitialized: If the CUDA driver API is not initialized. """ - fn = __cudaGetFunctionPointer('cuDeviceGetPCIBusId') pciBusId = _ctypes.create_string_buffer(256) @@ -620,8 +609,8 @@ def cuDeviceGetPCIBusId(device: c_CUdevice_t) -> str: return pciBusId.value.decode('UTF-8', errors='replace') -def cuDeviceGetName(device: c_CUdevice_t) -> str: - """Returns an identifier string for the device. +def cuDeviceGetName(device: _c_CUdevice_t) -> str: + """Get an identifier string for the device. Returns: str An ASCII string identifying the device. 
@@ -639,7 +628,6 @@ def cuDeviceGetName(device: c_CUdevice_t) -> str: CUDAError_NotInitialized: If the CUDA driver API is not initialized. """ - fn = __cudaGetFunctionPointer('cuDeviceGetName') name = _ctypes.create_string_buffer(256) @@ -648,8 +636,8 @@ def cuDeviceGetName(device: c_CUdevice_t) -> str: return name.value.decode('UTF-8', errors='replace') -def cuDeviceGetUuid(device: c_CUdevice_t) -> str: - """Returns a UUID for the device. +def cuDeviceGetUuid(device: _c_CUdevice_t) -> str: + """Get a UUID for the device. Raises: CUDAError_InvalidDevice: @@ -662,7 +650,6 @@ def cuDeviceGetUuid(device: c_CUdevice_t) -> str: CUDAError_NotInitialized: If the CUDA driver API is not initialized. """ - try: fn = __cudaGetFunctionPointer('cuDeviceGetUuid_v2') except AttributeError: @@ -676,8 +663,8 @@ def cuDeviceGetUuid(device: c_CUdevice_t) -> str: return '-'.join((uuid[:8], uuid[8:12], uuid[12:16], uuid[16:20], uuid[20:32])) -def cuDeviceGetUuid_v2(device: c_CUdevice_t) -> str: - """Returns a UUID for the device (CUDA 11.4+). +def cuDeviceGetUuid_v2(device: _c_CUdevice_t) -> str: + """Get a UUID for the device (CUDA 11.4+). Raises: CUDAError_InvalidDevice: @@ -690,7 +677,6 @@ def cuDeviceGetUuid_v2(device: c_CUdevice_t) -> str: CUDAError_NotInitialized: If the CUDA driver API is not initialized. """ - fn = __cudaGetFunctionPointer('cuDeviceGetUuid_v2') ubyte_array = _ctypes.c_ubyte * 16 @@ -701,8 +687,8 @@ def cuDeviceGetUuid_v2(device: c_CUdevice_t) -> str: return '-'.join((uuid[:8], uuid[8:12], uuid[12:16], uuid[16:20], uuid[20:32])) -def cuDeviceTotalMem(device: c_CUdevice_t) -> int: - """Returns the total amount of memory on the device (in bytes). +def cuDeviceTotalMem(device: _c_CUdevice_t) -> int: + """Get the total amount of memory on the device (in bytes). Raises: CUDAError_InvalidContext: @@ -717,7 +703,6 @@ def cuDeviceTotalMem(device: c_CUdevice_t) -> int: CUDAError_NotInitialized: If the CUDA driver API is not initialized. 
""" - fn = __cudaGetFunctionPointer('cuDeviceTotalMem') bytes = _ctypes.c_size_t() # pylint: disable=redefined-builtin @@ -727,8 +712,7 @@ def cuDeviceTotalMem(device: c_CUdevice_t) -> int: def is_available() -> bool: - """Whether there are any CUDA visible devices.""" - + """Test whether there are any CUDA visible devices.""" try: return cuDeviceGetCount() > 0 except CUDAError: diff --git a/nvitop/core/libcudart.py b/nvitop/core/libcudart.py index f7e2bf0..1ac3fa9 100644 --- a/nvitop/core/libcudart.py +++ b/nvitop/core/libcudart.py @@ -281,8 +281,7 @@ class cudaError(Exception): _errcode_to_name = {} def __new__(cls, value: int) -> 'cudaError': - """Maps value to a proper subclass of :class:`cudaError`.""" - + """Map value to a proper subclass of :class:`cudaError`.""" if cls is cudaError: # pylint: disable-next=self-cls-assignment cls = cudaError._value_class_mapping.get(value, cls) @@ -291,6 +290,7 @@ class cudaError(Exception): return obj def __str__(self) -> str: + """Return a string representation of the error.""" # pylint: disable=no-member try: if self.value not in cudaError._errcode_to_string: @@ -307,30 +307,32 @@ class cudaError(Exception): except cudaError: return f'CUDA Error with code {self.value}.' + __repr__ = __str__ + def __eq__(self, other: object) -> bool: + """Test equality to other object.""" if not isinstance(other, cudaError): return NotImplemented return self.value == other.value # pylint: disable=no-member def __reduce__(self) -> _Tuple[_Type['cudaError'], _Tuple[int]]: + """Return state information for pickling.""" return cudaError, (self.value,) # pylint: disable=no-member def cudaExceptionClass(cudaErrorCode: int) -> _Type[cudaError]: - """Maps value to a proper subclass of :class:`cudaError`. + """Map value to a proper subclass of :class:`cudaError`. Raises: ValueError: If the error code is not valid. 
""" - - # pylint: disable=protected-access - if cudaErrorCode not in cudaError._value_class_mapping: + if cudaErrorCode not in cudaError._value_class_mapping: # pylint: disable=protected-access raise ValueError(f'cudaErrorCode {cudaErrorCode} is not valid.') - return cudaError._value_class_mapping[cudaErrorCode] + return cudaError._value_class_mapping[cudaErrorCode] # pylint: disable=protected-access def _extract_cuda_errors_as_classes() -> None: - """Generates a hierarchy of classes on top of :class:`cudaError` class. + """Generate a hierarchy of classes on top of :class:`cudaError` class. Each CUDA Error gets a new :class:`cudaError` subclass. This way try-except blocks can filter appropriate exceptions more easily. @@ -338,7 +340,6 @@ def _extract_cuda_errors_as_classes() -> None: :class:`cudaError` is a parent class. Each ``cudaError*`` gets it's own subclass. e.g. :data:`cudaErrorInvalidValue` will be turned into :class:`cudaError_InvalidValue`. """ - this_module = _sys.modules[__name__] cuda_error_names = [ x @@ -393,8 +394,7 @@ __cudaGetFunctionPointer_cache = {} def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr: - """ - Get the function pointer from the CUDA Runtime library. + """Get the function pointer from the CUDA Runtime library. Raises: cudaError_InitializationError: @@ -402,7 +402,6 @@ def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr: cudaError_SymbolNotFound: If cannot found the function pointer. """ - if name in __cudaGetFunctionPointer_cache: return __cudaGetFunctionPointer_cache[name] @@ -418,14 +417,12 @@ def __cudaGetFunctionPointer(name: str) -> _ctypes._CFuncPtr: def __LoadCudaLibrary() -> None: # pylint: disable=too-many-branches - """ - Load the library if it isn't loaded already. + """Load the library if it isn't loaded already. Raises: cudaError_InitializationError: If cannot found the CUDA Runtime library. 
""" - global __cudaLib # pylint: disable=global-statement if __cudaLib is None: @@ -498,7 +495,7 @@ def __LoadCudaLibrary() -> None: # pylint: disable=too-many-branches def cudaGetErrorName(error: int) -> str: - """Returns the string representation of an error code enum name. + """Get the string representation of an error code enum name. Returns: str A string containing the name of an error code in the enum. If the error code is not @@ -508,7 +505,6 @@ def cudaGetErrorName(error: int) -> str: cudaError_InitializationError: If cannot found the CUDA Runtime library. """ - fn = __cudaGetFunctionPointer('cudaGetErrorName') fn.restype = _ctypes.c_char_p # otherwise return is an int @@ -518,7 +514,7 @@ def cudaGetErrorName(error: int) -> str: def cuGetErrorString(error: int) -> str: - """Returns the description string for an error code. + """Get the description string for an error code. Returns: str The description string for an error code. If the error code is not recognized, "unrecognized @@ -528,7 +524,6 @@ def cuGetErrorString(error: int) -> str: cudaError_InitializationError: If cannot found the CUDA Runtime library. """ - fn = __cudaGetFunctionPointer('cudaGetErrorString') fn.restype = _ctypes.c_char_p # otherwise return is an int @@ -538,7 +533,7 @@ def cuGetErrorString(error: int) -> str: def cudaGetLastError() -> int: - """Returns the last error from a runtime call. + """Get the last error from a runtime call. Returns: int The last error that has been produced by any of the runtime calls in the same instance of @@ -552,13 +547,12 @@ def cudaGetLastError() -> int: cudaError_NoDevice: If no CUDA-capable devices were detected by the installed CUDA driver. """ - fn = __cudaGetFunctionPointer('cudaGetLastError') return fn() def cudaPeekAtLastError() -> int: - """Returns the last error from a runtime call. + """Get the last error from a runtime call. 
Returns: int The last error that has been produced by any of the runtime calls in the same instance of @@ -573,13 +567,12 @@ def cudaPeekAtLastError() -> int: cudaError_NoDevice: If no CUDA-capable devices were detected by the installed CUDA driver. """ - fn = __cudaGetFunctionPointer('cudaPeekAtLastError') return fn() def cudaDriverGetVersion() -> str: - """Returns the latest CUDA version supported by driver. + """Get the latest CUDA version supported by driver. Returns: str The latest version of CUDA supported by the driver of the form :data:`'.'`. @@ -592,7 +585,6 @@ def cudaDriverGetVersion() -> str: cudaError_NoDevice: If no CUDA-capable devices were detected by the installed CUDA driver. """ - fn = __cudaGetFunctionPointer('cudaDriverGetVersion') driver_version = _ctypes.c_int() @@ -604,7 +596,7 @@ def cudaDriverGetVersion() -> str: def cudaRuntimeGetVersion() -> str: - """Returns the CUDA Runtime version. + """Get the CUDA Runtime version. Returns: str The version number of the current CUDA Runtime instance of the form :data:`'.'`. @@ -617,7 +609,6 @@ def cudaRuntimeGetVersion() -> str: cudaError_NoDevice: If no CUDA-capable devices were detected by the installed CUDA driver. """ - fn = __cudaGetFunctionPointer('cudaRuntimeGetVersion') runtime_version = _ctypes.c_int() @@ -629,7 +620,7 @@ def cudaRuntimeGetVersion() -> str: def cudaGetDeviceCount() -> int: - """Returns the number of compute-capable devices. + """Get the number of compute-capable devices. Returns: int The number of devices with compute capability greater or equal to 2.0 that are available for @@ -643,7 +634,6 @@ def cudaGetDeviceCount() -> int: cudaError_NoDevice: If no CUDA-capable devices were detected by the installed CUDA driver. """ - fn = __cudaGetFunctionPointer('cudaGetDeviceCount') count = _ctypes.c_int(0) @@ -653,10 +643,10 @@ def cudaGetDeviceCount() -> int: def cudaDeviceGetByPCIBusId(pciBusId: str) -> int: - """Returns a handle to a compute device. 
+ """Get a handle to a compute device. Args: - pciBusId: str + pciBusId (str): String in one of the following forms: ``[domain]:[bus]:[device].[function]``, ``[domain]:[bus]:[device]``, ``[bus]:[device].[function]`` where ``domain``, ``bus``, ``device``, and ``function`` are all hexadecimal values. @@ -676,7 +666,6 @@ def cudaDeviceGetByPCIBusId(pciBusId: str) -> int: cudaError_InvalidDevice: If the device ordinal supplied by the user does not correspond to a valid CUDA device. """ - fn = __cudaGetFunctionPointer('cudaDeviceGetByPCIBusId') device = _ctypes.c_int() @@ -686,7 +675,7 @@ def cudaDeviceGetByPCIBusId(pciBusId: str) -> int: def cudaDeviceGetPCIBusId(device: int) -> str: - """Returns a PCI Bus Id string for the device. + """Get a PCI Bus Id string for the device. Returns: str An ASCII string identifying the device. @@ -703,7 +692,6 @@ def cudaDeviceGetPCIBusId(device: int) -> str: cudaError_InvalidDevice: If the device ordinal supplied by the user does not correspond to a valid CUDA device. 
""" - fn = __cudaGetFunctionPointer('cudaDeviceGetPCIBusId') pciBusId = _ctypes.create_string_buffer(256) @@ -713,8 +701,7 @@ def cudaDeviceGetPCIBusId(device: int) -> str: def is_available() -> bool: - """Whether there are any CUDA visible devices.""" - + """Test whether there are any CUDA visible devices.""" try: return cudaGetDeviceCount() > 0 except cudaError: diff --git a/nvitop/core/libnvml.py b/nvitop/core/libnvml.py index bff89cd..28b8e3f 100644 --- a/nvitop/core/libnvml.py +++ b/nvitop/core/libnvml.py @@ -67,9 +67,9 @@ if not callable(getattr(_pynvml, 'nvmlInitWithFlags', None)): NVMLError = _pynvml.NVMLError NVMLError.__doc__ = """Base exception class for NVML query errors.""" -NVMLError.__new__.__doc__ = """Maps value to a proper subclass of :class:`NVMLError`.""" +NVMLError.__new__.__doc__ = """Map value to a proper subclass of :class:`NVMLError`.""" nvmlExceptionClass = _pynvml.nvmlExceptionClass -nvmlExceptionClass.__doc__ = """Maps value to a proper subclass of :class:`NVMLError`.""" +nvmlExceptionClass.__doc__ = """Map value to a proper subclass of :class:`NVMLError`.""" # Load members from module `pynvml` and register them in `__all__` and globals. _vars_pynvml = vars(_pynvml) @@ -143,7 +143,7 @@ Functions and Exceptions .. function:: __exit__(*args, **kwargs) -> None - Shutdowns the NVML context in the context manager for ``with`` statement. + Shutdown the NVML context in the context manager for ``with`` statement. """.format('\n\n'.join(_data_docs)) # fmt: skip @@ -203,7 +203,7 @@ VERSIONED_PATTERN = _re.compile(r'^(?P\w+)(?P_v(\d)+)$') def _lazy_init() -> None: - """Lazily initializes the NVML context. + """Lazily initialize the NVML context. Raises: NVMLError_LibraryNotFound: @@ -217,7 +217,6 @@ def _lazy_init() -> None: If cannot find function :func:`pynvml.nvmlInitWithFlags`, usually the :mod:`pynvml` module is overridden by other modules. Need to reinstall package ``nvidia-ml-py``. 
""" - with __lock: if __initialized: return @@ -225,7 +224,7 @@ def _lazy_init() -> None: def nvmlInit() -> None: # pylint: disable=function-redefined - """Initializes the NVML context with default flag (0). + """Initialize the NVML context with default flag (0). Raises: NVMLError_LibraryNotFound: @@ -239,12 +238,11 @@ def nvmlInit() -> None: # pylint: disable=function-redefined If cannot find function :func:`pynvml.nvmlInitWithFlags`, usually the :mod:`pynvml` module is overridden by other modules. Need to reinstall package ``nvidia-ml-py``. """ - nvmlInitWithFlags(0) def nvmlInitWithFlags(flags: int) -> None: # pylint: disable=function-redefined - """Initializes the NVML context with the given flags. + """Initialize the NVML context with the given flags. Raises: NVMLError_LibraryNotFound: @@ -258,7 +256,6 @@ def nvmlInitWithFlags(flags: int) -> None: # pylint: disable=function-redefined If cannot find function :func:`pynvml.nvmlInitWithFlags`, usually the :mod:`pynvml` module is overridden by other modules. Need to reinstall package ``nvidia-ml-py``. """ - global __flags, __initialized # pylint: disable=global-statement,global-variable-not-assigned with __lock: @@ -312,7 +309,7 @@ def nvmlInitWithFlags(flags: int) -> None: # pylint: disable=function-redefined def nvmlShutdown() -> None: # pylint: disable=function-redefined - """Shutdowns the NVML context. + """Shutdown the NVML context. Raises: NVMLError_LibraryNotFound: @@ -325,7 +322,6 @@ def nvmlShutdown() -> None: # pylint: disable=function-redefined NVMLError_Uninitialized: If NVML was not first initialized with :func:`nvmlInit`. """ - global __flags, __initialized # pylint: disable=global-statement,global-variable-not-assigned _pynvml.nvmlShutdown() @@ -345,8 +341,9 @@ def nvmlQuery( ignore_function_not_found: bool = False, **kwargs, ) -> _Any: - """Calls a function with the given arguments from NVML. The NVML context will be automatically - initialized. 
+ """Call a function with the given arguments from NVML. + + The NVML context will be automatically initialized. Args: func (Union[Callable[..., Any], str]): @@ -380,7 +377,6 @@ def nvmlQuery( NVMLError_InvalidArgument: If passed with an invalid argument. """ - global UNKNOWN_FUNCTIONS # pylint: disable=global-statement,global-variable-not-assigned _lazy_init() @@ -429,8 +425,7 @@ def nvmlQuery( def nvmlCheckReturn( retval: _Any, types: _Optional[_Union[_Type, _Tuple[_Type, ...]]] = None ) -> bool: - """Checks the return value is not :const:`nvitop.NA` and is one of the given types.""" - + """Check whether the return value is not :const:`nvitop.NA` and is one of the given types.""" if types is None: return retval != NA return retval != NA and isinstance(retval, types) @@ -474,8 +469,6 @@ def __patch_backward_compatibility_layers() -> None: ) def patch_function_pointers_when_fail(names, callback): - """Patches the function pointers of the NVML library.""" - def wrapper(nvmlGetFunctionPointer): @_functools.wraps(nvmlGetFunctionPointer) def wrapped(name): @@ -586,7 +579,7 @@ _driver_get_memory_info_v2_available = None if not _pynvml_installation_corrupte def nvmlDeviceGetMemoryInfo(handle): # pylint: disable=function-redefined,too-many-branches - """Retrieves the amount of used, free, reserved and total memory available on the device, in bytes. + """Retrieve the amount of used, free, reserved and total memory available on the device, in bytes. Note: - The version 2 API adds additional memory information. The reserved amount is supported on @@ -607,7 +600,6 @@ def nvmlDeviceGetMemoryInfo(handle): # pylint: disable=function-redefined,too-m NVMLError_Unknown: On any unexpected error. 
""" - global _pynvml_get_memory_info_v2_available, _driver_get_memory_info_v2_available # pylint: disable=global-statement _lazy_init() @@ -702,8 +694,7 @@ class _CustomModule(_ModuleType): """ def __getattribute__(self, name: str) -> _Union[_Any, _Callable[..., _Any]]: - """Gets a member from the current module. Fallback to the original package if missing.""" - + """Get a member from the current module. Fallback to the original package if missing.""" try: return super().__getattribute__(name) except AttributeError: @@ -711,18 +702,15 @@ class _CustomModule(_ModuleType): def __enter__(self) -> '_CustomModule': """Entry of the context manager for ``with`` statement.""" - _lazy_init() return self def __exit__(self, *args, **kwargs) -> None: - """Shutdowns the NVML context in the context manager for ``with`` statement.""" - + """Shutdown the NVML context in the context manager for ``with`` statement.""" self.__del__() def __del__(self) -> None: - """Automatically shutdowns the NVML context on destruction.""" - + """Automatically shutdown the NVML context on destruction.""" try: nvmlShutdown() except NVMLError: diff --git a/nvitop/core/process.py b/nvitop/core/process.py index 0389264..c7fcd69 100644 --- a/nvitop/core/process.py +++ b/nvitop/core/process.py @@ -49,8 +49,7 @@ __all__ = ['HostProcess', 'GpuProcess', 'command_join'] if host.POSIX: def add_quotes(s: str) -> str: - """Returns a shell-escaped version of the string.""" - + """Return a shell-escaped version of the string.""" if s == '': return '""' if '$' not in s and '\\' not in s and '\n' not in s: @@ -67,8 +66,7 @@ if host.POSIX: elif host.WINDOWS: def add_quotes(s: str) -> str: - """Returns a shell-escaped version of the string.""" - + """Return a shell-escaped version of the string.""" if s == '': return '""' if '%' not in s and '^' not in s and '\n' not in s: @@ -83,14 +81,12 @@ elif host.WINDOWS: else: def add_quotes(s: str) -> str: - """Returns a shell-escaped version of the string.""" - + """Return a 
shell-escaped version of the string.""" return '"{}"'.format(s.replace('\n', r'\n')) def command_join(cmdline: List[str]) -> str: - """Returns a shell-escaped string from command line arguments.""" - + """Return a shell-escaped string from command line arguments.""" if len(cmdline) == 1 and not ( # May be modified by `setproctitle` os.path.isfile(cmdline[0]) @@ -105,10 +101,10 @@ _USE_FALLBACK_WHEN_RAISE = threading.local() # see also `GpuProcess.failsafe` def auto_garbage_clean(fallback=_RAISE): - """Removes the object references in the instance cache if the method call fails (the process is gone). + """Remove the object references in the instance cache if the method call fails (the process is gone). - The fallback value will be used with `:meth:`GpuProcess.failsafe`` context manager, otherwise raises an - exception when falls. + The fallback value will be used with `:meth:`GpuProcess.failsafe`` context manager, otherwise + raises an exception when falls. """ def wrapper(func: Callable[..., Any]) -> Callable[..., Any]: @@ -143,12 +139,12 @@ def auto_garbage_clean(fallback=_RAISE): class HostProcess(host.Process, metaclass=ABCMeta): - """Represents an OS process with the given PID. - If PID is omitted current process PID (:func:`os.getpid`) is used. - The instance will be cache during the lifetime of the process. + """Represent an OS process with the given PID. + + If PID is omitted current process PID (:func:`os.getpid`) is used. The instance will be cache + during the lifetime of the process. 
Examples: - >>> HostProcess() # the current process HostProcess(pid=12345, name='python3', status='running', started='00:55:43') @@ -186,8 +182,7 @@ class HostProcess(host.Process, metaclass=ABCMeta): INSTANCES = WeakValueDictionary() def __new__(cls, pid: Optional[int] = None) -> 'HostProcess': - """Returns the cached instance of :class:`HostProcess`.""" - + """Return the cached instance of :class:`HostProcess`.""" if pid is None: pid = os.getpid() @@ -215,7 +210,7 @@ class HostProcess(host.Process, metaclass=ABCMeta): # pylint: disable-next=unused-argument,super-init-not-called def __init__(self, pid: Optional[int] = None) -> None: - pass + """Initialize the instance.""" @property def _gone(self) -> bool: @@ -232,17 +227,20 @@ class HostProcess(host.Process, metaclass=ABCMeta): self._super_gone = value def __str__(self) -> str: + """Return a string representation of the process.""" return super().__str__().replace(self.__class__.__module__ + '.', '', 1) __repr__ = __str__ def __reduce__(self) -> Tuple[Type['HostProcess'], Tuple[int]]: + """Return state information for pickling.""" return self.__class__, (self.pid,) if host.WINDOWS: def username(self) -> str: """The name of the user that owns the process. + On Windows, the domain name will be removed if it is present. Raises: @@ -251,7 +249,6 @@ class HostProcess(host.Process, metaclass=ABCMeta): host.AccessDenied: If the user do not have read privilege to the process' status file. """ - if self._username is None: # pylint: disable=access-member-before-definition self._username = ( # pylint: disable=attribute-defined-outside-init super().username().split('\\')[-1] @@ -262,6 +259,7 @@ class HostProcess(host.Process, metaclass=ABCMeta): def username(self) -> str: """The name of the user that owns the process. + On UNIX this is calculated by using *real* process uid. 
Raises: @@ -270,7 +268,6 @@ class HostProcess(host.Process, metaclass=ABCMeta): host.AccessDenied: If the user do not have read privilege to the process' status file. """ - if self._username is None: # pylint: disable=access-member-before-definition self._username = ( # pylint: disable=attribute-defined-outside-init super().username() @@ -287,14 +284,13 @@ class HostProcess(host.Process, metaclass=ABCMeta): host.AccessDenied: If the user do not have read privilege to the process' status file. """ - cmdline = super().cmdline() if len(cmdline) > 1: cmdline = '\0'.join(cmdline).rstrip('\0').split('\0') return cmdline def command(self) -> str: - """Returns a shell-escaped string from command line arguments. + """Return a shell-escaped string from command line arguments. Raises: host.NoSuchProcess: @@ -302,7 +298,6 @@ class HostProcess(host.Process, metaclass=ABCMeta): host.AccessDenied: If the user do not have read privilege to the process' status file. """ - return command_join(self.cmdline()) @memoize_when_activated @@ -315,7 +310,6 @@ class HostProcess(host.Process, metaclass=ABCMeta): host.AccessDenied: If the user do not have read privilege to the process' status file. """ - return datetime.datetime.now() - datetime.datetime.fromtimestamp(self.create_time()) def running_time_human(self) -> str: @@ -327,7 +321,6 @@ class HostProcess(host.Process, metaclass=ABCMeta): host.AccessDenied: If the user do not have read privilege to the process' status file. """ - return timedelta2human(self.running_time()) def running_time_in_seconds(self) -> float: # in seconds @@ -339,7 +332,6 @@ class HostProcess(host.Process, metaclass=ABCMeta): host.AccessDenied: If the user do not have read privilege to the process' status file. """ - return self.running_time().total_seconds() elapsed_time = running_time @@ -355,11 +347,10 @@ class HostProcess(host.Process, metaclass=ABCMeta): host.AccessDenied: If the user do not have read privilege to the process' status file. 
""" - return self.memory_info().rss def parent(self) -> Union['HostProcess', None]: - """Returns the parent process as a :class:`HostProcess` instance. Returns :data:`None` if there is no parent. + """Return the parent process as a :class:`HostProcess` instance or :data:`None` if there is no parent. Raises: host.NoSuchProcess: @@ -367,7 +358,6 @@ class HostProcess(host.Process, metaclass=ABCMeta): host.AccessDenied: If the user do not have read privilege to the process' status file. """ - parent = super().parent() if parent is not None: return HostProcess(parent.pid) @@ -375,6 +365,7 @@ class HostProcess(host.Process, metaclass=ABCMeta): def children(self, recursive: bool = False) -> List['HostProcess']: """Return the children of this process as a list of :class:`HostProcess` instances. + If *recursive* is :data:`True` return all the descendants. Raises: @@ -383,13 +374,11 @@ class HostProcess(host.Process, metaclass=ABCMeta): host.AccessDenied: If the user do not have read privilege to the process' status file. """ - return [HostProcess(child.pid) for child in super().children(recursive)] @contextlib.contextmanager def oneshot(self): - """Utility context manager which considerably speeds up the retrieval of multiple process - information at the same time. + """A utility context manager which considerably speeds up the retrieval of multiple process information at the same time. Internally different process info (e.g. name, ppid, uids, gids, ...) may be fetched by using the same routine, but only one information is returned and the others are discarded. When @@ -400,7 +389,6 @@ class HostProcess(host.Process, metaclass=ABCMeta): time you retrieve more than one information about the process. Examples: - >>> from nvitop import HostProcess >>> p = HostProcess() >>> with p.oneshot(): @@ -408,8 +396,7 @@ class HostProcess(host.Process, metaclass=ABCMeta): ... p.cpu_times() # return cached value ... p.cpu_percent() # return cached value ... 
p.create_time() # return cached value - """ - + """ # pylint: disable=line-too-long with self._lock: if hasattr(self, '_cache'): yield @@ -427,8 +414,7 @@ class HostProcess(host.Process, metaclass=ABCMeta): def as_snapshot( self, attrs: Optional[Iterable[str]] = None, ad_value: Optional[Any] = None ) -> Snapshot: - """Returns a onetime snapshot of the process.""" - + """Return a onetime snapshot of the process.""" with self.oneshot(): attributes = self.as_dict(attrs=attrs, ad_value=ad_value) @@ -444,11 +430,12 @@ class HostProcess(host.Process, metaclass=ABCMeta): @HostProcess.register class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-public-methods - """Represents a process with the given PID running on the given GPU device. + """Represent a process with the given PID running on the given GPU device. + The instance will be cache during the lifetime of the process. - The same host process can use multiple GPU devices. The :class:`GpuProcess` instances representing the - same PID on the host but different GPU devices are different. + The same host process can use multiple GPU devices. The :class:`GpuProcess` instances + representing the same PID on the host but different GPU devices are different. 
""" INSTANCE_LOCK = threading.RLock() @@ -466,8 +453,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi type: Optional[Union[str, NaType]] = None, # pylint: disable=redefined-builtin # pylint: enable=unused-argument ) -> 'GpuProcess': - """Returns the cached instance of :class:`GpuProcess`.""" - + """Return the cached instance of :class:`GpuProcess`.""" if pid is None: pid = os.getpid() @@ -503,8 +489,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi compute_instance_id: Optional[Union[int, NaType]] = None, type: Optional[Union[str, NaType]] = None, # pylint: disable=redefined-builtin ) -> None: - """Initializes the instance returned by :meth:`__new__()`.""" - + """Initialize the instance returned by :meth:`__new__()`.""" if gpu_memory is None and not hasattr(self, '_gpu_memory'): gpu_memory = NA if gpu_memory is not None: @@ -531,6 +516,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi setattr(self, f'_gpu_{util}_utilization', NA) def __str__(self) -> str: + """Return a string representation of the GPU process.""" return '{}(pid={}, gpu_memory={}, type={}, device={}, host={})'.format( self.__class__.__name__, self.pid, @@ -543,20 +529,19 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi __repr__ = __str__ def __eq__(self, other: object) -> bool: + """Test equality to other object.""" if not isinstance(other, (GpuProcess, host.Process)): return NotImplemented return self._ident == other._ident - def __ne__(self, other: object) -> bool: - return not self == other - def __hash__(self) -> int: + """Return a hash value of the GPU process.""" if self._hash is None: # pylint: disable=access-member-before-definition self._hash = hash(self._ident) # pylint: disable=attribute-defined-outside-init return self._hash def __getattr__(self, name: str) -> Union[Any, Callable[..., Any]]: - """Gets a member from the instance. 
Fallback to the host process instance if missing. + """Get a member from the instance or fallback to the host process instance if missing. Raises: AttributeError: @@ -566,7 +551,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi host.AccessDenied: If the user do not have read privilege to the process' status file. """ - try: return super().__getattr__(name) except AttributeError: @@ -582,74 +566,60 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi @property def pid(self) -> int: """The process PID.""" - return self._pid @property def host(self) -> HostProcess: """The process instance running on the host.""" - return self._host @property def device(self) -> 'Device': """The GPU device the process running on. - The same host process can use multiple GPU devices. - The :class:`GpuProcess` instances representing the same PID on the host - but different GPU devices are different. + The same host process can use multiple GPU devices. The :class:`GpuProcess` instances + representing the same PID on the host but different GPU devices are different. 
""" - return self._device def gpu_instance_id(self) -> Union[int, NaType]: """The GPU instance ID of the MIG device, or :const:`nvitop.NA` if not applicable.""" - return self._gpu_instance_id def compute_instance_id(self) -> Union[int, NaType]: """The compute instance ID of the MIG device, or :const:`nvitop.NA` if not applicable.""" - return self._compute_instance_id def gpu_memory(self) -> Union[int, NaType]: # in bytes """The used GPU memory in bytes, or :const:`nvitop.NA` if not applicable.""" - return self._gpu_memory def gpu_memory_human(self) -> Union[str, NaType]: # in human readable """The used GPU memory in human readable format, or :const:`nvitop.NA` if not applicable.""" - return self._gpu_memory_human def gpu_memory_percent(self) -> Union[float, NaType]: # in percentage """The percentage of used GPU memory by the process, or :const:`nvitop.NA` if not applicable.""" - return self._gpu_memory_percent def gpu_sm_utilization(self) -> Union[int, NaType]: # in percentage """The utilization rate of SM (Streaming Multiprocessor), or :const:`nvitop.NA` if not applicable.""" - return self._gpu_sm_utilization def gpu_memory_utilization(self) -> Union[int, NaType]: # in percentage """The utilization rate of GPU memory bandwidth, or :const:`nvitop.NA` if not applicable.""" - return self._gpu_memory_utilization def gpu_encoder_utilization(self) -> Union[int, NaType]: # in percentage """The utilization rate of the encoder, or :const:`nvitop.NA` if not applicable.""" - return self._gpu_encoder_utilization def gpu_decoder_utilization(self) -> Union[int, NaType]: # in percentage """The utilization rate of the decoder, or :const:`nvitop.NA` if not applicable.""" - return self._gpu_decoder_utilization def set_gpu_memory(self, value: Union[int, NaType]) -> None: - """Sets the used GPU memory in bytes.""" - + """Set the used GPU memory in bytes.""" # pylint: disable=attribute-defined-outside-init self._gpu_memory = memory_used = value self._gpu_memory_human = 
bytes2human(self.gpu_memory()) @@ -666,8 +636,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi gpu_encoder_utilization: Optional[int] = None, gpu_decoder_utilization: Optional[int] = None, ) -> None: - """Sets the GPU utilization rates.""" - + """Set the GPU utilization rates.""" # pylint: disable=attribute-defined-outside-init if gpu_sm_utilization is not None: self._gpu_sm_utilization = gpu_sm_utilization @@ -679,8 +648,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi self._gpu_decoder_utilization = gpu_decoder_utilization def update_gpu_status(self) -> Union[int, NaType]: - """Updates the GPU consumption status from a new NVML query.""" - + """Update the GPU consumption status from a new NVML query.""" self.set_gpu_memory(NA) self.set_gpu_utilization(NA, NA, NA, NA) self.device.processes.cache_clear() @@ -697,7 +665,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi - :data:`'C+G'`: both compute context and graphics context - :data:`'N/A'`: not applicable """ - return self._type @type.setter @@ -713,8 +680,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi @auto_garbage_clean(fallback=False) def is_running(self) -> bool: - """Returns whether this process is running.""" - + """Return whether this process is running.""" return self.host.is_running() @auto_garbage_clean(fallback='terminated') @@ -731,7 +697,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. 
""" - return self.host.status() @auto_garbage_clean(fallback=NA) @@ -748,7 +713,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. """ - return self.host.create_time() @auto_garbage_clean(fallback=NA) @@ -765,7 +729,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. """ - return self.host.running_time() def running_time_human(self) -> Union[str, NaType]: @@ -781,7 +744,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. """ - return timedelta2human(self.running_time()) def running_time_in_seconds(self) -> Union[float, NaType]: @@ -797,7 +759,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. """ - running_time = self.running_time() if running_time is NA: return NA @@ -821,7 +782,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. 
""" - if self._username is None: # pylint: disable=access-member-before-definition self._username = self.host.username() # pylint: disable=attribute-defined-outside-init return self._username @@ -840,12 +800,11 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. """ - return self.host.name() @auto_garbage_clean(fallback=NA) def cpu_percent(self) -> Union[float, NaType]: # in percentage - """Returns a float representing the current process CPU utilization as a percentage. + """Return a float representing the current process CPU utilization as a percentage. Raises: host.NoSuchProcess: @@ -857,13 +816,11 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. """ - return self.host.cpu_percent() @auto_garbage_clean(fallback=NA) def memory_percent(self) -> Union[float, NaType]: # in percentage - """Compares process RSS memory to total physical system memory - and calculate process memory utilization as a percentage. + """Compare process RSS memory to total physical system memory and calculate process memory utilization as a percentage. Raises: host.NoSuchProcess: @@ -874,8 +831,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi Note: To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. 
- """ - + """ # pylint: disable=line-too-long return self.host.memory_percent() host_memory_percent = memory_percent # in percentage @@ -894,7 +850,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. """ - return self.host.rss_memory() def host_memory_human(self) -> Union[str, NaType]: @@ -910,7 +865,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. """ - return bytes2human(self.host_memory()) rss_memory = host_memory # in bytes @@ -930,14 +884,13 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. """ - cmdline = self.host.cmdline() if len(cmdline) == 0 and not self._gone: cmdline = ['Zombie Process'] return cmdline def command(self) -> str: - """Returns a shell-escaped string from command line arguments. + """Return a shell-escaped string from command line arguments. Raises: host.NoSuchProcess: @@ -949,13 +902,11 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi To return the fallback value rather than raise an exception, please use the context manager :meth:`GpuProcess.failsafe`. See also :meth:`take_snapshots` and :meth:`failsafe`. 
""" - return command_join(self.cmdline()) @auto_garbage_clean(fallback=_RAISE) def host_snapshot(self) -> Snapshot: - """Returns a onetime snapshot of the host process.""" - + """Return a onetime snapshot of the host process.""" with self.host.oneshot(): host_snapshot = Snapshot( real=self.host, @@ -980,7 +931,7 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi def as_snapshot( self, *, host_process_snapshot_cache: Optional[Dict[int, Snapshot]] = None ) -> Snapshot: - """Returns a onetime snapshot of the process on the GPU device. + """Return a onetime snapshot of the process on the GPU device. Note: To return the fallback value rather than raise an exception, please use the context @@ -988,7 +939,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi snapshots with :meth:`GpuProcess.take_snapshots`, which caches the results and reduces redundant queries. See also :meth:`take_snapshots` and :meth:`failsafe`. """ - host_process_snapshot_cache = host_process_snapshot_cache or {} try: host_snapshot = host_process_snapshot_cache[self.pid] @@ -1031,12 +981,11 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi def take_snapshots( # batched version of `as_snapshot` cls, gpu_processes: Iterable['GpuProcess'], *, failsafe=False ) -> List[Snapshot]: - """Takes snapshots for a list of :class:`GpuProcess` instances. + """Take snapshots for a list of :class:`GpuProcess` instances. If *failsafe* is :data:`True`, then if any method fails, the fallback value in :func:`auto_garbage_clean` will be used. """ - cache = {} context = cls.failsafe if failsafe else contextlib.nullcontext with context(): @@ -1052,7 +1001,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi """A context manager that enables fallback values for methods that fail. 
Examples: - >>> p = GpuProcess(pid=10000, device=Device(0)) # process does not exist >>> p GpuProcess(pid=10000, gpu_memory=N/A, type=N/A, device=PhysicalDevice(index=0, name="NVIDIA GeForce RTX 3070", total_memory=8192MiB), host=HostProcess(pid=10000, status='terminated')) @@ -1070,7 +1018,6 @@ class GpuProcess: # pylint: disable=too-many-instance-attributes,too-many-publi fallback (float cast): nan fallback (int cast): 0 """ # pylint: disable=line-too-long - global _USE_FALLBACK_WHEN_RAISE # pylint: disable=global-statement,global-variable-not-assigned prev_value = getattr(_USE_FALLBACK_WHEN_RAISE, 'value', False) diff --git a/nvitop/core/utils.py b/nvitop/core/utils.py index cc73439..c039b21 100644 --- a/nvitop/core/utils.py +++ b/nvitop/core/utils.py @@ -76,8 +76,7 @@ COLOR = sys.stdout.isatty() def set_color(value: bool) -> None: - """Force enables text coloring.""" - + """Force enable text coloring.""" global COLOR # pylint: disable=global-statement COLOR = bool(value) @@ -88,7 +87,7 @@ def colored( on_color: Optional[str] = None, attrs: Iterable[str] = None, ) -> str: - """Colorizes text. + """Colorize text with ANSI color escape codes. Available text colors: red, green, yellow, blue, magenta, cyan, white. @@ -100,11 +99,9 @@ def colored( bold, dark, underline, blink, reverse, concealed. Examples: - >>> colored('Hello, World!', 'red', 'on_grey', ['blue', 'blink']) >>> colored('Hello, World!', 'green') """ - if COLOR: return _colored(text, color=color, on_color=on_color, attrs=attrs) return text @@ -114,11 +111,10 @@ class NaType(str): """A singleton (:const:`str: 'N/A'`) class represents a not applicable value. The :const:`NA` instance behaves like a :class:`str` instance (:const:`'N/A'`) when doing string - manipulation (e.g. concatenation). For arithmetic operations, for example :code:`NA / 1024 / 1024`, + manipulation (e.g. concatenation). For arithmetic operations, for example ``NA / 1024 / 1024``, it acts like the :data:`math.nan`. 
Examples: - >>> NA 'N/A' @@ -142,45 +138,42 @@ class NaType(str): """ def __new__(cls) -> 'NaType': - """Gets the singleton instance (:const:`nvitop.NA`).""" - + """Get the singleton instance (:const:`nvitop.NA`).""" if not hasattr(cls, '_instance'): cls._instance = super().__new__(cls, 'N/A') return cls._instance def __bool__(self) -> bool: - """Converts :const:`NA` to :class:`bool`. Returns :data:`False`. + """Convert :const:`NA` to :class:`bool` and return :data:`False`. >>> bool(NA) False """ - return False def __int__(self) -> int: - """Converts :const:`NA` to :class:`int`. Returns :const:`0`. + """Convert :const:`NA` to :class:`int` and return :const:`0`. >>> int(NA) 0 """ - return 0 def __float__(self) -> float: - """Converts :const:`NA` to :class:`float`. Returns :data:`math.nan`. + """Convert :const:`NA` to :class:`float` and return :data:`math.nan`. >>> float(NA) nan >>> float(NA) is math.nan True """ - return math.nan def __add__(self, other: object) -> Union[str, float]: - """:const:`nvitop.NA` + other: Returns :data:`math.nan` if the operand is a number or uses - string concatenation if the operand is a string. A special case is when the operand is - :const:`nvitop.NA` itself, the result is :data:`math.nan` instead of :const:`'N/AN/A'`. + """Return :data:`math.nan` if the operand is a number or uses string concatenation if the operand is a string (``NA + other``). + + A special case is when the operand is :const:`nvitop.NA` itself, the result is + :data:`math.nan` instead of :const:`'N/AN/A'`. >>> NA + ' str' 'N/A str' @@ -190,14 +183,13 @@ class NaType(str): nan >>> NA + 1.0 nan - """ - + """ # pylint: disable=line-too-long if isinstance(other, (int, float)) or other is NA: return float(self) + other return super().__add__(other) def __radd__(self, other: object) -> Union[str, float]: - """other + :const:`nvitop.NA`: Returns :data:`math.nan` if the operand is a number. 
+ """Return :data:`math.nan` if the operand is a number or uses string concatenation if the operand is a string (``other + NA``). >>> 'str' + NA 'strN/A' @@ -205,14 +197,13 @@ class NaType(str): nan >>> 1.0 + NA nan - """ - + """ # pylint: disable=line-too-long if isinstance(other, (int, float)): return other + float(self) return NotImplemented def __sub__(self, other: object) -> float: - """:const:`nvitop.NA` - other: Returns :data:`math.nan` if the operand is a number. + """Return :data:`math.nan` if the operand is a number (``NA - other``). >>> NA - 'str' TypeError: unsupported operand type(s) for -: 'NaType' and 'str' @@ -223,13 +214,12 @@ class NaType(str): >>> NA + 1.0 nan """ - if isinstance(other, (int, float)) or other is NA: return float(self) - other return NotImplemented def __rsub__(self, other: object) -> float: - """other - :const:`nvitop.NA`: Returns :data:`math.nan` if the operand is a number. + """Return :data:`math.nan` if the operand is a number (``other - NA``). >>> 'str' - NA TypeError: unsupported operand type(s) for -: 'str' and 'NaType' @@ -238,14 +228,14 @@ class NaType(str): >>> 1.0 - NA nan """ - if isinstance(other, (int, float)): return other - float(self) return NotImplemented def __mul__(self, other: object) -> float: - """:const:`nvitop.NA` * other: Returns :data:`math.nan` if the operand is a number. A special - case is when the operand is :const:`nvitop.NA` itself, the result is also :data:`math.nan`. + """Return :data:`math.nan` if the operand is a number (``NA * other``). + + A special case is when the operand is :const:`nvitop.NA` itself, the result is also :data:`math.nan`. >>> NA * 1024 nan @@ -254,26 +244,24 @@ class NaType(str): >>> NA * NA nan """ - if isinstance(other, (int, float)) or other is NA: return float(self) * other return NotImplemented def __rmul__(self, other: object) -> float: - """other * :const:`nvitop.NA`: Returns :data:`math.nan` if the operand is a number. 
+ """Return :data:`math.nan` if the operand is a number (``other * NA``). >>> 1024 * NA nan >>> 1024.0 * NA nan """ - if isinstance(other, (int, float)): return other * float(self) return NotImplemented def __truediv__(self, other: object) -> float: - """:const:`nvitop.NA` / other: Returns :data:`math.nan` if the operand is a number. + """Return :data:`math.nan` if the operand is a number (``NA / other``). >>> NA / 1024 nan @@ -284,26 +272,24 @@ class NaType(str): >>> NA / 0.0 ZeroDivisionError: float division by zero """ - if isinstance(other, (int, float)): return float(self) / other return NotImplemented def __rtruediv__(self, other: object) -> float: - """other / :const:`nvitop.NA`: Returns :data:`math.nan` if the operand is a number. + """Return :data:`math.nan` if the operand is a number (``other / NA``). >>> 1024 / NA nan >>> 1024.0 / NA nan """ - if isinstance(other, (int, float)): return other / float(self) return NotImplemented def __floordiv__(self, other: object) -> float: - """:const:`nvitop.NA` // other: Returns :data:`math.nan` if the operand is a number. + """Return :data:`math.nan` if the operand is a number (``NA // other``). >>> NA // 1024 nan @@ -314,26 +300,24 @@ class NaType(str): >>> NA / 0.0 ZeroDivisionError: float division by zero """ - if isinstance(other, (int, float)): return float(self) // other return NotImplemented def __rfloordiv__(self, other: object) -> float: - """other // :const:`nvitop.NA`: Returns :data:`math.nan` if the operand is a number. + """Return :data:`math.nan` if the operand is a number (``other // NA``). >>> 1024 // NA nan >>> 1024.0 // NA nan """ - if isinstance(other, (int, float)): return other // float(self) return NotImplemented def __mod__(self, other: object) -> float: - """:const:`nvitop.NA` % other: Returns :data:`math.nan` if the operand is a number. + """Return :data:`math.nan` if the operand is a number (``NA % other``). 
>>> NA % 1024 nan @@ -344,26 +328,24 @@ class NaType(str): >>> NA % 0.0 ZeroDivisionError: float modulo """ - if isinstance(other, (int, float)): return float(self) % other return NotImplemented def __rmod__(self, other: object) -> float: - """other % :const:`nvitop.NA`: Returns :data:`math.nan` if the operand is a number. + """Return :data:`math.nan` if the operand is a number (``other % NA``). >>> 1024 % NA nan >>> 1024.0 % NA nan """ - if isinstance(other, (int, float)): return other % float(self) return NotImplemented def __divmod__(self, other: object) -> Tuple[float, float]: - """divmod(:const:`nvitop.NA`, other): The pair (:const:`nvitop.NA` // other, :const:`nvitop.NA` % other). + """The pair ``(NA // other, NA % other)`` (``divmod(NA, other)``). >>> divmod(NA, 1024) (nan, nan) @@ -374,49 +356,44 @@ class NaType(str): >>> divmod(NA, 0.0) ZeroDivisionError: float floor division by zero """ - return (self // other, self % other) def __rdivmod__(self, other: object) -> Tuple[float, float]: - """divmod(other, :const:`nvitop.NA`): The pair (other // :const:`nvitop.NA`, other % :const:`nvitop.NA`). + """The pair ``(other // NA, other % NA)`` (``divmod(other, NA)``). >>> divmod(1024, NA) (nan, nan) >>> divmod(1024.0, NA) (nan, nan) """ - return (other // self, other % self) def __pos__(self) -> float: - """+:const:`nvitop.NA`: Returns :data:`math.nan`. + """Return :data:`math.nan` (``+NA``). >>> +NA nan """ - return +float(self) def __neg__(self) -> float: - """+:const:`nvitop.NA`: Returns :data:`math.nan`. + """Return :data:`math.nan` (``-NA``). >>> -NA nan """ - return -float(self) def __abs__(self) -> float: - """abs(NA): Returns :data:`math.nan`. + """Return :data:`math.nan` (``abs(NA)``). >>> abs(NA) nan """ - return abs(float(self)) def __round__(self, ndigits: Optional[int] = None) -> Union[int, float]: - """Rounds :const:`nvitop.NA` to ``ndigits`` decimal places, defaulting to :const:`0`. 
+ """Round :const:`nvitop.NA` to ``ndigits`` decimal places, defaulting to :const:`0`. If ``ndigits`` is omitted or :data:`None`, returns :const:`0`, otherwise returns :data:`math.nan`. @@ -427,40 +404,36 @@ class NaType(str): >>> round(NA, 1) nan """ - if ndigits is None: return int(self) return round(float(self), ndigits) def __lt__(self, x: object) -> bool: - """The :const:`nvitop.NA` is always greater than any number. Use the dictionary order for string.""" - + """The :const:`nvitop.NA` is always greater than any number, or uses the dictionary order for string.""" if isinstance(x, (int, float)): return False return super().__lt__(x) def __le__(self, x: object) -> bool: - """The :const:`nvitop.NA` is always greater than any number. Use the dictionary order for string.""" - + """The :const:`nvitop.NA` is always greater than any number, or uses the dictionary order for string.""" if isinstance(x, (int, float)): return False return super().__le__(x) def __gt__(self, x: object) -> bool: - """The :const:`nvitop.NA` is always greater than any number. Use the dictionary order for string.""" - + """The :const:`nvitop.NA` is always greater than any number, or uses the dictionary order for string.""" if isinstance(x, (int, float)): return True return super().__gt__(x) def __ge__(self, x: object) -> bool: - """The :const:`nvitop.NA` is always greater than any number. 
Use the dictionary order for string.""" - + """The :const:`nvitop.NA` is always greater than any number, or uses the dictionary order for string.""" if isinstance(x, (int, float)): return True return super().__ge__(x) def __format__(self, format_spec: str) -> str: + """Format :const:`nvitop.NA` according to ``format_spec``.""" try: return super().__format__(format_spec) except ValueError: @@ -515,8 +488,7 @@ SIZE_PATTERN = re.compile( def bytes2human(b: Union[int, float, NaType]) -> str: # pylint: disable=too-many-return-statements - """Converts bytes to a human readable string.""" - + """Convert bytes to a human readable string.""" if b == NA: return NA @@ -546,14 +518,13 @@ def bytes2human(b: Union[int, float, NaType]) -> str: # pylint: disable=too-man def human2bytes(s: Union[int, str]) -> int: - """Converts a human readable size string (*case insensitive*) to bytes. + """Convert a human readable size string (*case insensitive*) to bytes. Raises: ValueError: If cannot convert the given size string. 
Examples: - >>> human2bytes('500B') 500 >>> human2bytes('10k') @@ -567,7 +538,6 @@ def human2bytes(s: Union[int, str]) -> int: >>> human2bytes('1.5GiB') 1610612736 """ - if isinstance(s, int): if s >= 0: return s @@ -582,8 +552,7 @@ def human2bytes(s: Union[int, str]) -> int: def timedelta2human(dt: Union[int, float, datetime.timedelta, NaType]) -> str: - """Converts a number in seconds or a :class:`datetime.timedelta` instance to a human readable string.""" - + """Convert a number in seconds or a :class:`datetime.timedelta` instance to a human readable string.""" if isinstance(dt, (int, float)): dt = datetime.timedelta(seconds=dt) @@ -600,8 +569,7 @@ def timedelta2human(dt: Union[int, float, datetime.timedelta, NaType]) -> str: def utilization2string(utilization: Union[int, float, NaType]) -> str: - """Converts a utilization rate to string.""" - + """Convert a utilization rate to string.""" if utilization != NA: if isinstance(utilization, int): return f'{utilization}%' @@ -611,8 +579,7 @@ def utilization2string(utilization: Union[int, float, NaType]) -> str: def boolify(string: str, default: Any = None) -> bool: - """Converts the given value, usually a string, to boolean.""" - + """Convert the given value, usually a string, to boolean.""" if string.lower() in ('true', 'yes', 'on', 'enabled', '1'): return True if string.lower() in ('false', 'no', 'off', 'disabled', '0'): @@ -624,6 +591,7 @@ def boolify(string: str, default: Any = None) -> bool: class Snapshot: """A dict-like object holds the snapshot values. + The value can be accessed by ``snapshot.name`` or ``snapshot['name']`` syntax. The Snapshot can also be converted to a dictionary by ``dict(snapshot)`` or ``{**snapshot}``. 
@@ -631,12 +599,14 @@ class Snapshot: """ def __init__(self, real: Any, **items) -> None: + """Initialize a new :class:`Snapshot` object with the given attributes.""" self.real = real self.timestamp = time.time() for key, value in items.items(): setattr(self, key, value) def __str__(self) -> str: + """Return a string representation of the snapshot.""" keys = set(self.__dict__.keys()).difference({'real', 'timestamp'}) keys = ['real', *sorted(keys)] keyvals = [] @@ -653,13 +623,14 @@ class Snapshot: __repr__ = __str__ def __hash__(self) -> int: + """Return a hash value of the snapshot.""" return hash((self.real, self.timestamp)) def __getattr__(self, name: str) -> Any: - """Gets a member from the instance. + """Get a member from the instance. + If the attribute is not defined, fetches from the original object and makes a function call. """ - try: return super().__getattr__(name) except AttributeError: @@ -671,20 +642,18 @@ class Snapshot: return attribute def __getitem__(self, name: str) -> Any: - """Supports ``snapshot['name']`` syntax.""" - + """Support ``snapshot['name']`` syntax.""" try: return getattr(self, name) except AttributeError as ex: raise KeyError(name) from ex def __setitem__(self, name: str, value: Any) -> None: - """Supports ``snapshot['name'] = value`` syntax.""" - + """Support ``snapshot['name'] = value`` syntax.""" setattr(self, name, value) def __iter__(self) -> Iterable[str]: - """Supports ``for name in snapshot`` syntax and ``*`` tuple unpack ``[*snapshot]`` syntax.""" + """Support ``for name in snapshot`` syntax and ``*`` tuple unpack ``[*snapshot]`` syntax.""" def gen() -> str: for name in self.__dict__: @@ -694,18 +663,17 @@ class Snapshot: return gen() def keys(self) -> Iterable[str]: - """Supports `**`` dictionary unpack ``{**snapshot}`` / ``dict(**snapshot)`` syntax and - ``dict(snapshot)`` dictionary conversion. 
- """ - + # pylint: disable-next=line-too-long + """Support ``**`` dictionary unpack ``{**snapshot}`` / ``dict(**snapshot)`` syntax and ``dict(snapshot)`` dictionary conversion.""" return iter(self) # Modified from psutil (https://github.com/giampaolo/psutil) def memoize_when_activated(method: Callable[[Any], Any]) -> Callable[[Any], Any]: - """A memoize decorator which is disabled by default. It can be activated and - deactivated on request. For efficiency reasons it can be used only against - class methods accepting no arguments. + """A memoize decorator which is disabled by default. + + It can be activated and deactivated on request. For efficiency reasons it can be used only + against class methods accepting no arguments. """ @functools.wraps(method) @@ -729,10 +697,10 @@ def memoize_when_activated(method: Callable[[Any], Any]) -> Callable[[Any], Any] return ret def cache_activate(self): - """Activate cache. Expects a Process instance. Cache will be stored as - a "_cache" instance attribute. - """ + """Activate cache. + Expects an instance. Cache will be stored as a "_cache" instance attribute. + """ if not hasattr(self, '_cache'): setattr(self, '_cache', {}) diff --git a/nvitop/select.py b/nvitop/select.py index 4a3f442..3dfe317 100644 --- a/nvitop/select.py +++ b/nvitop/select.py @@ -54,8 +54,6 @@ Python API: ) """ # pylint: disable=line-too-long -# pylint: disable=missing-function-docstring - import argparse import getpass import math @@ -95,15 +93,13 @@ def select_devices( sort: bool = True, **kwargs, # fmt: skip # pylint: disable=unused-argument ) -> Union[List[int], List[Tuple[int, int]], List[str]]: - """Selected a subset of devices satisfying the specified criteria. Returns a list of the device - identifiers. + """Select a subset of devices satisfying the specified criteria. Note: The *min count* constraint may not be satisfied if the no enough devices are available. This constraint is only enforced when there are both MIG and non-MIG devices present.
Examples: - Put the following lines to the top of your script: .. code-block:: python @@ -144,8 +140,10 @@ def select_devices( A list of accounts whose used GPU memory needs be considered as free memory. sort (bool): If :data:`True`, sort the selected devices by memory usage and GPU utilization. - """ + Returns: + A list of the device identifiers. + """ assert format in ('index', 'uuid', 'device') assert tolerance >= 0 tolerance = tolerance / 100.0 @@ -274,6 +272,8 @@ def select_devices( def parse_arguments(): # pylint: disable=too-many-branches,too-many-statements + """Parse command-line arguments for ``nvisel``.""" + def non_negint(argstring): num = int(argstring) if num < 0: @@ -490,6 +490,7 @@ def parse_arguments(): # pylint: disable=too-many-branches,too-many-statements def main(): + """Main function for ``nvisel`` CLI.""" args = parse_arguments() try: diff --git a/setup.py b/setup.py index 713cd18..eae2d57 100755 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ # pip install 'nvitop[pynvml-xx.yyy.zz]' # -# pylint: disable=missing-module-docstring +"""Setup script for ``nvitop``.""" import pathlib import re