diff --git a/README.md b/README.md index d266bc9..f0f78c8 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ An interactive NVIDIA-GPU process viewer, the one-stop solution for GPU process - [For SSH Users](#for-ssh-users) - [Command Line Options and Environment Variables](#command-line-options-and-environment-variables) - [Keybindings for Monitor Mode](#keybindings-for-monitor-mode) + - [CUDA Visible Devices Selection Tool](#cuda-visible-devices-selection-tool) - [Callback Functions for Machine Learning Frameworks](#callback-functions-for-machine-learning-frameworks) - [Callback for TensorFlow (Keras)](#callback-for-tensorflow-keras) - [Callback for PyTorch Lightning](#callback-for-pytorch-lightning) @@ -399,6 +400,117 @@ echo 'set -gx NVITOP_MONITOR_MODE "full"' >> ~/.config/fish/config.fish ------ +### CUDA Visible Devices Selection Tool + +Automatically select `CUDA_VISIBLE_DEVICES` from the given criteria. Example usage of the CLI tool: + +```console +# All devices but sorted +$ nvisel # or use `python3 -m nvitop.select` +6,5,4,3,2,1,0,7,8 + +# A simple example to select 4 devices +$ nvisel -n 4 # or use `python3 -m nvitop.select -n 4` +6,5,4,3 + +# Select available devices that satisfy the given constraints +$ nvisel --min-count 2 --max-count 3 --min-free-memory 5GiB --max-gpu-utilization 60 +6,5,4 + +# Set `CUDA_VISIBLE_DEVICES` environment variable using `nvisel` +$ export CUDA_DEVICE_ORDER="PCI_BUS_ID" CUDA_VISIBLE_DEVICES="$(nvisel -c 1 -f 10GiB)" +CUDA_VISIBLE_DEVICES="6,5,4,3,2,1,0" + +# Use UUID strings in `CUDA_VISIBLE_DEVICES` environment variable +$ export CUDA_VISIBLE_DEVICES="$(nvisel -O uuid -c 2 -f 5000M)" +CUDA_VISIBLE_DEVICES="GPU-849d5a8d-610e-eeea-1fd4-81ff44a23794,GPU-18ef14e9-dec6-1d7e-1284-3010c6ce98b1,GPU-96de99c9-d68f-84c8-424c-7c75e59cc0a0,GPU-2428d171-8684-5b64-830c-435cd972ec4a,GPU-6d2a57c9-7783-44bb-9f53-13f36282830a,GPU-f8e5a624-2c7e-417c-e647-b764d26d4733,GPU-f9ca790e-683e-3d56-00ba-8f654e977e02" + +# Pipe output to other 
shell utilities +$ nvisel -0 -O uuid -c 2 -f 4GiB | xargs -0 -I {} nvidia-smi --id={} --query-gpu=index,memory.free --format=csv +CUDA_VISIBLE_DEVICES="GPU-849d5a8d-610e-eeea-1fd4-81ff44a23794,GPU-18ef14e9-dec6-1d7e-1284-3010c6ce98b1,GPU-96de99c9-d68f-84c8-424c-7c75e59cc0a0,GPU-2428d171-8684-5b64-830c-435cd972ec4a,GPU-6d2a57c9-7783-44bb-9f53-13f36282830a,GPU-f8e5a624-2c7e-417c-e647-b764d26d4733,GPU-f9ca790e-683e-3d56-00ba-8f654e977e02" +index, memory.free [MiB] +6, 11018 MiB +index, memory.free [MiB] +5, 11018 MiB +index, memory.free [MiB] +4, 11018 MiB +index, memory.free [MiB] +3, 11018 MiB +index, memory.free [MiB] +2, 11018 MiB +index, memory.free [MiB] +1, 11018 MiB +index, memory.free [MiB] +0, 11018 MiB +``` + +You can also integrate `nvisel` into your training script like this: + +```python +# Put this at the top of the Python script +import os +from nvitop import select_devices + +os.environ['CUDA_VISIBLE_DEVICES'] = ','.join( + select_devices(format='uuid', min_count=4, min_free_memory='8GiB') +) +``` + +Type `nvisel --help` for more command options: + +```text +usage: nvisel [--help] [--version] [--inherit] [--account-as-free [USERNAME ...]] + [--min-count N] [--max-count N] [--count N] + [--min-free-memory SIZE] [--min-total-memory SIZE] + [--max-gpu-utilization RATE] [--max-memory-utilization RATE] + [--tolerance TOL] [--format FORMAT] [--sep SEP | --newline | --null] + +CUDA visible devices selection tool. + +optional arguments: + --help, -h Show this help message and exit. + --version, -V Show nvisel's version number and exit. + +constraints: + --inherit Inherit the current `CUDA_VISIBLE_DEVICES` environment variable. + This means selecting a subset of the currently CUDA-visible devices. + --account-as-free [USERNAME ...] + Account the used GPU memory of the given users as free memory. + If this option is specified but without argument, `$USER` will be used. + --min-count N, -c N Minimum number of devices to select. 
(default: 0) + The tool will fail (exit non-zero) if the requested resource is not available. + --max-count N, -C N Maximum number of devices to select. (default: all devices) + --count N, -n N Overriding both `--min-count N` and `--max-count N`. + --min-free-memory SIZE, -f SIZE + Minimum free memory of devices to select. (example value: 4GiB) + If this constraint is given, check against all devices. + --min-total-memory SIZE, -t SIZE + Minimum total memory of devices to select. (example value: 10GiB) + If this constraint is given, check against all devices. + --max-gpu-utilization RATE, -G RATE + Maximum GPU utilization rate of devices to select. (example value: 30) + If this constraint is given, check against all devices. + --max-memory-utilization RATE, -M RATE + Maximum memory bandwidth utilization rate of devices to select. (example value: 50) + If this constraint is given, check against all devices. + --tolerance TOL, --tol TOL + The constraints tolerance (in percentage). (default: 0, i.e., strict) + This option can loose the constraints if the requested resource is not available. + For example, set `--tolerance=20` will accept a device with only 4GiB of free + memory when set `--min-free-memory=5GiB`. + +formatting: + --format FORMAT, -O FORMAT + The output format of the selected device identifiers. (default: index) + If any MIG device found, the output format will be fallback to `uuid`. + --sep SEP, --separator SEP, -s SEP + Separator for the output. (default: ',') + --newline Use newline character as separator for the output, equivalent to `--sep=$'\n'`. + --null, -0 Use null character ('\x00') as separator for the output, equivalent to `--sep=$'\0'`. +``` + +------ + ### Callback Functions for Machine Learning Frameworks `nvitop` provides two builtin callbacks for [TensorFlow (Keras)](https://www.tensorflow.org) and [PyTorch Lightning](https://pytorchlightning.ai). 
diff --git a/nvisel.py b/nvisel.py new file mode 100755 index 0000000..b9d446e --- /dev/null +++ b/nvisel.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. +# License: GNU GPL version 3. + +"""CUDA visible devices selection tool. + +Usage: + +.. code-block:: bash + + # All devices but sorted + nvisel # or use `python3 -m nvitop.select` + + # A simple example to select 4 devices + nvisel -n 4 # or use `python3 -m nvitop.select -n 4` + + # Select available devices that satisfy the given constraints + nvisel --min-count 2 --max-count 3 --min-free-memory 5GiB --max-gpu-utilization 60 + + # Set `CUDA_VISIBLE_DEVICES` environment variable using `nvisel` + export CUDA_DEVICE_ORDER="PCI_BUS_ID" CUDA_VISIBLE_DEVICES="$(nvisel -c 1 -f 10GiB)" + + # Use UUID strings in `CUDA_VISIBLE_DEVICES` environment variable + export CUDA_VISIBLE_DEVICES="$(nvisel -O uuid -c 2 -f 5000M)" + + # Pipe output to other shell utilities + nvisel -0 -O uuid -c 2 -f 4GiB | xargs -0 -I {} nvidia-smi --id={} --query-gpu=index,memory.free --format=csv +""" + +import sys + +from nvitop.select import main # pylint: disable=no-name-in-module + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/nvitop/__init__.py b/nvitop/__init__.py index 5a9ae27..2797da3 100644 --- a/nvitop/__init__.py +++ b/nvitop/__init__.py @@ -5,7 +5,8 @@ from nvitop import core from nvitop.core import * +from nvitop.select import select_devices from nvitop.version import __version__ -__all__ = core.__all__.copy() +__all__ = ['select_devices'] + core.__all__ diff --git a/nvitop/core/device.py b/nvitop/core/device.py index c2fb314..c87c22b 100644 --- a/nvitop/core/device.py +++ b/nvitop/core/device.py @@ -2334,8 +2334,3 @@ def parse_cuda_visible_devices_to_uuids( if isinstance(result, Exception): raise result return result - - -if __name__ == '__main__': - for cuda_device in CudaDevice.all(): - print(cuda_device.uuid()) diff --git 
def set_color(value: bool) -> None:
    """Overrides the module-level ``COLOR`` switch.

    ``COLOR`` is initialised from ``sys.stdout.isatty()``; calling this function replaces that
    value with the truthiness of ``value``.

    Args:
        value (bool):
            Any object; it is coerced with :func:`bool` before being stored.
    """

    global COLOR  # pylint: disable=global-statement
    enabled = bool(value)
    COLOR = enabled
+ + >>> int(NA) + 0 + """ return 0 - def __float__(self): - """``float(NA)`` -> :data:`math.nan`""" + def __float__(self) -> float: + """Converts :const:`NA` to :class:`float`. + + >>> float(NA) + nan + >>> float(NA) is math.nan + True + """ return math.nan - def __lt__(self, x): + def __lt__(self, x: object) -> bool: """The :const:`nvitop.NA` is always greater than any number. Use the dictionary order for string.""" if isinstance(x, (int, float)): return False return super().__lt__(x) - def __le__(self, x): + def __le__(self, x: object) -> bool: """The :const:`nvitop.NA` is always greater than any number. Use the dictionary order for string.""" if isinstance(x, (int, float)): return False return super().__le__(x) - def __gt__(self, x): + def __gt__(self, x: object) -> bool: """The :const:`nvitop.NA` is always greater than any number. Use the dictionary order for string.""" if isinstance(x, (int, float)): return True return super().__gt__(x) - def __ge__(self, x): + def __ge__(self, x: object) -> bool: """The :const:`nvitop.NA` is always greater than any number. 
SIZE_UNITS = {
    None: 1,
    '': 1,
    'B': 1,
    'KiB': KiB,
    'MiB': MiB,
    'GiB': GiB,
    'TiB': TiB,
    'PiB': PiB,
    'KB': 1000,
    'MB': 1000**2,
    'GB': 1000**3,
    'TB': 1000**4,
    'PB': 1000**5,  # peta = 10^15 (was mistakenly equal to the tera multiplier)
}
"""Units of storage and memory measurements."""
SIZE_PATTERN = re.compile(
    r'^\s*\+?\s*(?P<size>\d+(?:\.\d+)?)\s*(?P<unit>[KMGTP]i?B?|B?)\s*$', flags=re.IGNORECASE
)
"""The regex pattern for human readable size."""


def bytes2human(b: Union[int, float, NaType]) -> str:  # pylint: disable=too-many-return-statements
    """Converts bytes to a human readable string.

    The unit is chosen by magnitude: ``B`` below 1 KiB, ``KiB`` below 1 MiB, ``MiB`` up to and
    including 20 GiB, then ``GiB`` / ``TiB`` / ``PiB`` with two decimals below 100 units and one
    decimal otherwise.

    Args:
        b (Union[int, float, NaType]):
            The size in bytes. Non-integer values are rounded to the nearest integer first.

    Returns:
        The formatted size, or :const:`NA` if ``b`` is :const:`NA` or cannot be converted to a
        finite number (e.g. :data:`None`, ``nan``, ``inf`` or a non-numeric string).
    """

    if b == NA:
        return NA

    if not isinstance(b, int):
        try:
            b = round(float(b))
        except (TypeError, ValueError, OverflowError):
            # TypeError:     `None` and other non-numeric objects (previously handled explicitly)
            # ValueError:    non-numeric strings and `nan`
            # OverflowError: infinities
            return NA

    if b < KiB:
        return '{}B'.format(b)
    if b < MiB:
        return '{}KiB'.format(round(b / KiB))
    if b <= 20 * GiB:
        return '{}MiB'.format(round(b / MiB))
    if b < 100 * GiB:
        return '{:.2f}GiB'.format(round(b / GiB, 2))
    if b < 1000 * GiB:
        return '{:.1f}GiB'.format(round(b / GiB, 1))
    if b < 100 * TiB:
        return '{:.2f}TiB'.format(round(b / TiB, 2))
    if b < 1000 * TiB:
        return '{:.1f}TiB'.format(round(b / TiB, 1))
    if b < 100 * PiB:
        return '{:.2f}PiB'.format(round(b / PiB, 2))
    return '{:.1f}PiB'.format(round(b / PiB, 1))


def human2bytes(s: Union[int, str]) -> int:
    """Converts a human readable size string (*case insensitive*) to bytes.

    A unit with an ``i`` (``Ki``, ``MiB``, ...) is a power of 1024; a unit without it (``K``,
    ``MB``, ...) is a power of 1000. The trailing ``B`` is optional.

    Args:
        s (Union[int, str]):
            A non-negative integer (returned unchanged) or a size string such as ``'1.5GiB'``.

    Returns:
        The size in bytes, truncated to an integer.

    Raises:
        ValueError:
            If cannot convert the given size string, or the given integer is negative.

    Examples:

        >>> human2bytes('500B')
        500
        >>> human2bytes('10k')
        10000
        >>> human2bytes('10ki')
        10240
        >>> human2bytes('1M')
        1000000
        >>> human2bytes('1MiB')
        1048576
        >>> human2bytes('1.5GiB')
        1610612736
        >>> human2bytes('1PB')
        1000000000000000
    """

    if isinstance(s, int):
        if s >= 0:
            return s
        raise ValueError('Cannot convert {!r} to bytes.'.format(s))

    match = SIZE_PATTERN.match(s)
    if match is None:
        raise ValueError('Cannot convert {!r} to bytes.'.format(s))
    size, unit = match.group('size', 'unit')
    # Normalize the unit to the spelling used by the `SIZE_UNITS` keys:
    # 'k' -> 'KB', 'ki' -> 'KiB', 'gib' -> 'GiB', '' -> 'B'.
    unit = unit.upper().replace('I', 'i').replace('B', '') + 'B'
    return int(float(size) * SIZE_UNITS[unit])
Missing attributes will be automatically fetched from the original object. """ - def __init__(self, real, **items): + def __init__(self, real: Any, **items) -> None: self.real = real self.timestamp = time.time() for key, value in items.items(): setattr(self, key, value) - def __str__(self): + def __str__(self) -> str: keys = set(self.__dict__.keys()).difference({'real', 'timestamp'}) keys = ['real', *sorted(keys)] keyvals = [] @@ -270,7 +355,10 @@ class Snapshot: __repr__ = __str__ - def __getattr__(self, name): + def __hash__(self) -> int: + return hash((self.real, self.timestamp)) + + def __getattr__(self, name: str) -> Any: """Gets a member from the instance. If the attribute is not defined, fetches from the original object and makes a function call. """ @@ -285,41 +373,51 @@ class Snapshot: setattr(self, name, attribute) return attribute - def __getitem__(self, name): + def __getitem__(self, name: str) -> Any: """Supports ``dict['name']`` syntax.""" try: - return self.__getattr__(name) + return getattr(self, name) except AttributeError as ex: - raise KeyError from ex + raise KeyError(name) from ex - def __setitem__(self, name, value): + def __setitem__(self, name: str, value: Any) -> None: """Supports ``dict['name'] = value`` syntax.""" - self.__setattr__(name, value) + setattr(self, name, value) + + def __iter__(self) -> Iterable[str]: + """Supports ``for name in dict`` syntax.""" + + def gen() -> str: + for name in self.__dict__: + if name not in ('real', 'timestamp'): + yield name + + return gen() # Modified from psutil (https://github.com/giampaolo/psutil) -def memoize_when_activated(func): +def memoize_when_activated(method: Callable[[Any], Any]) -> Callable[[Any], Any]: """A memoize decorator which is disabled by default. It can be activated and deactivated on request. For efficiency reasons it can be used only against class methods accepting no arguments. 
""" - @functools.wraps(func) + @functools.wraps(method) def wrapped(self): try: # case 1: we previously entered oneshot() ctx - ret = self._cache[func] # pylint: disable=protected-access + ret = self._cache[method] # pylint: disable=protected-access except AttributeError: # case 2: we never entered oneshot() ctx - return func(self) + return method(self) except KeyError: # case 3: we entered oneshot() ctx but there's no cache # for this entry yet - ret = func(self) + ret = method(self) try: - self._cache[func] = ret # pylint: disable=protected-access + self._cache[method] = ret # pylint: disable=protected-access except AttributeError: # multi-threading race condition, see: # https://github.com/giampaolo/psutil/issues/1948 diff --git a/nvitop/select.py b/nvitop/select.py new file mode 100644 index 0000000..a6b96b9 --- /dev/null +++ b/nvitop/select.py @@ -0,0 +1,502 @@ +# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. +# License: GNU GPL version 3. + +"""CUDA visible devices selection tool. + +Command line usage: + +.. code-block:: bash + + # All devices but sorted + nvisel # or use `python3 -m nvitop.select` + + # A simple example to select 4 devices + nvisel -n 4 # or use `python3 -m nvitop.select -n 4` + + # Select available devices that satisfy the given constraints + nvisel --min-count 2 --max-count 3 --min-free-memory 5GiB --max-gpu-utilization 60 + + # Set `CUDA_VISIBLE_DEVICES` environment variable using `nvisel` + export CUDA_DEVICE_ORDER="PCI_BUS_ID" CUDA_VISIBLE_DEVICES="$(nvisel -c 1 -f 10GiB)" + + # Use UUID strings in `CUDA_VISIBLE_DEVICES` environment variable + export CUDA_VISIBLE_DEVICES="$(nvisel -O uuid -c 2 -f 5000M)" + + # Pipe output to other shell utilities + nvisel -0 -O uuid -c 2 -f 4GiB | xargs -0 -I {} nvidia-smi --id={} --query-gpu=index,memory.free --format=csv + +Python API: + +.. 
# pylint: disable=too-many-branches,too-many-statements,too-many-locals
def select_devices(
    devices: Optional[Iterable[Device]] = None,
    *,
    format: str = 'index',  # pylint: disable=redefined-builtin
    force_index: bool = False,
    min_count: int = 0,
    max_count: Optional[int] = None,
    min_free_memory: Optional[Union[int, str]] = None,  # in bytes or human readable
    min_total_memory: Optional[Union[int, str]] = None,  # in bytes or human readable
    max_gpu_utilization: Optional[int] = None,  # in percentage
    max_memory_utilization: Optional[int] = None,  # in percentage
    tolerance: int = 0,  # in percentage
    free_accounts: Optional[List[str]] = None,
    **kwargs,  # pylint: disable=unused-argument
) -> Union[List[int], List[Tuple[int, int]], List[str], List[Device]]:
    """Selects a subset of devices satisfying the specified criteria. Returns a list of the device
    identifiers.

    Note:
        The *min count* constraint may not be satisfied if not enough devices are available. This
        function never fails because of it; ``min_count`` is only consulted when MIG devices are
        among the candidates, to choose between the non-MIG devices and a single MIG device.

    Examples:

        Put the following lines to the top of your script:

        .. code-block:: python

            import os
            from nvitop import select_devices

            os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
                select_devices(format='uuid', min_count=4, min_free_memory='8GiB')
            )

    Args:
        devices (Iterable[Device]):
            The device superset to select from. If not specified, use all devices as the superset.
        format (str):
            The format of the output. One of 'index', 'uuid', or 'device'. If any MIG device is
            selected while the format is 'index', falls back to the 'uuid' format.
        force_index (bool):
            If :data:`True`, keep the 'index' output format even when a MIG device is selected.
        min_count (int):
            The minimum number of devices to select.
        max_count (Optional[int]):
            The maximum number of devices to select.
        min_free_memory (Optional[Union[int, str]]):
            The minimum free memory (an :class:`int` *in bytes* or a :class:`str` in human readable
            form) of the selected devices.
        min_total_memory (Optional[Union[int, str]]):
            The minimum total memory (an :class:`int` *in bytes* or a :class:`str` in human readable
            form) of the selected devices.
        max_gpu_utilization (Optional[int]):
            The maximum GPU utilization rate (*in percentage*) of the selected devices.
        max_memory_utilization (Optional[int]):
            The maximum memory bandwidth utilization rate (*in percentage*) of the selected devices.
        tolerance (int):
            The tolerance rate (*in percentage*) to loosen the constraints.
        free_accounts (List[str]):
            A list of accounts whose used GPU memory needs be considered as free memory.
        **kwargs:
            Ignored. Accepted so that callers can pass a superset of options.

    Returns:
        A list with at most ``max_count`` entries, best candidates first: the underlying device
        objects for ``format='device'``, the ``uuid`` values for ``format='uuid'``, otherwise the
        ``index`` values.
    """

    assert format in ('index', 'uuid', 'device')
    assert tolerance >= 0
    tolerance = tolerance / 100.0  # percentage -> fraction

    if max_count is not None:
        if max_count == 0:
            return []
        assert max_count >= min_count >= 0

    free_accounts = set(free_accounts or [])

    if devices is None:
        devices = Device.all()

    # Memory constraints given as strings are converted to a number of bytes.
    if isinstance(min_free_memory, str):
        min_free_memory = human2bytes(min_free_memory)
    if isinstance(min_total_memory, str):
        min_total_memory = human2bytes(min_total_memory)

    # Work on snapshots of the leaf devices so that the values can be adjusted below without
    # touching the real device objects.
    available_devices = []  # type: List[DeviceSnapshot]
    for device in devices:
        available_devices.extend(map(lambda device: device.as_snapshot(), device.to_leaf_devices()))
    for device in available_devices:
        # Number of strict constraints this device misses (it may still pass within tolerance).
        device.loosen_constraints = 0
        for key in device:
            value = device[key]
            if not libnvml.nvmlCheckReturn(value):
                device[key] = float(value)  # convert `NA` to `math.nan`

    if len(free_accounts) > 0:
        with GpuProcess.failsafe():
            for device in available_devices:
                # Total GPU memory held by processes of the accounts to be treated as free.
                as_free_memory = 0
                for process in device.real.processes().values():
                    if process.username() in free_accounts:
                        as_free_memory += process.gpu_memory()
                device.memory_free += as_free_memory
                device.memory_used -= as_free_memory

    # Each constraint below adds one lazy `filter` stage; the chain is consumed by `sorted()`.
    # The lambda builds a 2-tuple only for its side effect: element 0 is the verdict against the
    # tolerance-relaxed bound (the value `filter` uses), element 1 is the `None` returned by
    # `setattr`, which increments `loosen_constraints` when the *strict* bound is missed.
    if min_free_memory is not None:
        loosen_min_free_memory = min_free_memory * (1.0 - tolerance)
        available_devices = filter(
            lambda device: (
                device.memory_free >= loosen_min_free_memory,
                setattr(
                    device,
                    'loosen_constraints',
                    device.loosen_constraints + int(not device.memory_free >= min_free_memory),
                ),
            )[0],
            available_devices,
        )
    if min_total_memory is not None:
        loosen_min_total_memory = min_total_memory * (1.0 - tolerance)
        available_devices = filter(
            lambda device: (
                device.memory_total >= loosen_min_total_memory,
                setattr(
                    device,
                    'loosen_constraints',
                    device.loosen_constraints + int(not device.memory_total >= min_total_memory),
                ),
            )[0],
            available_devices,
        )
    if max_gpu_utilization is not None:
        # Utilization bounds are relaxed additively, in percentage points.
        loosen_max_gpu_utilization = max_gpu_utilization + 100.0 * tolerance
        available_devices = filter(
            lambda device: (
                device.gpu_utilization <= loosen_max_gpu_utilization,
                setattr(
                    device,
                    'loosen_constraints',
                    device.loosen_constraints
                    + int(not device.gpu_utilization <= max_gpu_utilization),
                ),
            )[0],
            available_devices,
        )
    if max_memory_utilization is not None:
        loosen_max_memory_utilization = max_memory_utilization + 100.0 * tolerance
        available_devices = filter(
            lambda device: (
                device.memory_utilization <= loosen_max_memory_utilization,
                setattr(
                    device,
                    'loosen_constraints',
                    device.loosen_constraints
                    + int(not device.memory_utilization <= max_memory_utilization),
                ),
            )[0],
            available_devices,
        )

    # Rank the survivors: devices meeting more strict constraints come first, ties are broken by
    # the fields below in order.
    # NOTE(review): `False` sorts before `True`, so for each `(not isnan(x), x)` pair a device
    # whose value is NaN is ordered ahead of devices with a real value -- confirm this is intended.
    available_devices = sorted(
        available_devices,
        key=lambda device: (
            device.loosen_constraints,
            (not math.isnan(device.memory_free), -device.memory_free),  # descending
            (not math.isnan(device.memory_used), -device.memory_used),  # descending
            (not math.isnan(device.gpu_utilization), device.gpu_utilization),  # ascending
            (not math.isnan(device.memory_utilization), device.memory_utilization),  # ascending
            -device.physical_index,  # descending to keep free
        ),
    )  # type: List[DeviceSnapshot]

    if any(device.is_mig_device for device in available_devices):  # found MIG devices!
        if min_count >= 2:
            non_mig_devices = [device for device in available_devices if not device.is_mig_device]
            mig_devices = [device for device in available_devices if device.is_mig_device]
            # Prefer the non-MIG devices when there are enough of them or when the best-ranked
            # candidate is not a MIG device; otherwise keep only the best MIG device.
            if len(non_mig_devices) >= min_count or not available_devices[0].is_mig_device:
                available_devices = non_mig_devices
            else:
                available_devices = mig_devices[:1]  # at most one MIG device is visible
    # Check again
    if any(device.is_mig_device for device in available_devices):  # found MIG devices!
        if format == 'index' and not force_index:
            format = 'uuid'

    # `max_count` may be `None`, in which case the slice keeps every device.
    available_devices = available_devices[:max_count]

    if format == 'device':
        return [device.real for device in available_devices]

    if format == 'uuid':
        identifiers = [device.uuid for device in available_devices]  # type: List[str]
    else:
        identifiers = [
            device.index for device in available_devices
        ]  # type: List[Union[int, Tuple[int, int]]]
    return identifiers
def parse_arguments():  # pylint: disable=too-many-branches,too-many-statements
    """Parses the command line arguments of the ``nvisel`` tool.

    After parsing, the result is normalized:

    - ``--count N`` overrides both ``min_count`` and ``max_count``;
    - ``--newline`` / ``--null`` replace ``sep`` with ``'\\n'`` / ``'\\0'``;
    - ``--account-as-free`` without usernames falls back to ``USERNAME``.

    Returns:
        The :class:`argparse.Namespace` holding the parsed and normalized options.

    Raises:
        RuntimeError:
            If the maximum count is less than the minimum count.
    """

    def non_negint(argstring):
        # argparse turns the `ValueError` into a usage error that mentions `__name__` below.
        num = int(argstring)
        if num < 0:
            raise ValueError
        return num

    non_negint.__name__ = 'non-negative integer'

    parser = argparse.ArgumentParser(
        prog='nvisel',
        description='CUDA visible devices selection tool.',
        formatter_class=argparse.RawTextHelpFormatter,
        add_help=False,
    )
    parser.add_argument(
        '--help',
        '-h',
        dest='help',
        action='help',
        default=argparse.SUPPRESS,
        help='Show this help message and exit.',
    )
    parser.add_argument(
        '--version',
        '-V',
        dest='version',
        action='version',
        version='%(prog)s {}'.format(__version__),
        help="Show %(prog)s's version number and exit.",
    )

    constraints = parser.add_argument_group('constraints')
    constraints.add_argument(
        '--inherit',
        dest='inherit',
        action='store_true',
        help=(
            'Inherit the current `CUDA_VISIBLE_DEVICES` environment variable.\n'
            'This means selecting a subset of the currently CUDA-visible devices.'
        ),
    )
    constraints.add_argument(
        '--account-as-free',
        dest='free_accounts',
        nargs='*',
        metavar='USERNAME',
        help=(
            'Account the used GPU memory of the given users as free memory.\n'
            'If this option is specified but without argument, `$USER` will be used.'
        ),
    )
    constraints.add_argument(
        '--min-count',
        '-c',
        dest='min_count',
        type=non_negint,
        default=0,
        metavar='N',
        help=(
            'Minimum number of devices to select. (default: %(default)d)\n'
            'The tool will fail (exit non-zero) if the requested resource is not available.'
        ),
    )
    constraints.add_argument(
        '--max-count',
        '-C',
        dest='max_count',
        type=non_negint,
        default=None,
        metavar='N',
        help='Maximum number of devices to select. (default: all devices)',
    )
    constraints.add_argument(
        '--count',
        '-n',
        dest='count',
        type=non_negint,
        metavar='N',
        help='Overriding both `--min-count N` and `--max-count N`.',
    )
    constraints.add_argument(
        '--min-free-memory',
        '-f',
        dest='min_free_memory',
        type=human2bytes,
        default=None,
        metavar='SIZE',
        help=(
            'Minimum free memory of devices to select. (example value: 4GiB)\n'
            'If this constraint is given, check against all devices.'
        ),
    )
    constraints.add_argument(
        '--min-total-memory',
        '-t',
        dest='min_total_memory',
        type=human2bytes,
        default=None,
        metavar='SIZE',
        help=(
            'Minimum total memory of devices to select. (example value: 10GiB)\n'
            'If this constraint is given, check against all devices.'
        ),
    )
    constraints.add_argument(
        '--max-gpu-utilization',
        '-G',
        dest='max_gpu_utilization',
        type=non_negint,
        default=None,
        metavar='RATE',
        help=(
            'Maximum GPU utilization rate of devices to select. (example value: 30)\n'
            'If this constraint is given, check against all devices.'
        ),
    )
    constraints.add_argument(
        '--max-memory-utilization',
        '-M',
        dest='max_memory_utilization',
        type=non_negint,
        default=None,
        metavar='RATE',
        help=(
            'Maximum memory bandwidth utilization rate of devices to select. (example value: 50)\n'
            'If this constraint is given, check against all devices.'
        ),
    )
    constraints.add_argument(
        '--tolerance',
        '--tol',
        dest='tolerance',
        type=non_negint,
        default=0,  # strict by default, as documented in the help text below
        metavar='TOL',
        help=(
            'The constraints tolerance (in percentage). (default: 0, i.e., strict)\n'
            'This option can loose the constraints if the requested resource is not available.\n'
            'For example, set `--tolerance=20` will accept a device with only 4GiB of free\n'
            'memory when set `--min-free-memory=5GiB`.'
        ),
    )

    formatter = parser.add_argument_group('formatting')
    formatter.add_argument(
        '--format',
        '-O',
        dest='format',
        type=str,
        choices=('index', 'uuid'),
        default='index',
        metavar='FORMAT',
        help=(
            'The output format of the selected device identifiers. (default: %(default)s)\n'
            'If any MIG device found, the output format will be fallback to `uuid`.'
        ),
    )
    separator = formatter.add_mutually_exclusive_group()
    # No `nargs` here: with `nargs=1` argparse stores a one-element list, while the default and
    # the `--newline` / `--null` overrides are plain strings that get `.join()`-ed by the caller.
    separator.add_argument(
        '--sep',
        '--separator',
        '-s',
        dest='sep',
        type=str,
        default=',',
        metavar='SEP',
        help='Separator for the output. (default: %(default)r)',
    )
    separator.add_argument(
        '--newline',
        dest='newline',
        action='store_true',
        help=r"Use newline character as separator for the output, equivalent to `--sep=$'\n'`.",
    )
    separator.add_argument(
        '--null',
        '-0',
        dest='null',
        action='store_true',
        help=r"Use null character ('\x00') as separator for the output, equivalent to `--sep=$'\0'`.",
    )

    args = parser.parse_args()

    if args.count is not None:
        args.min_count = args.max_count = args.count
    if args.max_count is not None and args.max_count < args.min_count:
        raise RuntimeError('Max count must be no less than min count.')

    if args.newline:
        args.sep = '\n'
    elif args.null:
        args.sep = '\0'

    # `--account-as-free` given without any username: use the current user.
    if args.free_accounts is not None and len(args.free_accounts) == 0:
        args.free_accounts.append(USERNAME)

    return args
def main():
    """Entry point of the ``nvisel`` command line tool.

    Parses the command line, selects the devices and prints their identifiers to stdout joined
    by the requested separator. When stdout is not a TTY, a comma-separated
    ``CUDA_VISIBLE_DEVICES="..."`` line is also written to stderr.

    Returns:
        The process exit status:

        - ``0``: success;
        - ``1``: the NVML library was not found (nothing is printed here);
        - ``2``: another NVML error occurred while enumerating devices;
        - ``3``: a :class:`RuntimeError` occurred while enumerating devices;
        - ``4``: fewer devices than ``--min-count`` were selected (the result is still printed).
    """

    options = parse_arguments()

    try:
        candidates = Device.from_cuda_visible_devices() if options.inherit else Device.all()
    except libnvml.NVMLError_LibraryNotFound:
        return 1
    except libnvml.NVMLError as ex:
        label = colored('NVML ERROR:', color='red', attrs=('bold',))
        print('{} {}'.format(label, ex), file=sys.stderr)
        return 2
    except RuntimeError as ex:
        label = colored('CUDA ERROR:', color='red', attrs=('bold',))
        message = str(ex).replace('CUDA Error: ', '')
        print('{} {}'.format(label, message), file=sys.stderr)
        return 3

    selected = [str(identifier) for identifier in select_devices(candidates, **vars(options))]
    output = options.sep.join(selected)

    if not TTY:
        # stdout is piped or captured: echo the selection on stderr so it stays visible.
        print('CUDA_VISIBLE_DEVICES="{}"'.format(','.join(selected)), file=sys.stderr)

    if len(selected) < options.min_count:
        warnings.warn('Not enough devices found.', RuntimeWarning)
        exit_code = 4
    else:
        exit_code = 0

    # With the null separator the output is terminated by a null character as well (xargs -0).
    terminator = '\0' if options.sep == '\0' else '\n'
    print(output, end=terminator)
    return exit_code