mirror of
https://github.com/XuehaiPan/nvitop.git
synced 2026-05-15 14:15:55 -06:00
feat(select): add CUDA visible devices selection tool (#28)
Signed-off-by: Xuehai Pan <XuehaiPan@pku.edu.cn>
This commit is contained in:
parent
674c0df846
commit
08e33b7455
7 changed files with 806 additions and 60 deletions
112
README.md
112
README.md
|
|
@ -33,6 +33,7 @@ An interactive NVIDIA-GPU process viewer, the one-stop solution for GPU process
|
|||
- [For SSH Users](#for-ssh-users)
|
||||
- [Command Line Options and Environment Variables](#command-line-options-and-environment-variables)
|
||||
- [Keybindings for Monitor Mode](#keybindings-for-monitor-mode)
|
||||
- [CUDA Visible Devices Selection Tool](#cuda-visible-devices-selection-tool)
|
||||
- [Callback Functions for Machine Learning Frameworks](#callback-functions-for-machine-learning-frameworks)
|
||||
- [Callback for TensorFlow (Keras)](#callback-for-tensorflow-keras)
|
||||
- [Callback for PyTorch Lightning](#callback-for-pytorch-lightning)
|
||||
|
|
@ -399,6 +400,117 @@ echo 'set -gx NVITOP_MONITOR_MODE "full"' >> ~/.config/fish/config.fish
|
|||
|
||||
------
|
||||
|
||||
### CUDA Visible Devices Selection Tool
|
||||
|
||||
Automatically select `CUDA_VISIBLE_DEVICES` from the given criteria. Example usage of the CLI tool:
|
||||
|
||||
```console
|
||||
# All devices but sorted
|
||||
$ nvisel # or use `python3 -m nvitop.select`
|
||||
6,5,4,3,2,1,0,7,8
|
||||
|
||||
# A simple example to select 4 devices
|
||||
$ nvisel -n 4 # or use `python3 -m nvitop.select -n 4`
|
||||
6,5,4,3
|
||||
|
||||
# Select available devices that satisfy the given constraints
|
||||
$ nvisel --min-count 2 --max-count 3 --min-free-memory 5GiB --max-gpu-utilization 60
|
||||
6,5,4
|
||||
|
||||
# Set `CUDA_VISIBLE_DEVICES` environment variable using `nvisel`
|
||||
$ export CUDA_DEVICE_ORDER="PCI_BUS_ID" CUDA_VISIBLE_DEVICES="$(nvisel -c 1 -f 10GiB)"
|
||||
CUDA_VISIBLE_DEVICES="6,5,4,3,2,1,0"
|
||||
|
||||
# Use UUID strings in `CUDA_VISIBLE_DEVICES` environment variable
|
||||
$ export CUDA_VISIBLE_DEVICES="$(nvisel -O uuid -c 2 -f 5000M)"
|
||||
CUDA_VISIBLE_DEVICES="GPU-849d5a8d-610e-eeea-1fd4-81ff44a23794,GPU-18ef14e9-dec6-1d7e-1284-3010c6ce98b1,GPU-96de99c9-d68f-84c8-424c-7c75e59cc0a0,GPU-2428d171-8684-5b64-830c-435cd972ec4a,GPU-6d2a57c9-7783-44bb-9f53-13f36282830a,GPU-f8e5a624-2c7e-417c-e647-b764d26d4733,GPU-f9ca790e-683e-3d56-00ba-8f654e977e02"
|
||||
|
||||
# Pipe output to other shell utilities
|
||||
$ nvisel -0 -O uuid -c 2 -f 4GiB | xargs -0 -I {} nvidia-smi --id={} --query-gpu=index,memory.free --format=csv
|
||||
CUDA_VISIBLE_DEVICES="GPU-849d5a8d-610e-eeea-1fd4-81ff44a23794,GPU-18ef14e9-dec6-1d7e-1284-3010c6ce98b1,GPU-96de99c9-d68f-84c8-424c-7c75e59cc0a0,GPU-2428d171-8684-5b64-830c-435cd972ec4a,GPU-6d2a57c9-7783-44bb-9f53-13f36282830a,GPU-f8e5a624-2c7e-417c-e647-b764d26d4733,GPU-f9ca790e-683e-3d56-00ba-8f654e977e02"
|
||||
index, memory.free [MiB]
|
||||
6, 11018 MiB
|
||||
index, memory.free [MiB]
|
||||
5, 11018 MiB
|
||||
index, memory.free [MiB]
|
||||
4, 11018 MiB
|
||||
index, memory.free [MiB]
|
||||
3, 11018 MiB
|
||||
index, memory.free [MiB]
|
||||
2, 11018 MiB
|
||||
index, memory.free [MiB]
|
||||
1, 11018 MiB
|
||||
index, memory.free [MiB]
|
||||
0, 11018 MiB
|
||||
```
|
||||
|
||||
You can also integrate `nvisel` into your training script like this:
|
||||
|
||||
```python
|
||||
# Put this at the top of the Python script
|
||||
import os
|
||||
from nvitop import select_devices
|
||||
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
|
||||
select_devices(format='uuid', min_count=4, min_free_memory='8GiB')
|
||||
)
|
||||
```
|
||||
|
||||
Type `nvisel --help` for more command options:
|
||||
|
||||
```text
|
||||
usage: nvisel [--help] [--version] [--inherit] [--account-as-free [USERNAME ...]]
|
||||
[--min-count N] [--max-count N] [--count N]
|
||||
[--min-free-memory SIZE] [--min-total-memory SIZE]
|
||||
[--max-gpu-utilization RATE] [--max-memory-utilization RATE]
|
||||
[--tolerance TOL] [--format FORMAT] [--sep SEP | --newline | --null]
|
||||
|
||||
CUDA visible devices selection tool.
|
||||
|
||||
optional arguments:
|
||||
--help, -h Show this help message and exit.
|
||||
--version, -V Show nvisel's version number and exit.
|
||||
|
||||
constraints:
|
||||
--inherit Inherit the current `CUDA_VISIBLE_DEVICES` environment variable.
|
||||
This means selecting a subset of the currently CUDA-visible devices.
|
||||
--account-as-free [USERNAME ...]
|
||||
Account the used GPU memory of the given users as free memory.
|
||||
If this option is specified but without argument, `$USER` will be used.
|
||||
--min-count N, -c N Minimum number of devices to select. (default: 0)
|
||||
The tool will fail (exit non-zero) if the requested resource is not available.
|
||||
--max-count N, -C N Maximum number of devices to select. (default: all devices)
|
||||
--count N, -n N Overriding both `--min-count N` and `--max-count N`.
|
||||
--min-free-memory SIZE, -f SIZE
|
||||
Minimum free memory of devices to select. (example value: 4GiB)
|
||||
If this constraint is given, check against all devices.
|
||||
--min-total-memory SIZE, -t SIZE
|
||||
Minimum total memory of devices to select. (example value: 10GiB)
|
||||
If this constraint is given, check against all devices.
|
||||
--max-gpu-utilization RATE, -G RATE
|
||||
Maximum GPU utilization rate of devices to select. (example value: 30)
|
||||
If this constraint is given, check against all devices.
|
||||
--max-memory-utilization RATE, -M RATE
|
||||
Maximum memory bandwidth utilization rate of devices to select. (example value: 50)
|
||||
If this constraint is given, check against all devices.
|
||||
--tolerance TOL, --tol TOL
|
||||
The constraints tolerance (in percentage). (default: 0, i.e., strict)
|
||||
This option can loosen the constraints if the requested resource is not available.
|
||||
For example, set `--tolerance=20` will accept a device with only 4GiB of free
|
||||
memory when set `--min-free-memory=5GiB`.
|
||||
|
||||
formatting:
|
||||
--format FORMAT, -O FORMAT
|
||||
The output format of the selected device identifiers. (default: index)
|
||||
If any MIG device is found, the output format will fall back to `uuid`.
|
||||
--sep SEP, --separator SEP, -s SEP
|
||||
Separator for the output. (default: ',')
|
||||
--newline Use newline character as separator for the output, equivalent to `--sep=$'\n'`.
|
||||
--null, -0 Use null character ('\x00') as separator for the output, equivalent to `--sep=$'\0'`.
|
||||
```
|
||||
|
||||
------
|
||||
|
||||
### Callback Functions for Machine Learning Frameworks
|
||||
|
||||
`nvitop` provides two builtin callbacks for [TensorFlow (Keras)](https://www.tensorflow.org) and [PyTorch Lightning](https://pytorchlightning.ai).
|
||||
|
|
|
|||
37
nvisel.py
Executable file
37
nvisel.py
Executable file
|
|
@ -0,0 +1,37 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
|
||||
# License: GNU GPL version 3.
|
||||
|
||||
"""CUDA visible devices selection tool.
|
||||
|
||||
Usage:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# All devices but sorted
|
||||
nvisel # or use `python3 -m nvitop.select`
|
||||
|
||||
# A simple example to select 4 devices
|
||||
nvisel -n 4 # or use `python3 -m nvitop.select -n 4`
|
||||
|
||||
# Select available devices that satisfy the given constraints
|
||||
nvisel --min-count 2 --max-count 3 --min-free-memory 5GiB --max-gpu-utilization 60
|
||||
|
||||
# Set `CUDA_VISIBLE_DEVICES` environment variable using `nvisel`
|
||||
export CUDA_DEVICE_ORDER="PCI_BUS_ID" CUDA_VISIBLE_DEVICES="$(nvisel -c 1 -f 10GiB)"
|
||||
|
||||
# Use UUID strings in `CUDA_VISIBLE_DEVICES` environment variable
|
||||
export CUDA_VISIBLE_DEVICES="$(nvisel -O uuid -c 2 -f 5000M)"
|
||||
|
||||
# Pipe output to other shell utilities
|
||||
nvisel -0 -O uuid -c 2 -f 4GiB | xargs -0 -I {} nvidia-smi --id={} --query-gpu=index,memory.free --format=csv
|
||||
"""
|
||||
|
||||
import sys
|
||||
|
||||
from nvitop.select import main # pylint: disable=no-name-in-module
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Run the selection tool and hand the value returned by `main()` to `sys.exit()`.
    sys.exit(main())
|
||||
|
|
@ -5,7 +5,8 @@
|
|||
|
||||
from nvitop import core
|
||||
from nvitop.core import *
|
||||
from nvitop.select import select_devices
|
||||
from nvitop.version import __version__
|
||||
|
||||
|
||||
__all__ = core.__all__.copy()
|
||||
__all__ = ['select_devices'] + core.__all__
|
||||
|
|
|
|||
|
|
@ -2334,8 +2334,3 @@ def parse_cuda_visible_devices_to_uuids(
|
|||
if isinstance(result, Exception):
|
||||
raise result
|
||||
return result
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
for cuda_device in CudaDevice.all():
|
||||
print(cuda_device.uuid())
|
||||
|
|
|
|||
|
|
@ -8,8 +8,10 @@
|
|||
import datetime
|
||||
import functools
|
||||
import math
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from typing import Any, Callable, Iterable, Optional, Union
|
||||
|
||||
from psutil import WINDOWS
|
||||
|
||||
|
|
@ -24,7 +26,9 @@ __all__ = [
|
|||
'GiB',
|
||||
'TiB',
|
||||
'PiB',
|
||||
'SIZE_UNITS',
|
||||
'bytes2human',
|
||||
'human2bytes',
|
||||
'timedelta2human',
|
||||
'utilization2string',
|
||||
'colored',
|
||||
|
|
@ -46,21 +50,31 @@ try:
|
|||
from termcolor import colored as _colored
|
||||
except ImportError:
|
||||
|
||||
def _colored(text, color=None, on_color=None, attrs=None): # pylint: disable=unused-argument
|
||||
def _colored( # pylint: disable=unused-argument
|
||||
text: str,
|
||||
color: Optional[str] = None,
|
||||
on_color: Optional[str] = None,
|
||||
attrs: Iterable[str] = None,
|
||||
) -> str:
|
||||
return text
|
||||
|
||||
|
||||
COLOR = sys.stdout.isatty()
|
||||
|
||||
|
||||
def set_color(value):
|
||||
def set_color(value: bool) -> None:
|
||||
"""Force enables text coloring."""
|
||||
|
||||
global COLOR # pylint: disable=global-statement
|
||||
COLOR = bool(value)
|
||||
|
||||
|
||||
def colored(text, color=None, on_color=None, attrs=None):
|
||||
def colored(
|
||||
text: str,
|
||||
color: Optional[str] = None,
|
||||
on_color: Optional[str] = None,
|
||||
attrs: Iterable[str] = None,
|
||||
) -> str:
|
||||
"""Colorizes text.
|
||||
|
||||
Available text colors:
|
||||
|
|
@ -86,57 +100,71 @@ def colored(text, color=None, on_color=None, attrs=None):
|
|||
class NaType(str):
|
||||
"""A singleton (:const:`str: 'N/A'`) class represents a not applicable value."""
|
||||
|
||||
def __new__(cls):
|
||||
def __new__(cls) -> 'NaType':
|
||||
"""Gets the singleton instance (:const:`nvitop.NA`)."""
|
||||
|
||||
if not hasattr(cls, '_instance'):
|
||||
cls._instance = super().__new__(cls, 'N/A')
|
||||
return cls._instance
|
||||
|
||||
def __bool__(self):
|
||||
"""``bool(NA)`` -> :data:`False`"""
|
||||
def __bool__(self) -> bool:
|
||||
"""Converts :const:`NA` to :class:`bool`.
|
||||
|
||||
>>> bool(NA)
|
||||
False
|
||||
"""
|
||||
|
||||
return False
|
||||
|
||||
def __int__(self):
|
||||
"""``int(NA)`` -> :data:`0`"""
|
||||
def __int__(self) -> int:
|
||||
"""Converts :const:`NA` to :class:`int`.
|
||||
|
||||
>>> int(NA)
|
||||
0
|
||||
"""
|
||||
|
||||
return 0
|
||||
|
||||
def __float__(self):
|
||||
"""``float(NA)`` -> :data:`math.nan`"""
|
||||
def __float__(self) -> float:
|
||||
"""Converts :const:`NA` to :class:`float`.
|
||||
|
||||
>>> float(NA)
|
||||
nan
|
||||
>>> float(NA) is math.nan
|
||||
True
|
||||
"""
|
||||
|
||||
return math.nan
|
||||
|
||||
def __lt__(self, x):
|
||||
def __lt__(self, x: object) -> bool:
|
||||
"""The :const:`nvitop.NA` is always greater than any number. Use the dictionary order for string."""
|
||||
|
||||
if isinstance(x, (int, float)):
|
||||
return False
|
||||
return super().__lt__(x)
|
||||
|
||||
def __le__(self, x):
|
||||
def __le__(self, x: object) -> bool:
|
||||
"""The :const:`nvitop.NA` is always greater than any number. Use the dictionary order for string."""
|
||||
|
||||
if isinstance(x, (int, float)):
|
||||
return False
|
||||
return super().__le__(x)
|
||||
|
||||
def __gt__(self, x):
|
||||
def __gt__(self, x: object) -> bool:
|
||||
"""The :const:`nvitop.NA` is always greater than any number. Use the dictionary order for string."""
|
||||
|
||||
if isinstance(x, (int, float)):
|
||||
return True
|
||||
return super().__gt__(x)
|
||||
|
||||
def __ge__(self, x):
|
||||
def __ge__(self, x: object) -> bool:
|
||||
"""The :const:`nvitop.NA` is always greater than any number. Use the dictionary order for string."""
|
||||
|
||||
if isinstance(x, (int, float)):
|
||||
return True
|
||||
return super().__ge__(x)
|
||||
|
||||
def __format__(self, format_spec):
|
||||
def __format__(self, format_spec: str) -> str:
|
||||
try:
|
||||
return super().__format__(format_spec)
|
||||
except ValueError:
|
||||
|
|
@ -168,40 +196,97 @@ TiB = 1 << 40
|
|||
PiB = 1 << 50
|
||||
"""Pebibyte (1024 * 1024 * 1024 * 1024 * 1024)"""
|
||||
|
||||
SIZE_UNITS = {
    # No unit, an empty unit, or a bare 'B' all mean plain bytes.
    None: 1,
    '': 1,
    'B': 1,
    # Binary (IEC) units: powers of 1024.
    'KiB': KiB,
    'MiB': MiB,
    'GiB': GiB,
    'TiB': TiB,
    'PiB': PiB,
    # Decimal (SI) units: powers of 1000.
    'KB': 1000,
    'MB': 1000**2,
    'GB': 1000**3,
    'TB': 1000**4,
    'PB': 1000**5,  # peta is 10^15 (this entry previously duplicated the 'TB' factor)
}
"""Units of storage and memory measurements."""
|
||||
# Accepts an optional leading `+`, a decimal number (group `size`) and an optional unit
# (group `unit`): one of the prefixes K/M/G/T/P followed by an optional `i` and an optional `B`,
# or just an optional `B`. Matching is case insensitive and whitespace around the parts is allowed.
SIZE_PATTERN = re.compile(
    r'^\s*\+?\s*(?P<size>\d+(?:\.\d+)?)\s*(?P<unit>[KMGTP]i?B?|B?)\s*$', flags=re.IGNORECASE
)
"""The regex pattern for human readable size."""
|
||||
|
||||
def bytes2human(x): # pylint: disable=too-many-return-statements
|
||||
|
||||
def bytes2human(b: Union[int, float, NaType]) -> str: # pylint: disable=too-many-return-statements
|
||||
"""Converts bytes to a human readable string."""
|
||||
|
||||
if x is None or x == NA:
|
||||
if b == NA:
|
||||
return NA
|
||||
|
||||
if not isinstance(x, int):
|
||||
if not isinstance(b, int):
|
||||
try:
|
||||
x = round(float(x))
|
||||
b = round(float(b))
|
||||
except ValueError:
|
||||
return NA
|
||||
|
||||
if x < KiB:
|
||||
return '{}B'.format(x)
|
||||
if x < MiB:
|
||||
return '{}KiB'.format(round(x / KiB))
|
||||
if x <= 20 * GiB:
|
||||
return '{}MiB'.format(round(x / MiB))
|
||||
if x < 100 * GiB:
|
||||
return '{:.2f}GiB'.format(round(x / GiB, 2))
|
||||
if x < 1000 * GiB:
|
||||
return '{:.1f}GiB'.format(round(x / GiB, 1))
|
||||
if x < 100 * TiB:
|
||||
return '{:.2f}TiB'.format(round(x / TiB, 2))
|
||||
if x < 1000 * TiB:
|
||||
return '{:.1f}TiB'.format(round(x / TiB, 1))
|
||||
if x < 100 * PiB:
|
||||
return '{:.2f}PiB'.format(round(x / PiB, 2))
|
||||
return '{:.1f}PiB'.format(round(x / PiB, 1))
|
||||
if b < KiB:
|
||||
return '{}B'.format(b)
|
||||
if b < MiB:
|
||||
return '{}KiB'.format(round(b / KiB))
|
||||
if b <= 20 * GiB:
|
||||
return '{}MiB'.format(round(b / MiB))
|
||||
if b < 100 * GiB:
|
||||
return '{:.2f}GiB'.format(round(b / GiB, 2))
|
||||
if b < 1000 * GiB:
|
||||
return '{:.1f}GiB'.format(round(b / GiB, 1))
|
||||
if b < 100 * TiB:
|
||||
return '{:.2f}TiB'.format(round(b / TiB, 2))
|
||||
if b < 1000 * TiB:
|
||||
return '{:.1f}TiB'.format(round(b / TiB, 1))
|
||||
if b < 100 * PiB:
|
||||
return '{:.2f}PiB'.format(round(b / PiB, 2))
|
||||
return '{:.1f}PiB'.format(round(b / PiB, 1))
|
||||
|
||||
|
||||
def timedelta2human(dt):
|
||||
"""Converts :class:`datetime.timedelta` instance to a human readable string."""
|
||||
def human2bytes(s: Union[int, str]) -> int:
    """Converts a human readable size string (*case insensitive*) to bytes.

    Args:
        s (Union[int, str]):
            Either a non-negative integer, which is returned unchanged, or a string that consists
            of a decimal number optionally followed by a unit (e.g. ``'B'``, ``'k'``, ``'KiB'``,
            ``'GB'``). A prefix without ``i`` is a decimal unit (powers of 1000), a prefix with
            ``i`` is a binary unit (powers of 1024).

    Returns:
        The size in bytes. A fractional number of bytes is truncated towards zero.

    Raises:
        ValueError:
            If cannot convert the given size string.

    Examples:

        >>> human2bytes('500B')
        500
        >>> human2bytes('10k')
        10000
        >>> human2bytes('10ki')
        10240
        >>> human2bytes('1M')
        1000000
        >>> human2bytes('1MiB')
        1048576
        >>> human2bytes('1.5GiB')
        1610612736
        >>> human2bytes('1.005KB')
        1005
    """

    # Imported locally so that this function does not depend on a module-level import.
    from fractions import Fraction  # pylint: disable=import-outside-toplevel

    if isinstance(s, int):
        if s >= 0:
            return s
        raise ValueError('Cannot convert {!r} to bytes.'.format(s))

    match = SIZE_PATTERN.match(s)
    if match is None:
        raise ValueError('Cannot convert {!r} to bytes.'.format(s))
    size, unit = match.groups()
    # Normalize the matched unit to a key of `SIZE_UNITS`: 'k' -> 'KB', 'ki' -> 'KiB', '' -> 'B'.
    unit = unit.upper().replace('I', 'i').replace('B', '') + 'B'
    # Use exact rational arithmetic: most decimal fractions have no exact binary float
    # representation, so `int(float(size) * factor)` can land one byte below the true value
    # (e.g. `1.005 * 1000 == 1004.9999999999999`).
    return int(Fraction(size) * SIZE_UNITS[unit])
|
||||
|
||||
|
||||
def timedelta2human(dt: Union[int, float, datetime.timedelta, NaType]) -> str:
|
||||
"""Converts a number in seconds or a :class:`datetime.timedelta` instance to a human readable string."""
|
||||
|
||||
if isinstance(dt, (int, float)):
|
||||
dt = datetime.timedelta(seconds=dt)
|
||||
|
|
@ -218,7 +303,7 @@ def timedelta2human(dt):
|
|||
return '{:d}:{:02d}'.format(*divmod(seconds, 60))
|
||||
|
||||
|
||||
def utilization2string(utilization):
|
||||
def utilization2string(utilization: Union[int, float, NaType]) -> str:
|
||||
"""Converts a utilization rate to string."""
|
||||
|
||||
if utilization != NA:
|
||||
|
|
@ -229,7 +314,7 @@ def utilization2string(utilization):
|
|||
return NA
|
||||
|
||||
|
||||
def boolify(string, default=None):
|
||||
def boolify(string: str, default: Any = None) -> bool:
|
||||
"""Converts the given value, usually a string, to boolean."""
|
||||
|
||||
if string.lower() in ('true', 'yes', 'on', 'enabled', '1'):
|
||||
|
|
@ -248,13 +333,13 @@ class Snapshot:
|
|||
Missing attributes will be automatically fetched from the original object.
|
||||
"""
|
||||
|
||||
def __init__(self, real, **items):
|
||||
def __init__(self, real: Any, **items) -> None:
|
||||
self.real = real
|
||||
self.timestamp = time.time()
|
||||
for key, value in items.items():
|
||||
setattr(self, key, value)
|
||||
|
||||
def __str__(self):
|
||||
def __str__(self) -> str:
|
||||
keys = set(self.__dict__.keys()).difference({'real', 'timestamp'})
|
||||
keys = ['real', *sorted(keys)]
|
||||
keyvals = []
|
||||
|
|
@ -270,7 +355,10 @@ class Snapshot:
|
|||
|
||||
__repr__ = __str__
|
||||
|
||||
def __getattr__(self, name):
|
||||
def __hash__(self) -> int:
|
||||
return hash((self.real, self.timestamp))
|
||||
|
||||
def __getattr__(self, name: str) -> Any:
|
||||
"""Gets a member from the instance.
|
||||
If the attribute is not defined, fetches from the original object and makes a function call.
|
||||
"""
|
||||
|
|
@ -285,41 +373,51 @@ class Snapshot:
|
|||
setattr(self, name, attribute)
|
||||
return attribute
|
||||
|
||||
def __getitem__(self, name):
|
||||
def __getitem__(self, name: str) -> Any:
|
||||
"""Supports ``dict['name']`` syntax."""
|
||||
|
||||
try:
|
||||
return self.__getattr__(name)
|
||||
return getattr(self, name)
|
||||
except AttributeError as ex:
|
||||
raise KeyError from ex
|
||||
raise KeyError(name) from ex
|
||||
|
||||
def __setitem__(self, name, value):
|
||||
def __setitem__(self, name: str, value: Any) -> None:
|
||||
"""Supports ``dict['name'] = value`` syntax."""
|
||||
|
||||
self.__setattr__(name, value)
|
||||
setattr(self, name, value)
|
||||
|
||||
def __iter__(self) -> Iterable[str]:
|
||||
"""Supports ``for name in dict`` syntax."""
|
||||
|
||||
def gen() -> str:
|
||||
for name in self.__dict__:
|
||||
if name not in ('real', 'timestamp'):
|
||||
yield name
|
||||
|
||||
return gen()
|
||||
|
||||
|
||||
# Modified from psutil (https://github.com/giampaolo/psutil)
|
||||
def memoize_when_activated(func):
|
||||
def memoize_when_activated(method: Callable[[Any], Any]) -> Callable[[Any], Any]:
|
||||
"""A memoize decorator which is disabled by default. It can be activated and
|
||||
deactivated on request. For efficiency reasons it can be used only against
|
||||
class methods accepting no arguments.
|
||||
"""
|
||||
|
||||
@functools.wraps(func)
|
||||
@functools.wraps(method)
|
||||
def wrapped(self):
|
||||
try:
|
||||
# case 1: we previously entered oneshot() ctx
|
||||
ret = self._cache[func] # pylint: disable=protected-access
|
||||
ret = self._cache[method] # pylint: disable=protected-access
|
||||
except AttributeError:
|
||||
# case 2: we never entered oneshot() ctx
|
||||
return func(self)
|
||||
return method(self)
|
||||
except KeyError:
|
||||
# case 3: we entered oneshot() ctx but there's no cache
|
||||
# for this entry yet
|
||||
ret = func(self)
|
||||
ret = method(self)
|
||||
try:
|
||||
self._cache[func] = ret # pylint: disable=protected-access
|
||||
self._cache[method] = ret # pylint: disable=protected-access
|
||||
except AttributeError:
|
||||
# multi-threading race condition, see:
|
||||
# https://github.com/giampaolo/psutil/issues/1948
|
||||
|
|
|
|||
502
nvitop/select.py
Normal file
502
nvitop/select.py
Normal file
|
|
@ -0,0 +1,502 @@
|
|||
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
|
||||
# License: GNU GPL version 3.
|
||||
|
||||
"""CUDA visible devices selection tool.
|
||||
|
||||
Command line usage:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# All devices but sorted
|
||||
nvisel # or use `python3 -m nvitop.select`
|
||||
|
||||
# A simple example to select 4 devices
|
||||
nvisel -n 4 # or use `python3 -m nvitop.select -n 4`
|
||||
|
||||
# Select available devices that satisfy the given constraints
|
||||
nvisel --min-count 2 --max-count 3 --min-free-memory 5GiB --max-gpu-utilization 60
|
||||
|
||||
# Set `CUDA_VISIBLE_DEVICES` environment variable using `nvisel`
|
||||
export CUDA_DEVICE_ORDER="PCI_BUS_ID" CUDA_VISIBLE_DEVICES="$(nvisel -c 1 -f 10GiB)"
|
||||
|
||||
# Use UUID strings in `CUDA_VISIBLE_DEVICES` environment variable
|
||||
export CUDA_VISIBLE_DEVICES="$(nvisel -O uuid -c 2 -f 5000M)"
|
||||
|
||||
# Pipe output to other shell utilities
|
||||
nvisel -0 -O uuid -c 2 -f 4GiB | xargs -0 -I {} nvidia-smi --id={} --query-gpu=index,memory.free --format=csv
|
||||
|
||||
Python API:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Put this at the top of the Python script
|
||||
import os
|
||||
from nvitop import select_devices
|
||||
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
|
||||
select_devices(format='uuid', min_count=4, min_free_memory='8GiB')
|
||||
)
|
||||
"""
|
||||
|
||||
# pylint: disable=missing-function-docstring
|
||||
|
||||
import argparse
|
||||
import math
|
||||
import sys
|
||||
import warnings
|
||||
from typing import Iterable, List, Optional, Tuple, Union
|
||||
|
||||
from nvitop.core import Device, GpuProcess, human2bytes, libnvml
|
||||
from nvitop.gui import USERNAME, colored
|
||||
from nvitop.version import __version__
|
||||
|
||||
|
||||
TTY = sys.stdout.isatty()
|
||||
|
||||
|
||||
# pylint: disable=too-many-branches,too-many-statements,too-many-locals
|
||||
def select_devices(
    devices: Optional[Iterable['Device']] = None,
    *,
    format: str = 'index',  # pylint: disable=redefined-builtin
    force_index: bool = False,
    min_count: int = 0,
    max_count: Optional[int] = None,
    min_free_memory: Optional[Union[int, str]] = None,  # in bytes or human readable
    min_total_memory: Optional[Union[int, str]] = None,  # in bytes or human readable
    max_gpu_utilization: Optional[int] = None,  # in percentage
    max_memory_utilization: Optional[int] = None,  # in percentage
    tolerance: int = 0,  # in percentage
    free_accounts: Optional[List[str]] = None,
    **kwargs,  # pylint: disable=unused-argument
) -> Union[List[int], List[Tuple[int, int]], List[str], List['Device']]:
    """Selects a subset of devices satisfying the specified criteria. Returns a list of the device
    identifiers.

    Note:
        The *min count* constraint may not be satisfied if there are not enough devices available.
        This constraint is only enforced when there are both MIG and non-MIG devices present.

    Examples:

        Put the following lines to the top of your script:

        .. code-block:: python

            import os
            from nvitop import select_devices

            os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
                select_devices(format='uuid', min_count=4, min_free_memory='8GiB')
            )

    Args:
        devices (Iterable[Device]):
            The device superset to select from. If not specified, use all devices as the superset.
        format (str):
            The format of the output. One of 'index', 'uuid', or 'device'. If gets any MIG device
            with format 'index' set, falls back to the 'uuid' format.
        force_index (bool):
            If :data:`True`, always use the device index as the output format even when gets any
            MIG device.
        min_count (int):
            The minimum number of devices to select.
        max_count (Optional[int]):
            The maximum number of devices to select.
        min_free_memory (Optional[Union[int, str]]):
            The minimum free memory (an :class:`int` *in bytes* or a :class:`str` in human readable
            form) of the selected devices.
        min_total_memory (Optional[Union[int, str]]):
            The minimum total memory (an :class:`int` *in bytes* or a :class:`str` in human readable
            form) of the selected devices.
        max_gpu_utilization (Optional[int]):
            The maximum GPU utilization rate (*in percentage*) of the selected devices.
        max_memory_utilization (Optional[int]):
            The maximum memory bandwidth utilization rate (*in percentage*) of the selected devices.
        tolerance (int):
            The tolerance rate (*in percentage*) to loosen the constraints.
        free_accounts (List[str]):
            A list of accounts whose used GPU memory needs to be considered as free memory.

    Returns:
        A list of the selected devices, best candidate first: the device indices for format
        'index', the UUID strings for format 'uuid', or the device objects for format 'device'.

    Raises:
        AssertionError:
            If ``format`` is unknown, ``tolerance`` is negative, or the given ``max_count`` is
            smaller than ``min_count``.
    """

    assert format in ('index', 'uuid', 'device')
    assert tolerance >= 0
    tolerance = tolerance / 100.0

    if max_count is not None:
        if max_count == 0:
            return []
        assert max_count >= min_count >= 0

    free_accounts = set(free_accounts or [])

    if devices is None:
        devices = Device.all()

    if isinstance(min_free_memory, str):
        min_free_memory = human2bytes(min_free_memory)
    if isinstance(min_total_memory, str):
        min_total_memory = human2bytes(min_total_memory)

    available_devices = []  # type: List[DeviceSnapshot]
    for device in devices:
        available_devices.extend(leaf.as_snapshot() for leaf in device.to_leaf_devices())
    for device in available_devices:
        # Number of constraints this device only satisfies thanks to the tolerance.
        device.loosen_constraints = 0
        for key in device:
            value = device[key]
            if not libnvml.nvmlCheckReturn(value):
                device[key] = float(value)  # convert `NA` to `math.nan`

    if free_accounts:
        with GpuProcess.failsafe():
            for device in available_devices:
                as_free_memory = 0
                for process in device.real.processes().values():
                    if process.username() in free_accounts:
                        as_free_memory += process.gpu_memory()
                device.memory_free += as_free_memory
                device.memory_used -= as_free_memory

    # Each entry is (snapshot attribute, strict bound, loosened bound, is a lower bound).
    constraints = []
    if min_free_memory is not None:
        constraints.append(
            ('memory_free', min_free_memory, min_free_memory * (1.0 - tolerance), True)
        )
    if min_total_memory is not None:
        constraints.append(
            ('memory_total', min_total_memory, min_total_memory * (1.0 - tolerance), True)
        )
    if max_gpu_utilization is not None:
        constraints.append(
            ('gpu_utilization', max_gpu_utilization, max_gpu_utilization + 100.0 * tolerance, False)
        )
    if max_memory_utilization is not None:
        constraints.append(
            (
                'memory_utilization',
                max_memory_utilization,
                max_memory_utilization + 100.0 * tolerance,
                False,
            )
        )

    def within(value, bound, is_lower_bound):
        # Any comparison with `math.nan` is false, so unknown values never satisfy a bound.
        return value >= bound if is_lower_bound else value <= bound

    def is_available(device):
        """Checks the loosened bounds and counts the strict bounds the device violates."""
        for attribute, strict_bound, loosened_bound, is_lower_bound in constraints:
            value = getattr(device, attribute)
            if not within(value, loosened_bound, is_lower_bound):
                return False
            if not within(value, strict_bound, is_lower_bound):
                device.loosen_constraints += 1
        return True

    available_devices = [device for device in available_devices if is_available(device)]

    # Devices that needed fewer loosened constraints come first. Each metric is paired with a
    # "is not NaN" flag so that NaN values are ordered by the flag instead of being compared.
    available_devices = sorted(
        available_devices,
        key=lambda device: (
            device.loosen_constraints,
            (not math.isnan(device.memory_free), -device.memory_free),  # descending
            (not math.isnan(device.memory_used), -device.memory_used),  # descending
            (not math.isnan(device.gpu_utilization), device.gpu_utilization),  # ascending
            (not math.isnan(device.memory_utilization), device.memory_utilization),  # ascending
            -device.physical_index,  # descending to keep <GPU 0> free
        ),
    )  # type: List[DeviceSnapshot]

    if any(device.is_mig_device for device in available_devices):  # found MIG devices!
        if min_count >= 2:
            non_mig_devices = [device for device in available_devices if not device.is_mig_device]
            mig_devices = [device for device in available_devices if device.is_mig_device]
            if len(non_mig_devices) >= min_count or not available_devices[0].is_mig_device:
                available_devices = non_mig_devices
            else:
                available_devices = mig_devices[:1]  # at most one MIG device is visible
        # Check again, the MIG devices may have been dropped above
        if any(device.is_mig_device for device in available_devices):  # found MIG devices!
            if format == 'index' and not force_index:
                format = 'uuid'

    available_devices = available_devices[:max_count]  # `max_count=None` keeps all devices

    if format == 'device':
        return [device.real for device in available_devices]
    if format == 'uuid':
        return [device.uuid for device in available_devices]
    return [device.index for device in available_devices]
|
||||
|
||||
|
||||
def parse_arguments():  # pylint: disable=too-many-branches,too-many-statements
    """Parse and post-process the command-line arguments of ``nvisel``.

    The options are split into two groups: ``constraints`` (which devices may be
    selected) and ``formatting`` (how the selected identifiers are printed).

    After parsing, the namespace is normalized:

    - ``--count N`` overrides both ``min_count`` and ``max_count``.
    - ``--newline`` / ``--null`` are folded into ``sep``.
    - ``--account-as-free`` given without a value falls back to ``USERNAME``.

    Returns:
        argparse.Namespace: The normalized arguments. ``sep`` is always a plain string.

    Raises:
        RuntimeError: If the maximum count is smaller than the minimum count.
    """

    def non_negint(argstring):
        # Raising ``ValueError`` lets argparse report an "invalid ... value" usage error.
        num = int(argstring)
        if num < 0:
            raise ValueError
        return num

    # argparse uses the converter's ``__name__`` when building its error message.
    non_negint.__name__ = 'non-negative integer'

    parser = argparse.ArgumentParser(
        prog='nvisel',
        description='CUDA visible devices selection tool.',
        formatter_class=argparse.RawTextHelpFormatter,
        add_help=False,
    )
    parser.add_argument(
        '--help',
        '-h',
        dest='help',
        action='help',
        default=argparse.SUPPRESS,
        help='Show this help message and exit.',
    )
    parser.add_argument(
        '--version',
        '-V',
        dest='version',
        action='version',
        version='%(prog)s {}'.format(__version__),
        help="Show %(prog)s's version number and exit.",
    )

    constraints = parser.add_argument_group('constraints')
    constraints.add_argument(
        '--inherit',
        dest='inherit',
        action='store_true',
        help=(
            'Inherit the current `CUDA_VISIBLE_DEVICES` environment variable.\n'
            'This means selecting a subset of the currently CUDA-visible devices.'
        ),
    )
    constraints.add_argument(
        '--account-as-free',
        dest='free_accounts',
        nargs='*',
        metavar='USERNAME',
        help=(
            'Account the used GPU memory of the given users as free memory.\n'
            'If this option is specified but without argument, `$USER` will be used.'
        ),
    )
    constraints.add_argument(
        '--min-count',
        '-c',
        dest='min_count',
        type=non_negint,
        default=0,
        metavar='N',
        help=(
            'Minimum number of devices to select. (default: %(default)d)\n'
            'The tool will fail (exit non-zero) if the requested resource is not available.'
        ),
    )
    constraints.add_argument(
        '--max-count',
        '-C',
        dest='max_count',
        type=non_negint,
        default=None,
        metavar='N',
        help='Maximum number of devices to select. (default: all devices)',
    )
    constraints.add_argument(
        '--count',
        '-n',
        dest='count',
        type=non_negint,
        metavar='N',
        help='Overriding both `--min-count N` and `--max-count N`.',
    )
    constraints.add_argument(
        '--min-free-memory',
        '-f',
        dest='min_free_memory',
        type=human2bytes,
        default=None,
        metavar='SIZE',
        help=(
            'Minimum free memory of devices to select. (example value: 4GiB)\n'
            'If this constraint is given, check against all devices.'
        ),
    )
    constraints.add_argument(
        '--min-total-memory',
        '-t',
        dest='min_total_memory',
        type=human2bytes,
        default=None,
        metavar='SIZE',
        help=(
            'Minimum total memory of devices to select. (example value: 10GiB)\n'
            'If this constraint is given, check against all devices.'
        ),
    )
    constraints.add_argument(
        '--max-gpu-utilization',
        '-G',
        dest='max_gpu_utilization',
        type=non_negint,
        default=None,
        metavar='RATE',
        help=(
            'Maximum GPU utilization rate of devices to select. (example value: 30)\n'
            'If this constraint is given, check against all devices.'
        ),
    )
    constraints.add_argument(
        '--max-memory-utilization',
        '-M',
        dest='max_memory_utilization',
        type=non_negint,
        default=None,
        metavar='RATE',
        help=(
            'Maximum memory bandwidth utilization rate of devices to select. (example value: 50)\n'
            'If this constraint is given, check against all devices.'
        ),
    )
    constraints.add_argument(
        '--tolerance',
        '--tol',
        dest='tolerance',
        type=non_negint,
        # Strict by default, as documented in the help text below.
        default=0,
        metavar='TOL',
        help=(
            'The constraints tolerance (in percentage). (default: 0, i.e., strict)\n'
            'This option can loose the constraints if the requested resource is not available.\n'
            'For example, set `--tolerance=20` will accept a device with only 4GiB of free\n'
            'memory when set `--min-free-memory=5GiB`.'
        ),
    )

    formatter = parser.add_argument_group('formatting')
    formatter.add_argument(
        '--format',
        '-O',
        dest='format',
        type=str,
        choices=('index', 'uuid'),
        default='index',
        metavar='FORMAT',
        help=(
            'The output format of the selected device identifiers. (default: %(default)s)\n'
            'If any MIG device found, the output format will be fallback to `uuid`.'
        ),
    )
    separator = formatter.add_mutually_exclusive_group()
    # No `nargs` here: `nargs=1` would store a one-element list instead of a string,
    # which breaks `args.sep.join(...)` and the `args.sep == '\0'` check in the caller.
    separator.add_argument(
        '--sep',
        '--separator',
        '-s',
        dest='sep',
        type=str,
        default=',',
        metavar='SEP',
        help='Separator for the output. (default: %(default)r)',
    )
    separator.add_argument(
        '--newline',
        dest='newline',
        action='store_true',
        help=r"Use newline character as separator for the output, equivalent to `--sep=$'\n'`.",
    )
    separator.add_argument(
        '--null',
        '-0',
        dest='null',
        action='store_true',
        help=r"Use null character ('\x00') as separator for the output, equivalent to `--sep=$'\0'`.",
    )

    args = parser.parse_args()

    # `--count N` is a shorthand for `--min-count N --max-count N`.
    if args.count is not None:
        args.min_count = args.max_count = args.count
    if args.max_count is not None and args.max_count < args.min_count:
        raise RuntimeError('Max count must be no less than min count.')

    # Fold the separator shortcuts into a single `sep` string.
    if args.newline:
        args.sep = '\n'
    elif args.null:
        args.sep = '\0'

    # `--account-as-free` without any username falls back to `USERNAME`.
    if args.free_accounts is not None and len(args.free_accounts) == 0:
        args.free_accounts.append(USERNAME)

    return args
|
||||
|
||||
|
||||
def main():
    """Run the ``nvisel`` command-line tool and return its exit status.

    Enumerates the candidate devices (either all devices, or only those from
    ``Device.from_cuda_visible_devices()`` when ``--inherit`` is given), selects
    devices with ``select_devices`` and prints the identifiers to stdout joined
    by the requested separator.

    Returns:
        int: ``0`` on success; ``1`` when ``NVMLError_LibraryNotFound`` is raised;
        ``2`` on any other ``NVMLError``; ``3`` on a ``RuntimeError`` raised while
        enumerating devices; ``4`` when fewer than ``min_count`` devices are selected
        (the identifiers that were found are still printed).
    """
    args = parse_arguments()

    try:
        devices = Device.from_cuda_visible_devices() if args.inherit else Device.all()
    except libnvml.NVMLError_LibraryNotFound:
        # No message is printed here; presumably the NVML wrapper reports it -- TODO confirm.
        return 1
    except libnvml.NVMLError as ex:
        label = colored('NVML ERROR:', color='red', attrs=('bold',))
        print('{} {}'.format(label, ex), file=sys.stderr)
        return 2
    except RuntimeError as ex:
        label = colored('CUDA ERROR:', color='red', attrs=('bold',))
        message = str(ex).replace('CUDA Error: ', '')
        print('{} {}'.format(label, message), file=sys.stderr)
        return 3

    identifiers = [str(identifier) for identifier in select_devices(devices, **vars(args))]
    result = args.sep.join(identifiers)

    # When the module-level `TTY` flag is false, also echo the selection to stderr
    # in a comma-separated `CUDA_VISIBLE_DEVICES="..."` form.
    if not TTY:
        print('CUDA_VISIBLE_DEVICES="{}"'.format(','.join(identifiers)), file=sys.stderr)

    enough_devices = len(identifiers) >= args.min_count
    if not enough_devices:
        warnings.warn('Not enough devices found.', RuntimeWarning)

    # With the null separator the output is also null-terminated instead of newline-terminated.
    terminator = '\0' if args.sep == '\0' else '\n'
    print(result, end=terminator)

    return 0 if enough_devices else 4
|
||||
|
||||
|
||||
# Script entry point: propagate the status returned by `main()` as the process exit code.
if __name__ == '__main__':
    raise SystemExit(main())
|
||||
|
|
@ -51,6 +51,7 @@ dynamic = [
|
|||
|
||||
[project.scripts]
|
||||
nvitop = "nvitop.cli:main"
|
||||
nvisel = "nvitop.select:main"
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/XuehaiPan/nvitop"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue