mirror of
https://github.com/XuehaiPan/nvitop.git
synced 2026-05-21 06:45:24 -06:00
feat(select): support CUDA_VISIBLE_DEVICES conversion
Signed-off-by: Xuehai Pan <XuehaiPan@pku.edu.cn>
This commit is contained in:
parent
dc562f503f
commit
8cba1de7aa
2 changed files with 55 additions and 24 deletions
20
README.md
20
README.md
|
|
@ -451,6 +451,13 @@ index, memory.free [MiB]
|
|||
1, 11018 MiB
|
||||
index, memory.free [MiB]
|
||||
0, 11018 MiB
|
||||
|
||||
# Normalize the `CUDA_VISIBLE_DEVICES` environment variable (e.g. convert UUIDs to indices or get full UUIDs for an abbreviated form)
|
||||
$ nvisel -i "GPU-18ef14e9,GPU-849d5a8d" -S
|
||||
5,6
|
||||
$ nvisel -i "GPU-18ef14e9,GPU-849d5a8d" -S -O uuid --newline
|
||||
GPU-18ef14e9-dec6-1d7e-1284-3010c6ce98b1
|
||||
GPU-849d5a8d-610e-eeea-1fd4-81ff44a23794
|
||||
```
|
||||
|
||||
You can also integrate `nvisel` into your training script like this:
|
||||
|
|
@ -468,11 +475,13 @@ os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
|
|||
Type `nvisel --help` for more command options:
|
||||
|
||||
```text
|
||||
usage: nvisel [--help] [--version] [--inherit] [--account-as-free [USERNAME ...]]
|
||||
usage: nvisel [--help] [--version]
|
||||
[--inherit [CUDA_VISIBLE_DEVICES]] [--account-as-free [USERNAME ...]]
|
||||
[--min-count N] [--max-count N] [--count N]
|
||||
[--min-free-memory SIZE] [--min-total-memory SIZE]
|
||||
[--max-gpu-utilization RATE] [--max-memory-utilization RATE]
|
||||
[--tolerance TOL] [--format FORMAT] [--sep SEP | --newline | --null]
|
||||
[--tolerance TOL]
|
||||
[--format FORMAT] [--sep SEP | --newline | --null] [--no-sort]
|
||||
|
||||
CUDA visible devices selection tool.
|
||||
|
||||
|
|
@ -481,8 +490,10 @@ optional arguments:
|
|||
--version, -V Show nvisel's version number and exit.
|
||||
|
||||
constraints:
|
||||
--inherit Inherit the current `CUDA_VISIBLE_DEVICES` environment variable.
|
||||
This means selecting a subset of the currently CUDA-visible devices.
|
||||
--inherit [CUDA_VISIBLE_DEVICES], -i [CUDA_VISIBLE_DEVICES]
|
||||
Inherit the given `CUDA_VISIBLE_DEVICES`. If the argument is omitted, use the
|
||||
value from the environment. This means selecting a subset of the currently
|
||||
CUDA-visible devices.
|
||||
--account-as-free [USERNAME ...]
|
||||
Account the used GPU memory of the given users as free memory.
|
||||
If this option is specified but without argument, `$USER` will be used.
|
||||
|
|
@ -517,6 +528,7 @@ formatting:
|
|||
--newline Use newline character as separator for the output, equivalent to `--sep=$'\n'`.
|
||||
--null, -0 Use null character ('\x00') as separator for the output. This option corresponds
|
||||
to the `-0` option of `xargs`.
|
||||
--no-sort, -S Do not sort the device by memory usage and GPU utilization.
|
||||
```
|
||||
|
||||
------
|
||||
|
|
|
|||
|
|
@ -25,6 +25,9 @@ Command line usage:
|
|||
# Pipe output to other shell utilities
|
||||
nvisel -0 -O uuid -c 2 -f 4GiB | xargs -0 -I {} nvidia-smi --id={} --query-gpu=index,memory.free --format=csv
|
||||
|
||||
# Normalize the `CUDA_VISIBLE_DEVICES` environment variable (e.g. convert UUIDs to indices or get full UUIDs for an abbreviated form)
|
||||
nvisel -i -S
|
||||
|
||||
Python API:
|
||||
|
||||
.. code-block:: python
|
||||
|
|
@ -36,12 +39,13 @@ Python API:
|
|||
os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
|
||||
select_devices(format='uuid', min_count=4, min_free_memory='8GiB')
|
||||
)
|
||||
"""
|
||||
""" # pylint: disable=line-too-long
|
||||
|
||||
# pylint: disable=missing-function-docstring
|
||||
|
||||
import argparse
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
import warnings
|
||||
from typing import Iterable, List, Optional, Tuple, Union
|
||||
|
|
@ -68,6 +72,7 @@ def select_devices(
|
|||
max_memory_utilization: Optional[int] = None, # in percentage
|
||||
tolerance: int = 0, # in percentage
|
||||
free_accounts: List[str] = None,
|
||||
sort: bool = True,
|
||||
**kwargs, # pylint: disable=unused-argument
|
||||
) -> Union[List[int], List[Tuple[int, int]], List[str]]:
|
||||
"""Select a subset of devices satisfying the specified criteria. Returns a list of the device
|
||||
|
|
@ -116,6 +121,8 @@ def select_devices(
|
|||
The tolerance rate (*in percentage*) to loosen the constraints.
|
||||
free_accounts (List[str]):
|
||||
A list of accounts whose used GPU memory needs to be considered as free memory.
|
||||
sort (bool):
|
||||
If :data:`True`, sort the selected devices by memory usage and GPU utilization.
|
||||
"""
|
||||
|
||||
assert format in ('index', 'uuid', 'device')
|
||||
|
|
@ -142,10 +149,6 @@ def select_devices(
|
|||
available_devices.extend(map(lambda device: device.as_snapshot(), device.to_leaf_devices()))
|
||||
for device in available_devices:
|
||||
device.loosen_constraints = 0
|
||||
for key in device:
|
||||
value = device[key]
|
||||
if not libnvml.nvmlCheckReturn(value):
|
||||
device[key] = float(value) # convert `NA`` to `math.nan`
|
||||
|
||||
if len(free_accounts) > 0:
|
||||
with GpuProcess.failsafe():
|
||||
|
|
@ -212,17 +215,18 @@ def select_devices(
|
|||
available_devices,
|
||||
)
|
||||
|
||||
available_devices = sorted(
|
||||
available_devices,
|
||||
key=lambda device: (
|
||||
device.loosen_constraints,
|
||||
(not math.isnan(device.memory_free), -device.memory_free), # descending
|
||||
(not math.isnan(device.memory_used), -device.memory_used), # descending
|
||||
(not math.isnan(device.gpu_utilization), device.gpu_utilization), # ascending
|
||||
(not math.isnan(device.memory_utilization), device.memory_utilization), # ascending
|
||||
-device.physical_index, # descending to keep <GPU 0> free
|
||||
),
|
||||
) # type: List[DeviceSnapshot]
|
||||
available_devices = list(available_devices)
|
||||
if sort:
|
||||
available_devices.sort(
|
||||
key=lambda device: (
|
||||
device.loosen_constraints,
|
||||
(not math.isnan(device.memory_free), -device.memory_free), # descending
|
||||
(not math.isnan(device.memory_used), -device.memory_used), # descending
|
||||
(not math.isnan(device.gpu_utilization), device.gpu_utilization), # ascending
|
||||
(not math.isnan(device.memory_utilization), device.memory_utilization), # ascending
|
||||
-device.physical_index, # descending to keep <GPU 0> free
|
||||
)
|
||||
)
|
||||
|
||||
if any(device.is_mig_device for device in available_devices): # found MIG devices!
|
||||
if min_count >= 2:
|
||||
|
|
@ -286,11 +290,16 @@ def parse_arguments(): # pylint: disable=too-many-branches,too-many-statements
|
|||
constraints = parser.add_argument_group('constraints')
|
||||
constraints.add_argument(
|
||||
'--inherit',
|
||||
'-i',
|
||||
dest='inherit',
|
||||
action='store_true',
|
||||
type=str,
|
||||
default=argparse.SUPPRESS,
|
||||
nargs='?',
|
||||
metavar='CUDA_VISIBLE_DEVICES',
|
||||
help=(
|
||||
'Inherit the current `CUDA_VISIBLE_DEVICES` environment variable.\n'
|
||||
'This means selecting a subset of the currently CUDA-visible devices.'
|
||||
'Inherit the given `CUDA_VISIBLE_DEVICES`. If the argument is omitted, use the\n'
|
||||
'value from the environment. This means selecting a subset of the currently\n'
|
||||
'CUDA-visible devices.'
|
||||
),
|
||||
)
|
||||
constraints.add_argument(
|
||||
|
|
@ -436,6 +445,13 @@ def parse_arguments(): # pylint: disable=too-many-branches,too-many-statements
|
|||
'to the `-0` option of `xargs`.'
|
||||
),
|
||||
)
|
||||
formatter.add_argument(
|
||||
'--no-sort',
|
||||
'-S',
|
||||
dest='sort',
|
||||
action='store_false',
|
||||
help='Do not sort the device by memory usage and GPU utilization.',
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
|
@ -459,7 +475,10 @@ def main():
|
|||
args = parse_arguments()
|
||||
|
||||
try:
|
||||
if args.inherit:
|
||||
if hasattr(args, 'inherit'):
|
||||
if args.inherit is not None:
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = args.inherit
|
||||
|
||||
devices = Device.from_cuda_visible_devices()
|
||||
else:
|
||||
devices = Device.all()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue