feat(select): support CUDA_VISIBLE_DEVICES conversion

Signed-off-by: Xuehai Pan <XuehaiPan@pku.edu.cn>
This commit is contained in:
Xuehai Pan 2022-07-29 17:47:42 +08:00
parent dc562f503f
commit 8cba1de7aa
2 changed files with 55 additions and 24 deletions

View file

@ -451,6 +451,13 @@ index, memory.free [MiB]
1, 11018 MiB
index, memory.free [MiB]
0, 11018 MiB
# Normalize the `CUDA_VISIBLE_DEVICES` environment variable (e.g. convert UUIDs to indices or get full UUIDs for an abbreviated form)
$ nvisel -i "GPU-18ef14e9,GPU-849d5a8d" -S
5,6
$ nvisel -i "GPU-18ef14e9,GPU-849d5a8d" -S -O uuid --newline
GPU-18ef14e9-dec6-1d7e-1284-3010c6ce98b1
GPU-849d5a8d-610e-eeea-1fd4-81ff44a23794
```
You can also integrate `nvisel` into your training script like this:
@ -468,11 +475,13 @@ os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
Type `nvisel --help` for more command options:
```text
usage: nvisel [--help] [--version] [--inherit] [--account-as-free [USERNAME ...]]
usage: nvisel [--help] [--version]
[--inherit [CUDA_VISIBLE_DEVICES]] [--account-as-free [USERNAME ...]]
[--min-count N] [--max-count N] [--count N]
[--min-free-memory SIZE] [--min-total-memory SIZE]
[--max-gpu-utilization RATE] [--max-memory-utilization RATE]
[--tolerance TOL] [--format FORMAT] [--sep SEP | --newline | --null]
[--tolerance TOL]
[--format FORMAT] [--sep SEP | --newline | --null] [--no-sort]
CUDA visible devices selection tool.
@ -481,8 +490,10 @@ optional arguments:
--version, -V Show nvisel's version number and exit.
constraints:
--inherit Inherit the current `CUDA_VISIBLE_DEVICES` environment variable.
This means selecting a subset of the currently CUDA-visible devices.
--inherit [CUDA_VISIBLE_DEVICES], -i [CUDA_VISIBLE_DEVICES]
Inherit the given `CUDA_VISIBLE_DEVICES`. If the argument is omitted, use the
value from the environment. This means selecting a subset of the currently
CUDA-visible devices.
--account-as-free [USERNAME ...]
Account the used GPU memory of the given users as free memory.
If this option is specified but without argument, `$USER` will be used.
@ -517,6 +528,7 @@ formatting:
--newline Use newline character as separator for the output, equivalent to `--sep=$'\n'`.
--null, -0 Use null character ('\x00') as separator for the output. This option corresponds
to the `-0` option of `xargs`.
--no-sort, -S Do not sort the device by memory usage and GPU utilization.
```
------

View file

@ -25,6 +25,9 @@ Command line usage:
# Pipe output to other shell utilities
nvisel -0 -O uuid -c 2 -f 4GiB | xargs -0 -I {} nvidia-smi --id={} --query-gpu=index,memory.free --format=csv
# Normalize the `CUDA_VISIBLE_DEVICES` environment variable (e.g. convert UUIDs to indices or get full UUIDs for an abbreviated form)
nvisel -i -S
Python API:
.. code-block:: python
@ -36,12 +39,13 @@ Python API:
os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
select_devices(format='uuid', min_count=4, min_free_memory='8GiB')
)
"""
""" # pylint: disable=line-too-long
# pylint: disable=missing-function-docstring
import argparse
import math
import os
import sys
import warnings
from typing import Iterable, List, Optional, Tuple, Union
@ -68,6 +72,7 @@ def select_devices(
max_memory_utilization: Optional[int] = None, # in percentage
tolerance: int = 0, # in percentage
free_accounts: List[str] = None,
sort: bool = True,
**kwargs, # pylint: disable=unused-argument
) -> Union[List[int], List[Tuple[int, int]], List[str]]:
"""Selected a subset of devices satisfying the specified criteria. Returns a list of the device
@ -116,6 +121,8 @@ def select_devices(
The tolerance rate (*in percentage*) to loosen the constraints.
free_accounts (List[str]):
A list of accounts whose used GPU memory needs to be considered as free memory.
sort (bool):
If :data:`True`, sort the selected devices by memory usage and GPU utilization.
"""
assert format in ('index', 'uuid', 'device')
@ -142,10 +149,6 @@ def select_devices(
available_devices.extend(map(lambda device: device.as_snapshot(), device.to_leaf_devices()))
for device in available_devices:
device.loosen_constraints = 0
for key in device:
value = device[key]
if not libnvml.nvmlCheckReturn(value):
device[key] = float(value) # convert `NA`` to `math.nan`
if len(free_accounts) > 0:
with GpuProcess.failsafe():
@ -212,17 +215,18 @@ def select_devices(
available_devices,
)
available_devices = sorted(
available_devices,
key=lambda device: (
device.loosen_constraints,
(not math.isnan(device.memory_free), -device.memory_free), # descending
(not math.isnan(device.memory_used), -device.memory_used), # descending
(not math.isnan(device.gpu_utilization), device.gpu_utilization), # ascending
(not math.isnan(device.memory_utilization), device.memory_utilization), # ascending
-device.physical_index, # descending to keep <GPU 0> free
),
) # type: List[DeviceSnapshot]
available_devices = list(available_devices)
if sort:
available_devices.sort(
key=lambda device: (
device.loosen_constraints,
(not math.isnan(device.memory_free), -device.memory_free), # descending
(not math.isnan(device.memory_used), -device.memory_used), # descending
(not math.isnan(device.gpu_utilization), device.gpu_utilization), # ascending
(not math.isnan(device.memory_utilization), device.memory_utilization), # ascending
-device.physical_index, # descending to keep <GPU 0> free
)
)
if any(device.is_mig_device for device in available_devices): # found MIG devices!
if min_count >= 2:
@ -286,11 +290,16 @@ def parse_arguments(): # pylint: disable=too-many-branches,too-many-statements
constraints = parser.add_argument_group('constraints')
constraints.add_argument(
'--inherit',
'-i',
dest='inherit',
action='store_true',
type=str,
default=argparse.SUPPRESS,
nargs='?',
metavar='CUDA_VISIBLE_DEVICES',
help=(
'Inherit the current `CUDA_VISIBLE_DEVICES` environment variable.\n'
'This means selecting a subset of the currently CUDA-visible devices.'
'Inherit the given `CUDA_VISIBLE_DEVICES`. If the argument is omitted, use the\n'
'value from the environment. This means selecting a subset of the currently\n'
'CUDA-visible devices.'
),
)
constraints.add_argument(
@ -436,6 +445,13 @@ def parse_arguments(): # pylint: disable=too-many-branches,too-many-statements
'to the `-0` option of `xargs`.'
),
)
formatter.add_argument(
'--no-sort',
'-S',
dest='sort',
action='store_false',
help='Do not sort the device by memory usage and GPU utilization.',
)
args = parser.parse_args()
@ -459,7 +475,10 @@ def main():
args = parse_arguments()
try:
if args.inherit:
if hasattr(args, 'inherit'):
if args.inherit is not None:
os.environ['CUDA_VISIBLE_DEVICES'] = args.inherit
devices = Device.from_cuda_visible_devices()
else:
devices = Device.all()