chore(core/libnvml): limit the number of saved FunctionNotFound exceptions in logging

Signed-off-by: Xuehai Pan <XuehaiPan@pku.edu.cn>
This commit is contained in:
Xuehai Pan 2022-07-05 15:51:02 +08:00
parent 1459e0828f
commit cf6ab08e66
2 changed files with 45 additions and 35 deletions

View file

@ -213,15 +213,16 @@ def main(): # pylint: disable=too-many-branches,too-many-statements,too-many-lo
if len(nvml.UNKNOWN_FUNCTIONS) > 1
else 'ERROR: A FunctionNotFound error occurred while calling:'
]
unknown_function_messages.extend([
*list(map(' nvmlQuery({.__name__!r}, *args, **kwargs)'.format, nvml.UNKNOWN_FUNCTIONS)),
('Please verify whether the `{0}` package is compatible with your NVIDIA driver version.\n'
'You can check the release history of `{0}` and install the compatible version manually.\n'
'See {1} for more information.').format(
colored('nvidia-ml-py', attrs=('bold',)),
colored('https://github.com/XuehaiPan/nvitop#installation', attrs=('underline',))
)
])
unknown_function_messages.extend(' nvmlQuery({.__name__!r}, *args, **kwargs)'.format(func)
for func, _ in nvml.UNKNOWN_FUNCTIONS.values())
unknown_function_messages.append('\n'.join((
'Please verify whether the `{0}` package is compatible with your NVIDIA driver version.',
'You can check the release history of `{0}` and install the compatible version manually.',
'See {1} for more information.',
)).format(
colored('nvidia-ml-py', attrs=('bold',)),
colored('https://github.com/XuehaiPan/nvitop#installation', attrs=('underline',))
))
message = '\n'.join(unknown_function_messages)
if (
'nvmlDeviceGetComputeRunningProcesses' in message

View file

@ -5,6 +5,7 @@
# pylint: disable=invalid-name
import inspect
import logging
import re
import threading
@ -29,6 +30,7 @@ class libnvml:
LOGGER = logging.getLogger('NVML')
UNKNOWN_FUNCTIONS = {}
UNKNOWN_FUNCTIONS_CACHE_SIZE = 1024
VERSIONED_PATTERN = re.compile(r'^(?P<name>\w+)(?P<suffix>_v(\d)+)$')
c_nvmlDevice_t = pynvml.c_nvmlDevice_t
@ -97,11 +99,11 @@ class libnvml:
NVMLError_DriverNotLoaded:
If NVIDIA driver is not loaded.
NVMLError_LibRmVersionMismatch:
If RM detects a driver/library version mismatch, usually after a upgrade for NVIDIA
If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
driver without reloading the kernel module.
AttributeError:
If cannot find function ``nvmlInitWithFlags``, usually the ``pynvml`` module is overridden
by other modules. Need to reinstall package ``nvidia-ml-py``.
If cannot find function ``nvmlInitWithFlags``, usually the ``pynvml`` module is
overridden by other modules. Need to reinstall package ``nvidia-ml-py``.
"""
self.nvmlInitWithFlags(0)
@ -115,11 +117,11 @@ class libnvml:
NVMLError_DriverNotLoaded:
If NVIDIA driver is not loaded.
NVMLError_LibRmVersionMismatch:
If RM detects a driver/library version mismatch, usually after a upgrade for NVIDIA
If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
driver without reloading the kernel module.
AttributeError:
If cannot find function ``nvmlInitWithFlags``, usually the ``pynvml`` module is overridden
by other modules. Need to reinstall package ``nvidia-ml-py``.
If cannot find function ``nvmlInitWithFlags``, usually the ``pynvml`` module is
overridden by other modules. Need to reinstall package ``nvidia-ml-py``.
"""
with self._lock:
@ -130,14 +132,14 @@ class libnvml:
try:
pynvml.nvmlInitWithFlags(flags)
except nvml.NVMLError_LibraryNotFound: # pylint: disable=no-member
message = (
'FATAL ERROR: NVIDIA Management Library (NVML) not found.\n'
'HINT: The NVIDIA Management Library ships with the NVIDIA display driver (available at\n'
' https://www.nvidia.com/Download/index.aspx), or can be downloaded as part of the\n'
' NVIDIA CUDA Toolkit (available at https://developer.nvidia.com/cuda-downloads).\n'
' The lists of OS platforms and NVIDIA-GPUs supported by the NVML library can be\n'
' found in the NVML API Reference at https://docs.nvidia.com/deploy/nvml-api.'
)
message = '\n'.join((
'FATAL ERROR: NVIDIA Management Library (NVML) not found.',
'HINT: The NVIDIA Management Library ships with the NVIDIA display driver (available at',
' https://www.nvidia.com/Download/index.aspx), or can be downloaded as part of the',
' NVIDIA CUDA Toolkit (available at https://developer.nvidia.com/cuda-downloads).',
' The lists of OS platforms and NVIDIA-GPUs supported by the NVML library can be',
' found in the NVML API Reference at https://docs.nvidia.com/deploy/nvml-api.',
))
for text, color, attrs in (('FATAL ERROR:', 'red', ('bold',)),
('HINT:', 'yellow', ('bold',)),
('https://www.nvidia.com/Download/index.aspx', None, ('underline',)),
@ -148,12 +150,12 @@ class libnvml:
self.LOGGER.critical(message)
raise
except AttributeError:
message = (
'FATAL ERROR: The dependency package `nvidia-ml-py` is corrupted. You may have installed\n'
' other packages overriding the module `pynvml`.\n'
'Please reinstall `nvitop` with command:\n'
' python3 -m pip install --force-reinstall nvitop'
)
message = '\n'.join((
'FATAL ERROR: The dependency package `nvidia-ml-py` is corrupted. You may have installed',
' other packages overriding the module `pynvml`.',
'Please reinstall `nvitop` with command:',
' python3 -m pip install --force-reinstall nvitop',
))
for text, color, attrs in (('FATAL ERROR:', 'red', ('bold',)),
('nvidia-ml-py', None, ('bold',)),
('pynvml', None, ('bold',)),
@ -176,7 +178,7 @@ class libnvml:
NVMLError_DriverNotLoaded:
If NVIDIA driver is not loaded.
NVMLError_LibRmVersionMismatch:
If RM detects a driver/library version mismatch, usually after a upgrade for NVIDIA
If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
driver without reloading the kernel module.
NVMLError_Uninitialized:
If NVML was not first initialized with ``nvmlInit()``.
@ -222,15 +224,22 @@ class libnvml:
if isinstance(func, str):
try:
func = getattr(self, func)
except AttributeError as e:
raise nvml.NVMLError_FunctionNotFound from e
except AttributeError as e1:
raise nvml.NVMLError_FunctionNotFound from e1 # pylint: disable=no-member
retval = func(*args, **kwargs)
except nvml.NVMLError_FunctionNotFound as e: # pylint: disable=no-member
except nvml.NVMLError_FunctionNotFound as e2: # pylint: disable=no-member
if not ignore_function_not_found:
if identifier.__name__ == '<lambda>':
identifier = inspect.getsource(func)
else:
identifier = repr(func)
with self._lock:
if func not in self.UNKNOWN_FUNCTIONS:
self.UNKNOWN_FUNCTIONS[func] = e
if (
identifier not in self.UNKNOWN_FUNCTIONS
and len(self.UNKNOWN_FUNCTIONS) < self.UNKNOWN_FUNCTIONS_CACHE_SIZE
):
self.UNKNOWN_FUNCTIONS[identifier] = (func, e2)
self.LOGGER.error(
'ERROR: A FunctionNotFound error occurred while calling %s.\n'
'Please verify whether the `nvidia-ml-py` package is '