diff --git a/README.md b/README.md index f3de898..54fed4f 100644 --- a/README.md +++ b/README.md @@ -146,14 +146,20 @@ pip3 install . **NOTE:** If you encounter the *"nvitop: command not found"* error after installation, please check whether you have added the Python console script path (e.g., `"${HOME}/.local/bin"`) to your `PATH` environment variable. Alternatively, you can use `python3 -m nvitop`. -**IMPORTANT:** `pip` will install `nvidia-ml-py==11.450.51` as a dependency for `nvitop`. Please verify whether the `nvidia-ml-py` package is compatible with your NVIDIA driver version. You can check the release history of `nvidia-ml-py` at [nvidia-ml-py's Release History](https://pypi.org/project/nvidia-ml-py/11.450.51/#history), and install the compatible version manually by: +**IMPORTANT:** `pip` will install `nvidia-ml-py>=11.450.51,<=11.495.46` as a dependency for `nvitop`. Please verify whether the `nvidia-ml-py` package is compatible with your NVIDIA driver version. You can check the release history of `nvidia-ml-py` at [nvidia-ml-py's Release History](https://pypi.org/project/nvidia-ml-py/11.495.46/#history), and install the compatible version manually by: ```bash -pip3 install --no-dependencies nvidia-ml-py==xx.yyy.zzz +pip3 install --no-dependencies 'nvidia-ml-py==xx.yyy.zzz' ``` Since `nvidia-ml-py>=11.450.129`, the definition of `nvmlProcessInfo_t` has introduced two new fields `gpuInstanceId` and `computeInstanceId` (`GI ID` and `CI ID` in newer `nvidia-smi`) which are incompatible with some old NVIDIA drivers. `nvitop` may not display the processes correctly due to this incompatibility. 
+You can specify the version of `nvidia-ml-py` while installing `nvitop` as: + +```bash +pip3 install 'nvitop[pynvml-11.450.51]' # or 'nvitop[cuda10]' +``` + ## Usage ### Device and Process Status diff --git a/docs/source/apis/index.rst b/docs/source/apis/index.rst index 453d9e9..20367c7 100644 --- a/docs/source/apis/index.rst +++ b/docs/source/apis/index.rst @@ -15,6 +15,12 @@ Subpackages core/utils callbacks +.. automodule:: nvitop.version + :members: + :undoc-members: + :show-inheritance: + :member-order: bysource + Module contents --------------- diff --git a/nvitop/cli.py b/nvitop/cli.py index 9459d9b..1833f22 100644 --- a/nvitop/cli.py +++ b/nvitop/cli.py @@ -208,10 +208,12 @@ def main(): # pylint: disable=too-many-branches,too-many-statements,too-many-lo top.destroy() if len(nvml.UNKNOWN_FUNCTIONS) > 0: - messages.append('ERROR: A FunctionNotFound error occurred while calling:') - if len(nvml.UNKNOWN_FUNCTIONS) > 1: - messages[-1] = messages[-1].replace('A FunctionNotFound error', 'Some FunctionNotFound errors') - messages.extend([ + unknown_function_messages = [ + 'ERROR: Some FunctionNotFound errors occurred while calling:' + if len(nvml.UNKNOWN_FUNCTIONS) > 1 + else 'ERROR: A FunctionNotFound error occurred while calling:' + ] + unknown_function_messages.extend([ *list(map(' nvmlQuery({.__name__!r}, *args, **kwargs)'.format, nvml.UNKNOWN_FUNCTIONS)), ('Please verify whether the `{0}` package is compatible with your NVIDIA driver version.\n' 'You can check the release history of `{0}` and install the compatible version manually.\n' @@ -220,6 +222,20 @@ def main(): # pylint: disable=too-many-branches,too-many-statements,too-many-lo colored('https://github.com/XuehaiPan/nvitop#installation', attrs=('underline',)) ) ]) + message = '\n'.join(unknown_function_messages) + if ( + 'nvmlDeviceGetComputeRunningProcesses' in message + or 'nvmlDeviceGetGraphicsRunningProcesses' in message + and Device.cuda_version().startswith('10.') + ): + message = '\n'.join(( + 
message, '', + 'You are using CUDA 10.x driver (yours is: @VERSION@) which is too old. Please contact', + 'your system admin to update the NVIDIA driver, or reinstall `nvitop` using:', + ' pip3 install "nvitop[cuda10]"' + )).replace('@VERSION@', Device.driver_version()) + messages.append(message) + if len(messages) > 0: for message in messages: if message.startswith('ERROR:'): diff --git a/nvitop/core/libnvml.py b/nvitop/core/libnvml.py index 36daa13..243b3f6 100644 --- a/nvitop/core/libnvml.py +++ b/nvitop/core/libnvml.py @@ -28,7 +28,7 @@ class libnvml: """Base exception class for NVML query errors.""" LOGGER = logging.getLogger('NVML') - UNKNOWN_FUNCTIONS = set() + UNKNOWN_FUNCTIONS = {} VERSIONED_PATTERN = re.compile(r'^(?P\w+)(?P_v(\d)+)$') c_nvmlDevice_t = pynvml.c_nvmlDevice_t @@ -226,11 +226,11 @@ class libnvml: raise nvml.NVMLError_FunctionNotFound from e retval = func(*args, **kwargs) - except nvml.NVMLError_FunctionNotFound: # pylint: disable=no-member + except nvml.NVMLError_FunctionNotFound as e: # pylint: disable=no-member if not ignore_function_not_found: with self._lock: if func not in self.UNKNOWN_FUNCTIONS: - self.UNKNOWN_FUNCTIONS.add(func) + self.UNKNOWN_FUNCTIONS[func] = e self.LOGGER.error( 'ERROR: A FunctionNotFound error occurred while calling %s.\n' 'Please verify whether the `nvidia-ml-py` package is ' diff --git a/nvitop/version.py b/nvitop/version.py index 161fe2e..9b48838 100644 --- a/nvitop/version.py +++ b/nvitop/version.py @@ -22,3 +22,42 @@ if not __release__: ).strip().lstrip('v').replace('-', '+', 1).replace('-', '.') except (OSError, subprocess.CalledProcessError): pass + + +# The package `nvidia-ml-py` is not backward compatible over releases. This may +# cause problems with Old versions of NVIDIA drivers. +# The ideal solution is to let the user install the best-fit version of `nvidia-ml-py`. 
+PYNVML_VERSION_CANDIDATES = [ + '11.450.51', # the last version that supports the R430 driver (CUDA 10.x) + '11.450.129', # requires at least the R450 driver + '11.460.79', + '11.470.66', + '11.495.46', +] +"""The list of supported ``nvidia-ml-py`` versions. +See also: `nvidia-ml-py's Release History `_. + +To install ``nvitop`` with a specific version of ``nvidia-ml-py``, use ``nvitop[pynvml-xx.yyy.zzz]``, for example: + +.. code:: bash + + pip3 install 'nvitop[pynvml-11.450.51]' + +or + +.. code:: bash + + pip3 install nvitop nvidia-ml-py==11.450.51 + +Note: + The package ``nvidia-ml-py`` is not backward compatible across releases. This may cause problems + such as *"Function Not Found"* errors with old versions of NVIDIA drivers (e.g. the NVIDIA R430 + driver on Ubuntu 16.04 LTS). + The ideal solution is to let the user install the best-fit version of ``nvidia-ml-py``. + + ``nvidia-ml-py==11.450.51`` is the last version that supports the NVIDIA R430 driver (CUDA 10.x). + Since ``nvidia-ml-py>=11.450.129``, the definition of struct ``nvmlProcessInfo_t`` has introduced + two new fields ``gpuInstanceId`` and ``computeInstanceId`` (GI ID and CI ID in newer ``nvidia-smi``) + which are incompatible with some old NVIDIA drivers. ``nvitop`` may not display the processes + correctly due to this incompatibility. 
+""" diff --git a/requirements.txt b/requirements.txt index 68e918b..4ae7310 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -nvidia-ml-py == 11.450.51 +nvidia-ml-py >= 11.450.51, < 11.500.00 psutil >= 5.6.6 cachetools >= 1.0.1 termcolor >= 1.0.0 diff --git a/setup.cfg b/setup.cfg index 89ad665..377deda 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,7 +2,8 @@ long_description = file: README.md long_description_content_type = text/markdown license = GPLv3 -license_file = LICENSE +license_files = + LICENSE url = https://github.com/XuehaiPan/nvitop project_urls = @@ -37,7 +38,7 @@ classifiers = [options] packages = find: install_requires = - nvidia-ml-py == 11.450.51 + nvidia-ml-py >= 11.450.51, < 11.500.00 psutil >= 5.6.6 cachetools >= 1.0.1 termcolor >= 1.0.0 diff --git a/setup.py b/setup.py index a50a9ef..39b587a 100755 --- a/setup.py +++ b/setup.py @@ -28,12 +28,28 @@ if not version.__release__: encoding='UTF-8') +# To install `nvitop` with specific version of `nvidia-ml-py`, use: +# +# pip install nvidia-ml-py==xx.yyy.zz nvitop +# +# or +# +# pip install 'nvitop[pynvml-xx.yyy.zz]' +# setup( name='nvitop', version=version.__version__, description=version.__doc__, author=version.__author__, author_email=version.__email__, + extras_require={ + 'cuda10': ['nvidia-ml-py == 11.450.51'], + **{ + # The identifier could not start with numbers, add a prefix `pynvml-` + 'pynvml-{}'.format(pynvml): ['nvidia-ml-py == {}'.format(pynvml)] + for pynvml in version.PYNVML_VERSION_CANDIDATES + } + } )