# nvitop/nvitop/api/libnvml.py

# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
#
# Copyright 2021-2023 Xuehai Pan. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for the NVML Python bindings (`nvidia-ml-py <https://pypi.org/project/nvidia-ml-py>`_)."""

# pylint: disable=invalid-name

from __future__ import annotations

import atexit as _atexit
import ctypes as _ctypes
import inspect as _inspect
import logging as _logging
import os as _os
import re as _re
import sys as _sys
import threading as _threading
import time as _time
from types import FunctionType as _FunctionType
from types import ModuleType as _ModuleType
from typing import TYPE_CHECKING as _TYPE_CHECKING
from typing import Any as _Any
from typing import Callable as _Callable
from typing import ClassVar as _ClassVar

# Python Bindings for the NVIDIA Management Library (NVML)
# https://pypi.org/project/nvidia-ml-py
import pynvml as _pynvml
from pynvml import *  # noqa: F403 # pylint: disable=wildcard-import,unused-wildcard-import
from pynvml import nvmlDeviceGetPciInfo  # appease mypy # noqa: F401 # pylint: disable=unused-import

from nvitop.api.utils import NA, UINT_MAX, ULONGLONG_MAX, NaType
from nvitop.api.utils import colored as __colored


if _TYPE_CHECKING:
    from typing_extensions import TypeAlias as _TypeAlias  # Python 3.10+


# Names exported explicitly by this module. The list is extended further below with the error
# classes, constants, and functions discovered in module `pynvml`.
__all__ = [  # extended below with members loaded from `pynvml`
    'NA',
    'UINT_MAX',
    'ULONGLONG_MAX',
    'nvmlCheckReturn',
    'nvmlQuery',
    'nvmlQueryFieldValues',
    'nvmlInit',
    'nvmlInitWithFlags',
    'nvmlShutdown',
    'NVMLError',
]


# Fail fast at import time when the imported `pynvml` module does not expose a callable
# `nvmlInitWithFlags`, which this module relies on to initialize the NVML context.
if not callable(getattr(_pynvml, 'nvmlInitWithFlags', None)):
    raise ImportError(
        'Your installed package `nvidia-ml-py` is corrupted. Please reinstall package '
        '`nvidia-ml-py` via `pip3 install --force-reinstall nvidia-ml-py nvitop`.',
    )


# Members from `pynvml` ############################################################################

# Re-export the base exception class and the error-code-to-class mapper from `pynvml` under typed
# names, and attach documentation strings to the `pynvml` objects themselves (the assignments
# below mutate the objects owned by `pynvml`, not copies).
NVMLError: type[_pynvml.NVMLError] = _pynvml.NVMLError
NVMLError.__doc__ = """Base exception class for NVML query errors."""
NVMLError.__new__.__doc__ = """Map value to a proper subclass of :class:`NVMLError`."""
nvmlExceptionClass: _Callable[[int], type[_pynvml.NVMLError]] = _pynvml.nvmlExceptionClass
nvmlExceptionClass.__doc__ = """Map value to a proper subclass of :class:`NVMLError`."""

# Load members from module `pynvml` and register them in `__all__` and globals.
# All helper names below are module-level temporaries; they are deleted once the module docstring
# has been assembled.
_vars_pynvml = vars(_pynvml)
_name = _attr = None
_errcode_to_name = {}  # value of an `NVML_ERROR_*` constant -> name of that constant
_const_names = []  # names of all `NVML_*` constants, in the order they are registered
_errcode_to_string = NVMLError._errcode_to_string  # pylint: disable=protected-access

# 1. Put error classes in `__all__` first
for _name, _attr in _vars_pynvml.items():
    # These three functions are redefined by this module and are already listed in `__all__`.
    if _name in ('nvmlInit', 'nvmlInitWithFlags', 'nvmlShutdown'):
        continue
    if _name.startswith(('NVML_ERROR_', 'NVMLError_')):
        __all__.append(_name)
        if _name.startswith('NVML_ERROR_'):
            # Only the constants (not the exception classes) take part in the reverse mapping.
            _errcode_to_name[_attr] = _name
            _const_names.append(_name)

# 2. Then the remaining members
for _name, _attr in _vars_pynvml.items():
    if _name in ('nvmlInit', 'nvmlInitWithFlags', 'nvmlShutdown'):
        continue
    # Either a non-error `NVML_*` constant, or a plain Python function whose name starts with
    # `nvml`.
    if (_name.startswith('NVML_') and not _name.startswith('NVML_ERROR_')) or (
        _name.startswith('nvml') and isinstance(_attr, _FunctionType)
    ):
        __all__.append(_name)
        if _name.startswith('NVML_'):
            _const_names.append(_name)

# 3. Add docstring to exception classes
_errcode = _reason = _subclass = None
for _errcode, _reason in _errcode_to_string.items():
    _subclass = nvmlExceptionClass(_errcode)
    # Resulting docstring has the shape: "<reason>. Code: :data:`<constant name>` (<code>)".
    _subclass.__doc__ = '{}. Code: :data:`{}` ({})'.format(
        _reason.rstrip('.'),
        _errcode_to_name[_errcode],
        _errcode,
    )

# 4. Add undocumented constants into module docstring
# One Sphinx ``.. data::`` entry is generated per registered constant; error-code constants
# additionally get their human-readable reason and a cross-reference to the exception class.
_data_docs = []
_sphinx_doc = None
for _name in _const_names:
    _attr = _vars_pynvml[_name]
    _sphinx_doc = f"""
.. data:: {_name}
    :type: {_attr.__class__.__name__}
    :value: {_attr!r}
"""
    if _name.startswith('NVML_ERROR_') and _attr in _errcode_to_string:
        _reason = _errcode_to_string[_attr]
        _sphinx_doc += """
    {}. See also class :class:`NVMLError` and :class:`{}`.
""".format(_reason.rstrip('.'), nvmlExceptionClass(_attr).__name__)  # fmt: skip
    _data_docs.append(_sphinx_doc.strip())
# The module docstring is `None` when docstrings are stripped (e.g. ``python -OO``). Appending to
# `None` would raise `TypeError` and make the module unimportable, so only extend a docstring
# that actually exists.
if __doc__ is not None:
    __doc__ += """

---------

Constants
^^^^^^^^^

{}

---------

Functions and Exceptions
^^^^^^^^^^^^^^^^^^^^^^^^

.. function:: __enter__() -> libnvml

    Entry of the context manager for ``with`` statement.

.. function:: __exit__(*args, **kwargs) -> None

    Shutdown the NVML context in the context manager for ``with`` statement.

""".format('\n\n'.join(_data_docs))  # fmt: skip

# Drop the module-level temporaries used while loading members from `pynvml`.
del (
    _name,
    _attr,
    _vars_pynvml,
    _errcode,
    _reason,
    _subclass,
    _errcode_to_name,
    _errcode_to_string,
    _const_names,
    _data_docs,
    _sphinx_doc,
)


# 5. Add explicit references to appease linters
# Every name below is re-bound explicitly from `pynvml` with an annotation. The annotations are
# never evaluated at runtime (`from __future__ import annotations` is in effect), which is why
# `_TypeAlias`, imported only under `TYPE_CHECKING`, can be used here.
# pylint: disable=no-member
c_nvmlDevice_t: _TypeAlias = _pynvml.c_nvmlDevice_t
c_nvmlFieldValue_t: _TypeAlias = _pynvml.c_nvmlFieldValue_t
NVML_SUCCESS: int = _pynvml.NVML_SUCCESS
NVML_ERROR_INSUFFICIENT_SIZE: int = _pynvml.NVML_ERROR_INSUFFICIENT_SIZE
NVMLError_FunctionNotFound: _TypeAlias = _pynvml.NVMLError_FunctionNotFound
NVMLError_GpuIsLost: _TypeAlias = _pynvml.NVMLError_GpuIsLost
NVMLError_InvalidArgument: _TypeAlias = _pynvml.NVMLError_InvalidArgument
NVMLError_LibraryNotFound: _TypeAlias = _pynvml.NVMLError_LibraryNotFound
NVMLError_NoPermission: _TypeAlias = _pynvml.NVMLError_NoPermission
NVMLError_NotFound: _TypeAlias = _pynvml.NVMLError_NotFound
NVMLError_NotSupported: _TypeAlias = _pynvml.NVMLError_NotSupported
NVMLError_Unknown: _TypeAlias = _pynvml.NVMLError_Unknown
NVML_CLOCK_GRAPHICS: int = _pynvml.NVML_CLOCK_GRAPHICS
NVML_CLOCK_SM: int = _pynvml.NVML_CLOCK_SM
NVML_CLOCK_MEM: int = _pynvml.NVML_CLOCK_MEM
NVML_CLOCK_VIDEO: int = _pynvml.NVML_CLOCK_VIDEO
NVML_TEMPERATURE_GPU: int = _pynvml.NVML_TEMPERATURE_GPU
NVML_DRIVER_WDDM: int = _pynvml.NVML_DRIVER_WDDM
NVML_DRIVER_WDM: int = _pynvml.NVML_DRIVER_WDM
NVML_MEMORY_ERROR_TYPE_UNCORRECTED: int = _pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED
NVML_VOLATILE_ECC: int = _pynvml.NVML_VOLATILE_ECC
NVML_COMPUTEMODE_DEFAULT: int = _pynvml.NVML_COMPUTEMODE_DEFAULT
NVML_COMPUTEMODE_EXCLUSIVE_THREAD: int = _pynvml.NVML_COMPUTEMODE_EXCLUSIVE_THREAD
NVML_COMPUTEMODE_PROHIBITED: int = _pynvml.NVML_COMPUTEMODE_PROHIBITED
NVML_COMPUTEMODE_EXCLUSIVE_PROCESS: int = _pynvml.NVML_COMPUTEMODE_EXCLUSIVE_PROCESS
NVML_PCIE_UTIL_TX_BYTES: int = _pynvml.NVML_PCIE_UTIL_TX_BYTES
NVML_PCIE_UTIL_RX_BYTES: int = _pynvml.NVML_PCIE_UTIL_RX_BYTES
NVML_NVLINK_MAX_LINKS: int = _pynvml.NVML_NVLINK_MAX_LINKS
NVML_FI_DEV_NVLINK_LINK_COUNT: int = _pynvml.NVML_FI_DEV_NVLINK_LINK_COUNT
NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX: int = _pynvml.NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX
NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX: int = _pynvml.NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX
NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX: int = _pynvml.NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX
NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX: int = _pynvml.NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX
NVML_VALUE_TYPE_DOUBLE: int = _pynvml.NVML_VALUE_TYPE_DOUBLE
NVML_VALUE_TYPE_UNSIGNED_INT: int = _pynvml.NVML_VALUE_TYPE_UNSIGNED_INT
NVML_VALUE_TYPE_UNSIGNED_LONG: int = _pynvml.NVML_VALUE_TYPE_UNSIGNED_LONG
NVML_VALUE_TYPE_UNSIGNED_LONG_LONG: int = _pynvml.NVML_VALUE_TYPE_UNSIGNED_LONG_LONG
NVML_VALUE_TYPE_SIGNED_LONG_LONG: int = _pynvml.NVML_VALUE_TYPE_SIGNED_LONG_LONG
NVML_VALUE_TYPE_SIGNED_INT: int = _pynvml.NVML_VALUE_TYPE_SIGNED_INT
# pylint: enable=no-member

# New members in `libnvml` #########################################################################

# State of the NVML context managed by this module. All three names are only read or written
# while holding `__lock`.
__flags: list[int] = []  # stack of flags: pushed on successful init, popped on shutdown
__initialized: bool = False
__lock: _threading.Lock = _threading.Lock()

# Module logger. The level is taken from environment variable `LOGLEVEL` (default: `WARNING`);
# a value that `setLevel` rejects is ignored and the logger keeps its previous level.
LOGGER: _logging.Logger = _logging.getLogger(__name__)
try:
    LOGGER.setLevel(_os.getenv('LOGLEVEL', default='WARNING').upper())
except (ValueError, TypeError):
    pass
# Only attach handlers when nobody else configured logging and debug output was requested.
if not LOGGER.hasHandlers() and LOGGER.isEnabledFor(_logging.DEBUG):
    _formatter = _logging.Formatter(
        '[%(levelname)s] %(asctime)s %(name)s::%(funcName)s: %(message)s',
    )
    _stream_handler = _logging.StreamHandler()
    _stream_handler.setFormatter(_formatter)
    LOGGER.addHandler(_stream_handler)
    try:
        # The log file is created in the current working directory, which may not be writable.
        _file_handler = _logging.FileHandler('nvitop.log')
    except OSError as _ex:
        # Debug logging is a best-effort aid: never make the module unimportable because the
        # log file cannot be opened. Keep logging to the stream handler only.
        _file_handler = None
        LOGGER.warning('Cannot open log file `nvitop.log`: %s', _ex)
    else:
        _file_handler.setFormatter(_formatter)
        LOGGER.addHandler(_file_handler)
    del _formatter, _stream_handler, _file_handler

# Functions for which `nvmlQuery` already reported a "function not found" error, keyed by an
# identifier of the function. The cache stops growing at `UNKNOWN_FUNCTIONS_CACHE_SIZE` entries.
UNKNOWN_FUNCTIONS: dict[str, tuple[_Callable | str, NVMLError_FunctionNotFound]] = {}
UNKNOWN_FUNCTIONS_CACHE_SIZE: int = 1024
# Matches a name that ends with a version suffix, e.g. `<name>_v2`.
VERSIONED_PATTERN: _re.Pattern = _re.compile(r'^(?P<name>\w+)(?P<suffix>_v(\d)+)$')


def _lazy_init() -> None:
    """Initialize the NVML context on first use.

    Does nothing when the module-level ``__initialized`` flag is already set. Otherwise calls
    :func:`nvmlInit` and registers :func:`nvmlShutdown` to run at interpreter exit.

    Raises:
        NVMLError_LibraryNotFound:
            If cannot find the NVML library, usually the NVIDIA driver is not installed.
        NVMLError_DriverNotLoaded:
            If NVIDIA driver is not loaded.
        NVMLError_LibRmVersionMismatch:
            If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
            driver without reloading the kernel module.
        AttributeError:
            If cannot find function :func:`pynvml.nvmlInitWithFlags`, usually the :mod:`pynvml` module
            is overridden by other modules. Need to reinstall package ``nvidia-ml-py``.
    """
    # Only the flag read happens under the lock; the lock is released before `nvmlInit()` runs
    # because `nvmlInitWithFlags` acquires the same (non-reentrant) lock itself.
    with __lock:
        already_initialized = __initialized

    if not already_initialized:
        nvmlInit()
        _atexit.register(nvmlShutdown)


def nvmlInit() -> None:  # pylint: disable=function-redefined
    """Initialize the NVML context using the default flags value ``0``.

    This is a shorthand for ``nvmlInitWithFlags(0)``.

    Raises:
        NVMLError_LibraryNotFound:
            If cannot find the NVML library, usually the NVIDIA driver is not installed.
        NVMLError_DriverNotLoaded:
            If NVIDIA driver is not loaded.
        NVMLError_LibRmVersionMismatch:
            If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
            driver without reloading the kernel module.
        AttributeError:
            If cannot find function :func:`pynvml.nvmlInitWithFlags`, usually the :mod:`pynvml` module
            is overridden by other modules. Need to reinstall package ``nvidia-ml-py``.
    """
    default_flags = 0
    nvmlInitWithFlags(default_flags)


def nvmlInitWithFlags(flags: int) -> None:  # pylint: disable=function-redefined
    """Initialize the NVML context with the given flags.

    If ``flags`` equals the flags of the most recent successful initialization that has not been
    shut down yet, the context is only marked as initialized and :mod:`pynvml` is not called
    again. Otherwise :func:`pynvml.nvmlInitWithFlags` is called and, on success, ``flags`` is
    pushed onto the module-level flags stack.

    Args:
        flags (int):
            The flags forwarded to :func:`pynvml.nvmlInitWithFlags`.

    Raises:
        NVMLError_LibraryNotFound:
            If cannot find the NVML library, usually the NVIDIA driver is not installed.
        NVMLError_DriverNotLoaded:
            If NVIDIA driver is not loaded.
        NVMLError_LibRmVersionMismatch:
            If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
            driver without reloading the kernel module.
        AttributeError:
            If cannot find function :func:`pynvml.nvmlInitWithFlags`, usually the :mod:`pynvml` module
            is overridden by other modules. Need to reinstall package ``nvidia-ml-py``.
    """
    global __flags, __initialized  # pylint: disable=global-statement,global-variable-not-assigned

    with __lock:
        if __flags and flags == __flags[-1]:
            # Already initialized with the very same flags: nothing to load.
            __initialized = True
            return

    try:
        _pynvml.nvmlInitWithFlags(flags)
    except NVMLError_LibraryNotFound:
        # (text, color, attributes) for every fragment of the message to highlight.
        highlights = (
            ('FATAL ERROR:', 'red', ('bold',)),
            ('HINT:', 'yellow', ('bold',)),
            ('https://www.nvidia.com/Download/index.aspx', None, ('underline',)),
            ('https://developer.nvidia.com/cuda-downloads', None, ('underline',)),
            ('https://docs.nvidia.com/deploy/nvml-api', None, ('underline',)),
        )
        message = (
            'FATAL ERROR: NVIDIA Management Library (NVML) not found.\n'
            'HINT: The NVIDIA Management Library ships with the NVIDIA display driver (available at\n'
            '      https://www.nvidia.com/Download/index.aspx), or can be downloaded as part of the\n'
            '      NVIDIA CUDA Toolkit (available at https://developer.nvidia.com/cuda-downloads).\n'
            '      The lists of OS platforms and NVIDIA-GPUs supported by the NVML library can be\n'
            '      found in the NVML API Reference at https://docs.nvidia.com/deploy/nvml-api.'
        )
        for text, color, attrs in highlights:
            # Every occurrence of the fragment is highlighted.
            message = message.replace(text, __colored(text, color=color, attrs=attrs))

        LOGGER.critical(message)
        raise
    except AttributeError:
        highlights = (
            ('FATAL ERROR:', 'red', ('bold',)),
            ('nvidia-ml-py', None, ('bold',)),
            ('pynvml', None, ('bold',)),
            ('nvitop', None, ('bold',)),
        )
        message = (
            'FATAL ERROR: The dependency package `nvidia-ml-py` is corrupted. You may have installed\n'
            '             other packages overriding the module `pynvml`.\n'
            'Please reinstall `nvitop` with command:\n'
            '    python3 -m pip install --force-reinstall nvitop'
        )
        for text, color, attrs in highlights:
            # Only the first occurrence is highlighted here (note the count argument).
            message = message.replace(text, __colored(text, color=color, attrs=attrs), 1)

        LOGGER.critical(message)
        raise
    else:
        # Record the successful initialization.
        with __lock:
            __flags.append(flags)
            __initialized = True


def nvmlShutdown() -> None:  # pylint: disable=function-redefined
    """Shut down the NVML context.

    Calls :func:`pynvml.nvmlShutdown` first. If that succeeds, the most recently recorded flags
    (if any) are dropped and the module is considered initialized only while recorded flags
    remain.

    Raises:
        NVMLError_LibraryNotFound:
            If cannot find the NVML library, usually the NVIDIA driver is not installed.
        NVMLError_DriverNotLoaded:
            If NVIDIA driver is not loaded.
        NVMLError_LibRmVersionMismatch:
            If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
            driver without reloading the kernel module.
        NVMLError_Uninitialized:
            If NVML was not first initialized with :func:`nvmlInit`.
    """
    global __flags, __initialized  # pylint: disable=global-statement,global-variable-not-assigned

    _pynvml.nvmlShutdown()
    with __lock:
        if __flags:
            # Undo the bookkeeping of the matching initialization.
            __flags.pop()
        __initialized = bool(__flags)


def nvmlQuery(
    func: _Callable[..., _Any] | str,
    *args: _Any,
    default: _Any = NA,
    ignore_errors: bool = True,
    ignore_function_not_found: bool = False,
    **kwargs: _Any,
) -> _Any:
    """Call a function with the given arguments from NVML.

    The NVML context will be automatically initialized.

    Args:
        func (Union[Callable[..., Any], str]):
            The function to call. If it is given by string, lookup for the function first from
            this module (which re-exports the members of module :mod:`pynvml`).
        default (Any):
            The default value if the query fails.
        ignore_errors (bool):
            Whether to ignore errors and return the default value.
        ignore_function_not_found (bool):
            Whether to ignore function not found errors and return the default value. If set to
            :data:`False`, an error message will be logged to the logger (once per function).
        *args:
            Positional arguments to pass to the query function.
        **kwargs:
            Keyword arguments to pass to the query function.

    Returns:
        The return value of the query function, with :class:`bytes` decoded as UTF-8 into
        :class:`str`; or ``default`` if the query fails and the error is ignored.

    Raises:
        NVMLError_LibraryNotFound:
            If cannot find the NVML library, usually the NVIDIA driver is not installed.
        NVMLError_DriverNotLoaded:
            If NVIDIA driver is not loaded.
        NVMLError_LibRmVersionMismatch:
            If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA
            driver without reloading the kernel module.
        NVMLError_FunctionNotFound:
            If the function is not found, usually the installed ``nvidia-ml-py`` is not compatible
            with the installed NVIDIA driver.
        NVMLError_NotSupported:
            If the function is not supported by the driver or the device.
        NVMLError_InvalidArgument:
            If passed with an invalid argument.
    """
    global UNKNOWN_FUNCTIONS  # pylint: disable=global-statement,global-variable-not-assigned

    _lazy_init()

    try:
        if isinstance(func, str):
            try:
                func = getattr(__modself, func)
            except AttributeError as e1:
                # An unknown name is reported the same way as a missing NVML function.
                raise NVMLError_FunctionNotFound from e1

        retval = func(*args, **kwargs)  # type: ignore[operator]
    except NVMLError_FunctionNotFound as e2:
        if not ignore_function_not_found:
            # Build a hashable identifier for the function so the error is logged only once.
            if isinstance(func, str):
                identifier = func
            elif getattr(func, '__name__', None) == '<lambda>':
                # All lambdas share the same name, so use their source text to tell them apart.
                try:
                    identifier = _inspect.getsource(func)
                except (OSError, TypeError):
                    # The source is not available (e.g. lambda defined in an interactive
                    # session). Do not let that mask the original NVML error.
                    identifier = repr(func)
            else:
                # Also covers callables without `__name__`, e.g. `functools.partial` objects.
                identifier = repr(func)
            with __lock:
                if (
                    identifier not in UNKNOWN_FUNCTIONS
                    and len(UNKNOWN_FUNCTIONS) < UNKNOWN_FUNCTIONS_CACHE_SIZE
                ):
                    UNKNOWN_FUNCTIONS[identifier] = (func, e2)
                    LOGGER.error(
                        (
                            'ERROR: A FunctionNotFound error occurred while calling %s.\n'
                            'Please verify whether the `nvidia-ml-py` package is '
                            'compatible with your NVIDIA driver version.'
                        ),
                        f'nvmlQuery({func!r}, *args, **kwargs)',
                    )
        if ignore_errors or ignore_function_not_found:
            return default
        raise
    except NVMLError:
        if ignore_errors:
            return default
        raise

    if isinstance(retval, bytes):
        retval = retval.decode('utf-8')
    return retval


def nvmlQueryFieldValues(
    handle: c_nvmlDevice_t,
    field_ids: list[int | tuple[int, int]],
) -> list[tuple[float | int | NaType, int]]:
    """Query multiple field values from NVML.

    Request values for a list of fields for a device. This API allows multiple fields to be queried
    at once. If any of the underlying fieldIds are populated by the same driver call, the results
    for those field IDs will be populated from a single call rather than making a driver call for
    each fieldId.

    Returns:
        A list with one ``(value, timestamp)`` pair per requested field. A field that could not
        be read, or whose value type is not recognized, gets :const:`nvitop.NA` as its value.
        When the read failed, the timestamp is the current wall-clock time in microseconds
        instead of the timestamp reported by NVML.

    Raises:
        NVMLError_InvalidArgument:
            If device or field_ids is invalid.
    """
    field_values = nvmlQuery('nvmlDeviceGetFieldValues', handle, field_ids)

    if not nvmlCheckReturn(field_values):
        # The whole query failed: report every field as not available at the current time.
        now = _time.time_ns() // 1000
        return [(NA, now) for _ in field_ids]

    # Value type -> name of the member of the value union that holds the value.
    union_members = (
        (NVML_VALUE_TYPE_DOUBLE, 'dVal'),
        (NVML_VALUE_TYPE_UNSIGNED_INT, 'uiVal'),
        (NVML_VALUE_TYPE_UNSIGNED_LONG, 'ulVal'),
        (NVML_VALUE_TYPE_UNSIGNED_LONG_LONG, 'ullVal'),
        (NVML_VALUE_TYPE_SIGNED_LONG_LONG, 'llVal'),
        (NVML_VALUE_TYPE_SIGNED_INT, 'iVal'),
    )

    results: list[tuple[float | int | NaType, int]] = []
    for entry in field_values:
        if entry.nvmlReturn != NVML_SUCCESS:
            # This particular field failed.
            results.append((NA, _time.time_ns() // 1000))
            continue

        value = NA  # kept when the value type is not one of the known types
        for value_type, member in union_members:
            if entry.valueType == value_type:
                value = getattr(entry.value, member)
                break
        results.append((value, entry.timestamp))
    return results


def nvmlCheckReturn(
    retval: _Any,
    types: type | tuple[type, ...] | None = None,
) -> bool:
    """Check whether the return value is not :const:`nvitop.NA` and is one of the given types.

    Args:
        retval (Any):
            The value to check, typically the result of :func:`nvmlQuery`.
        types (Optional[Union[type, Tuple[type, ...]]]):
            The accepted type(s). If :data:`None`, no type check is performed.
    """
    is_available = retval != NA
    if types is not None:
        return is_available and isinstance(retval, types)
    return is_available


# Patch layers for backward compatibility ##########################################################
# The patches below call the private helper `pynvml._nvmlGetFunctionPointer` directly. If the
# imported `pynvml` module does not provide it as a callable, the installation is treated as
# corrupted and the patches are skipped.
_pynvml_installation_corrupted: bool = not callable(
    getattr(_pynvml, '_nvmlGetFunctionPointer', None),
)

# Patch function `nvmlDeviceGet{Compute,Graphics,MPSCompute}RunningProcesses`
if not _pynvml_installation_corrupted:
    # pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
    class c_nvmlProcessInfo_v1_t(_pynvml._PrintableStructure):  # pylint: disable=protected-access
        """Process info structure (version 1): process ID and used GPU memory only."""

        # NOTE(review): the order and types of the fields presumably have to match the struct of
        # the NVML C library exactly -- do not reorder.
        _fields_: _ClassVar[list[tuple[str, type]]] = [
            # Process ID
            ('pid', _ctypes.c_uint),
            # Amount of used GPU memory in bytes.
            # Under WDDM, NVML_VALUE_NOT_AVAILABLE is always reported because Windows KMD manages
            # all the memory and not the NVIDIA driver.
            ('usedGpuMemory', _ctypes.c_ulonglong),
        ]
        # Per-field format strings; presumably consumed by `_PrintableStructure` when the
        # structure is rendered as text -- TODO confirm.
        _fmt_: _ClassVar[dict[str, str]] = {
            'usedGpuMemory': '%d B',
        }

    # pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
    class c_nvmlProcessInfo_v2_t(_pynvml._PrintableStructure):  # pylint: disable=protected-access
        """Process info structure (version 2): version 1 plus the MIG instance IDs."""

        # NOTE(review): the order and types of the fields presumably have to match the struct of
        # the NVML C library exactly -- do not reorder.
        _fields_: _ClassVar[list[tuple[str, type]]] = [
            # Process ID
            ('pid', _ctypes.c_uint),
            # Amount of used GPU memory in bytes.
            # Under WDDM, NVML_VALUE_NOT_AVAILABLE is always reported because Windows KMD manages
            # all the memory and not the NVIDIA driver.
            ('usedGpuMemory', _ctypes.c_ulonglong),
            # If MIG is enabled, stores a valid GPU instance ID. gpuInstanceId is set to 0xFFFFFFFF
            # otherwise.
            ('gpuInstanceId', _ctypes.c_uint),
            # If MIG is enabled, stores a valid compute instance ID. computeInstanceId is set to
            # 0xFFFFFFFF otherwise.
            ('computeInstanceId', _ctypes.c_uint),
        ]
        # Per-field format strings; presumably consumed by `_PrintableStructure` when the
        # structure is rendered as text -- TODO confirm.
        _fmt_: _ClassVar[dict[str, str]] = {
            'usedGpuMemory': '%d B',
        }

    # pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
    class c_nvmlProcessInfo_v3_t(_pynvml._PrintableStructure):  # pylint: disable=protected-access
        """Process info structure (version 3): version 2 plus the used protected memory."""

        # NOTE(review): the order and types of the fields presumably have to match the struct of
        # the NVML C library exactly -- do not reorder.
        _fields_: _ClassVar[list[tuple[str, type]]] = [
            # Process ID
            ('pid', _ctypes.c_uint),
            # Amount of used GPU memory in bytes.
            # Under WDDM, NVML_VALUE_NOT_AVAILABLE is always reported because Windows KMD manages
            # all the memory and not the NVIDIA driver.
            ('usedGpuMemory', _ctypes.c_ulonglong),
            # If MIG is enabled, stores a valid GPU instance ID. gpuInstanceId is set to 0xFFFFFFFF
            # otherwise.
            ('gpuInstanceId', _ctypes.c_uint),
            # If MIG is enabled, stores a valid compute instance ID. computeInstanceId is set to
            # 0xFFFFFFFF otherwise.
            ('computeInstanceId', _ctypes.c_uint),
            # Amount of used GPU conf compute protected memory in bytes.
            ('usedGpuCcProtectedMemory', _ctypes.c_ulonglong),
        ]
        # Per-field format strings; presumably consumed by `_PrintableStructure` when the
        # structure is rendered as text -- TODO confirm.
        _fmt_: _ClassVar[dict[str, str]] = {
            'usedGpuMemory': '%d B',
            'usedGpuCcProtectedMemory': '%d B',
        }

    # Cached result of the API probing below: `None` until the probing completed successfully.
    __get_running_processes_version_suffix = None
    # Structure type matching the selected API; replaced together with the suffix above.
    c_nvmlProcessInfo_t = c_nvmlProcessInfo_v3_t

    def __determine_get_running_processes_version_suffix() -> str:
        """Determine which version of the "get running processes" API the NVML library provides.

        The NVML library is probed from the newest to the oldest variant. The result is cached
        in module-level variables, so the probing runs only until it completed once:

        - ``__get_running_processes_version_suffix``: the suffix to append to the function name
          (``'_v3'``, ``'_v2'`` or ``''``).
        - ``c_nvmlProcessInfo_t``: the structure type to use with the selected function.

        The module-level variables are updated only after all probes have completed. If a probe
        raises anything other than :class:`NVMLError_FunctionNotFound`, the exception propagates
        and nothing is cached, so the next call probes again instead of reusing an unverified
        default.

        Returns:
            The version suffix of the function name.
        """
        global __get_running_processes_version_suffix, c_nvmlProcessInfo_t  # pylint: disable=global-statement

        if __get_running_processes_version_suffix is not None:
            return __get_running_processes_version_suffix

        # pylint: disable-next=protected-access,no-member
        _nvmlGetFunctionPointer = _pynvml._nvmlGetFunctionPointer

        def is_available(symbol: str) -> bool:
            """Return whether the NVML library provides the given function."""
            try:
                _nvmlGetFunctionPointer(symbol)
            except NVMLError_FunctionNotFound:
                return False
            return True

        version_suffix = '_v3'
        struct_type = c_nvmlProcessInfo_t  # keep the current type unless a fallback is needed

        # The v3 type struct is selected based on the presence of this function.
        if is_available('nvmlDeviceGetConfComputeMemSizeInfo'):
            LOGGER.debug(
                'NVML get running process version 3 API with v3 type struct is available.',
            )
        else:
            struct_type = c_nvmlProcessInfo_v2_t
            LOGGER.debug(
                'NVML get running process version 3 API with v3 type struct is not available '
                'due to incompatible NVIDIA driver. Fallback to use get running process '
                'version 3 API with v2 type struct.',
            )
            if is_available('nvmlDeviceGetComputeRunningProcesses_v3'):
                LOGGER.debug(
                    'NVML get running process version 3 API with v2 type struct is available.',
                )
            else:
                version_suffix = '_v2'
                LOGGER.debug(
                    'NVML get running process version 3 API with v2 type struct is not '
                    'available due to incompatible NVIDIA driver. Fallback to use get running '
                    'process version 2 API with v2 type struct.',
                )
                if is_available('nvmlDeviceGetComputeRunningProcesses_v2'):
                    LOGGER.debug(
                        'NVML get running process version 2 API with v2 type struct is '
                        'available.',
                    )
                else:
                    struct_type = c_nvmlProcessInfo_v1_t
                    version_suffix = ''
                    LOGGER.debug(
                        'NVML get running process version 2 API with v2 type struct is not '
                        'available due to incompatible NVIDIA driver. Fallback to use get '
                        'running process version 1 API with v1 type struct.',
                    )

        # Commit the result only now that every probe has completed.
        c_nvmlProcessInfo_t = struct_type
        __get_running_processes_version_suffix = version_suffix
        return version_suffix

    def __nvml_device_get_running_processes(
        func: str,
        handle: c_nvmlDevice_t,
    ) -> list[c_nvmlProcessInfo_t]:
        """Shared implementation of :func:`nvmlDeviceGet{Compute,Graphics,MPSCompute}RunningProcesses`.

        Modified from function :func:`pynvml.nvmlDeviceGetComputeRunningProcesses` in package
        `nvidia-ml-py <https://pypi.org/project/nvidia-ml-py>`_.

        Args:
            func (str):
                The unversioned name of the NVML function; the version suffix supported by the
                NVML library is appended automatically.
            handle (c_nvmlDevice_t):
                The device handle to query.

        Returns:
            A list of friendly objects converted from the process info structures. Memory fields
            holding the "not available" marker (``ULONGLONG_MAX``) are replaced with :data:`None`.

        Raises:
            NVMLError:
                If either call into the NVML library returns an unexpected status.
        """
        suffix = __determine_get_running_processes_version_suffix()

        # Probe call without a buffer to learn the number of processes.
        num_processes = _ctypes.c_uint(0)
        # pylint: disable-next=protected-access
        query = _pynvml._nvmlGetFunctionPointer(f'{func}{suffix}')
        status = query(handle, _ctypes.byref(num_processes), None)

        if status == NVML_SUCCESS:
            # The probe call only succeeds when there is nothing to report.
            return []
        if status != NVML_ERROR_INSUFFICIENT_SIZE:
            # Anything but "buffer too small" is a genuine error.
            raise NVMLError(status)

        # Allocate more room than reported in case processes were created in the meantime.
        num_processes.value = num_processes.value * 2 + 5
        buffer = (c_nvmlProcessInfo_t * num_processes.value)()  # type: ignore[operator]

        # Second call fills the buffer and updates the count to the number of entries written.
        status = query(handle, _ctypes.byref(num_processes), buffer)
        _pynvml._nvmlCheckReturn(status)  # pylint: disable=protected-access

        processes = []
        for index in range(num_processes.value):
            # Convert the structure into a plain object whose attributes can be reassigned.
            info = _pynvml.nvmlStructToFriendlyObject(buffer[index])
            if info.usedGpuMemory == ULONGLONG_MAX:
                # Memory usage is not available, e.g. under WDDM on Windows.
                info.usedGpuMemory = None
            # The protected-memory field only exists in the v3 type struct.
            if getattr(info, 'usedGpuCcProtectedMemory', None) == ULONGLONG_MAX:
                info.usedGpuCcProtectedMemory = None
            processes.append(info)

        return processes

    def nvmlDeviceGetComputeRunningProcesses(  # pylint: disable=function-redefined
        handle: c_nvmlDevice_t,
    ) -> list[c_nvmlProcessInfo_t]:
        """Return information about the processes holding a compute context on a device.

        Note:
            - In MIG mode, if device handle is provided, the API returns aggregate information, only
              if the caller has appropriate privileges. Per-instance information can be queried by
              using specific MIG device handles.

        Raises:
            NVMLError_Uninitialized:
                If NVML was not first initialized with :func:`nvmlInit`.
            NVMLError_NoPermission:
                If the user doesn't have permission to perform this operation.
            NVMLError_InvalidArgument:
                If device is invalid.
            NVMLError_GpuIsLost:
                If the target GPU has fallen off the bus or is otherwise inaccessible.
            NVMLError_Unknown:
                On any unexpected error.
        """
        # Unversioned name; the shared helper appends the supported version suffix.
        api_name = 'nvmlDeviceGetComputeRunningProcesses'
        return __nvml_device_get_running_processes(api_name, handle)

    def nvmlDeviceGetGraphicsRunningProcesses(  # pylint: disable=function-redefined
        handle: c_nvmlDevice_t,
    ) -> list[c_nvmlProcessInfo_t]:
        """Return information about the processes holding a graphics context on a device.

        Note:
            - In MIG mode, if device handle is provided, the API returns aggregate information, only
              if the caller has appropriate privileges. Per-instance information can be queried by
              using specific MIG device handles.

        Raises:
            NVMLError_Uninitialized:
                If NVML was not first initialized with :func:`nvmlInit`.
            NVMLError_NoPermission:
                If the user doesn't have permission to perform this operation.
            NVMLError_InvalidArgument:
                If device is invalid.
            NVMLError_GpuIsLost:
                If the target GPU has fallen off the bus or is otherwise inaccessible.
            NVMLError_Unknown:
                On any unexpected error.
        """
        # Unversioned name; the shared helper appends the supported version suffix.
        api_name = 'nvmlDeviceGetGraphicsRunningProcesses'
        return __nvml_device_get_running_processes(api_name, handle)

    def nvmlDeviceGetMPSComputeRunningProcesses(  # pylint: disable=function-redefined
        handle: c_nvmlDevice_t,
    ) -> list[c_nvmlProcessInfo_t]:
        """Return information about the processes holding an MPS compute context on a device.

        Note:
            - In MIG mode, if device handle is provided, the API returns aggregate information, only
              if the caller has appropriate privileges. Per-instance information can be queried by
              using specific MIG device handles.

        Raises:
            NVMLError_Uninitialized:
                If NVML was not first initialized with :func:`nvmlInit`.
            NVMLError_NoPermission:
                If the user doesn't have permission to perform this operation.
            NVMLError_InvalidArgument:
                If device is invalid.
            NVMLError_GpuIsLost:
                If the target GPU has fallen off the bus or is otherwise inaccessible.
            NVMLError_Unknown:
                On any unexpected error.
        """
        # Unversioned name; the shared helper appends the supported version suffix.
        api_name = 'nvmlDeviceGetMPSComputeRunningProcesses'
        return __nvml_device_get_running_processes(api_name, handle)

else:
    # Reached through the `else:` directly above. This branch only emits the warning below and
    # defines none of the patched `nvmlDeviceGet*RunningProcesses` functions itself.
    LOGGER.warning(
        'Your installed package `nvidia-ml-py` is corrupted. '
        'Skip patch functions `nvmlDeviceGet{Compute,Graphics,MPSCompute}RunningProcesses`. '
        'You may get incorrect or incomplete results. Please consider reinstall package '
        '`nvidia-ml-py` via `pip3 install --force-reinstall nvidia-ml-py nvitop`.',
    )

# Patch function `nvmlDeviceGetMemoryInfo`
if not _pynvml_installation_corrupted:
    # pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
    class c_nvmlMemory_v1_t(_pynvml._PrintableStructure):  # pylint: disable=protected-access
        """Memory-information structure used with the version 1 ``nvmlDeviceGetMemoryInfo`` API.

        An instance is passed by reference to the unversioned ``nvmlDeviceGetMemoryInfo`` entry
        point and then returned to the caller. All three fields are byte counts.
        """

        # The order and types of the entries below define the structure layout.
        # NOTE(review): presumably this must mirror the C struct in ``nvml.h`` -- confirm
        # before reordering or retyping any field.
        _fields_: _ClassVar[list[tuple[str, type]]] = [
            # Total physical device memory (in bytes).
            ('total', _pynvml.c_ulonglong),
            # Unallocated device memory (in bytes).
            ('free', _pynvml.c_ulonglong),
            # Allocated device memory (in bytes).
            # Note that the driver/GPU always sets aside a small amount of memory for bookkeeping.
            ('used', _pynvml.c_ulonglong),
        ]
        # Default display format for field values (an integer followed by the unit "B").
        # NOTE(review): presumably consumed by ``_PrintableStructure`` when the structure is
        # converted to text -- confirm against pynvml.
        _fmt_: _ClassVar[dict[str, str]] = {'<default>': '%d B'}

    # pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
    class c_nvmlMemory_v2_t(_pynvml._PrintableStructure):  # pylint: disable=protected-access
        """Memory-information structure used with the ``nvmlDeviceGetMemoryInfo_v2`` API.

        Compared with the version 1 structure it carries two extra fields: a leading
        ``version`` tag and the ``reserved`` amount. ``nvmlDeviceGetMemoryInfo`` sets
        ``version`` to ``nvmlMemory_v2`` before passing the instance by reference to NVML.
        """

        # The order and types of the entries below define the structure layout.
        # NOTE(review): presumably this must mirror the C struct in ``nvml.h`` -- confirm
        # before reordering or retyping any field.
        _fields_: _ClassVar[list[tuple[str, type]]] = [
            # Structure format version (must be 2).
            ('version', _pynvml.c_uint),
            # Total physical device memory (in bytes).
            ('total', _pynvml.c_ulonglong),
            # Device memory (in bytes) reserved for system use (driver or firmware).
            ('reserved', _pynvml.c_ulonglong),
            # Unallocated device memory (in bytes).
            ('free', _pynvml.c_ulonglong),
            # Allocated device memory (in bytes).
            # Note that the driver/GPU always sets aside a small amount of memory for bookkeeping.
            ('used', _pynvml.c_ulonglong),
        ]
        # Default display format for field values (an integer followed by the unit "B").
        # NOTE(review): presumably consumed by ``_PrintableStructure`` when the structure is
        # converted to text -- confirm against pynvml.
        _fmt_: _ClassVar[dict[str, str]] = {'<default>': '%d B'}

    # Version tag stored in ``c_nvmlMemory_v2_t.version`` before calling the version 2 API.
    # Use the constant shipped with `pynvml` when it exists; otherwise build it from the
    # structure size OR-ed with the version number (2) shifted left by 24 bits.
    nvmlMemory_v2 = getattr(
        _pynvml,
        'nvmlMemory_v2',
        _ctypes.sizeof(c_nvmlMemory_v2_t) | (2 << 24),
    )
    # Structure type currently in effect. Starts as version 2 and is rebound to the version 1
    # structure by `__determine_get_memory_info_version_suffix` when it falls back.
    c_nvmlMemory_t = c_nvmlMemory_v2_t
    # Cached API version suffix; stays `None` until the first probe has run.
    __get_memory_info_version_suffix = None

    def __determine_get_memory_info_version_suffix() -> str:
        """Detect which version of the ``nvmlDeviceGetMemoryInfo`` API can be used.

        The NVML symbol ``nvmlDeviceGetMemoryInfo_v2`` is looked up once. The outcome is cached
        in the module-level ``__get_memory_info_version_suffix`` and reused by later calls. When
        the symbol is missing, ``c_nvmlMemory_t`` is rebound to the version 1 structure.

        The cache is written only after the probe has finished. If the lookup fails with an
        error other than ``NVMLError_FunctionNotFound``, nothing is cached and the next call
        probes again, instead of leaving an unverified ``'_v2'`` behind.

        Returns:
            ``'_v2'`` if the version 2 symbol was found, otherwise ``''`` (version 1 API).

        Raises:
            Exception:
                Any error other than ``NVMLError_FunctionNotFound`` raised by the symbol lookup
                is propagated unchanged (presumably e.g. when the NVML library has not been
                loaded yet -- confirm against pynvml).
        """
        global __get_memory_info_version_suffix, c_nvmlMemory_t  # pylint: disable=global-statement

        if __get_memory_info_version_suffix is not None:
            return __get_memory_info_version_suffix

        try:
            # pylint: disable-next=protected-access,no-member
            _pynvml._nvmlGetFunctionPointer('nvmlDeviceGetMemoryInfo_v2')
        except NVMLError_FunctionNotFound:
            version_suffix = ''
            c_nvmlMemory_t = c_nvmlMemory_v1_t
            LOGGER.debug(
                'NVML get memory info version 2 API is not available due to incompatible '
                'NVIDIA driver. Fallback to use NVML get memory info version 1 API.',
            )
        else:
            version_suffix = '_v2'
            LOGGER.debug('NVML get memory info version 2 is available.')

        # Publish the result last so that a failed or still-running probe never leaves a
        # value in the cache that has not been verified.
        __get_memory_info_version_suffix = version_suffix
        return version_suffix

    def nvmlDeviceGetMemoryInfo(  # pylint: disable=function-redefined
        handle: c_nvmlDevice_t,
    ) -> c_nvmlMemory_t:
        """Query the total, free, used and (version 2 only) reserved memory of a device, in bytes.

        The API version is chosen by ``__determine_get_memory_info_version_suffix``. A structure
        of the matching version is allocated, filled by the NVML call, and returned.

        Note:
            - The reserved amount is only reported by the version 2 API, which extends the
              version 1 result with additional memory information.
            - With MIG mode enabled, passing a device handle yields aggregate information, and
              only when the caller has appropriate privileges. To query a single instance, pass
              the handle of that specific MIG device instead.

        Raises:
            NVMLError_Uninitialized:
                If NVML was not first initialized with :func:`nvmlInit`.
            NVMLError_NoPermission:
                If the user doesn't have permission to perform this operation.
            NVMLError_InvalidArgument:
                If device is invalid.
            NVMLError_GpuIsLost:
                If the target GPU has fallen off the bus or is otherwise inaccessible.
            NVMLError_Unknown:
                On any unexpected error.
            ValueError:
                If the detected version suffix is not one of the recognized values.
        """
        version_suffix = __determine_get_memory_info_version_suffix()
        if version_suffix not in {'_v2', '_v1', ''}:
            raise ValueError(
                f'Unknown version suffix {version_suffix!r} for '
                'function `nvmlDeviceGetMemoryInfo`.',
            )

        # Pick the output structure and the NVML symbol that belong together.
        if version_suffix == '_v2':
            memory = c_nvmlMemory_v2_t()
            memory.version = nvmlMemory_v2  # pylint: disable=attribute-defined-outside-init
            symbol = 'nvmlDeviceGetMemoryInfo_v2'
        else:
            memory = c_nvmlMemory_v1_t()
            symbol = 'nvmlDeviceGetMemoryInfo'

        # pylint: disable-next=protected-access
        query = _pynvml._nvmlGetFunctionPointer(symbol)
        status = query(handle, _ctypes.byref(memory))
        _pynvml._nvmlCheckReturn(status)  # pylint: disable=protected-access
        return memory

else:
    # Runs when `_pynvml_installation_corrupted` is truthy: the patched
    # `nvmlDeviceGetMemoryInfo` above is not defined, and this branch only reports that.
    LOGGER.warning(
        'Your installed package `nvidia-ml-py` is corrupted. '
        'Skip patch functions `nvmlDeviceGetMemoryInfo`. '
        'You may get incorrect or incomplete results. Please consider reinstall package '
        '`nvidia-ml-py` via `pip3 install --force-reinstall nvidia-ml-py nvitop`.',
    )


# Add support for lookup fallback and context manager ##############################################
class _CustomModule(_ModuleType):
    """Module type that adds attribute fallback to `pynvml` and ``with`` statement support.

    Attributes missing from this module are looked up on `pynvml` instead:

        >>> libnvml.c_nvmlGpuInstance_t  # fallback to pynvml.c_nvmlGpuInstance_t
        <class 'pynvml.LP_struct_c_nvmlGpuInstance_t'>

    The module can be used as a context manager that shuts NVML down on exit:

        >>> with libnvml:
        ...     handle = libnvml.nvmlDeviceGetHandleByIndex(0)
        ... # The NVML context has been shutdown
    """

    def __getattribute__(self, name: str) -> _Any | _Callable[..., _Any]:
        """Look up ``name`` on this module first, then on the original `pynvml` package."""
        try:
            member = super().__getattribute__(name)
        except AttributeError:
            # Not defined here: defer to `pynvml`. Its own `AttributeError` propagates if
            # the name is missing there as well.
            member = getattr(_pynvml, name)
        return member

    def __enter__(self) -> _CustomModule:
        """Run the lazy initialization and hand back the module itself for ``with ... as``."""
        _lazy_init()
        return self

    def __exit__(self, *args: _Any, **kwargs: _Any) -> None:
        """Shut down the NVML context when leaving the ``with`` block.

        An `NVMLError` raised by the shutdown call is ignored.
        """
        try:
            nvmlShutdown()
        except NVMLError:
            # Best effort only: a failing shutdown must not surface from the `with` block.
            return


# Switch the class of this module's own object (as registered in `sys.modules`) to
# `_CustomModule`. The module object itself is kept; only its type changes, which is what
# makes the `__getattribute__`, `__enter__` and `__exit__` hooks defined on `_CustomModule`
# apply to it.
__modself = _sys.modules[__name__]
__modself.__class__ = _CustomModule