feat(api/device): support devices with unified memory (#195)

This commit is contained in:
Xuehai Pan 2025-12-08 11:41:35 +08:00 committed by GitHub
parent 506dba0b49
commit 4d948e2c9d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 97 additions and 25 deletions

View file

@@ -29,7 +29,7 @@ repos:
args: [--ignore-case]
files: ^docs/source/spelling_wordlist\.txt$
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.14.6
rev: v0.14.8
hooks:
- id: ruff-check
args: [--fix, --exit-non-zero-on-fix]
@@ -41,7 +41,7 @@ repos:
- id: codespell
additional_dependencies: [".[toml]"]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.18.2
rev: v1.19.0
hooks:
- id: mypy
exclude: |

View file

@@ -13,7 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
-
- Support devices with unified memory (e.g., NVIDIA Spark) by [@XuehaiPan](https://github.com/XuehaiPan) in [#195](https://github.com/XuehaiPan/nvitop/pull/195). Issued by [@FlorinAndrei](https://github.com/FlorinAndrei).
### Changed

View file

@@ -117,7 +117,7 @@ import time
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple, overload
from nvitop.api import libcuda, libcudart, libnvml
from nvitop.api import host, libcuda, libcudart, libnvml
from nvitop.api.process import GpuProcess
from nvitop.api.utils import (
NA,
@@ -148,32 +148,74 @@ __all__ = [
# Class definitions ################################################################################
class MemoryInfo(NamedTuple): # in bytes # pylint: disable=missing-class-docstring
class MemoryInfo(NamedTuple): # in bytes
"""Device memory information in bytes.
Attributes:
total: Total device memory.
free: Unallocated device memory.
used: Allocated device memory.
reserved: Memory reserved for system use (default: NA).
"""
total: int | NaType
free: int | NaType
used: int | NaType
reserved: int | NaType = NA
class ClockInfos(NamedTuple): # in MHz # pylint: disable=missing-class-docstring
class ClockInfos(NamedTuple): # in MHz
"""Clock speeds information in MHz.
Attributes:
graphics: Graphics clock speed.
sm: SM (streaming multiprocessor) clock speed.
memory: Memory clock speed.
video: Video encoder/decoder clock speed.
"""
graphics: int | NaType
sm: int | NaType
memory: int | NaType
video: int | NaType
class ClockSpeedInfos(NamedTuple): # pylint: disable=missing-class-docstring
class ClockSpeedInfos(NamedTuple):
"""Clock speeds information in MHz.
Attributes:
current: Current clock speeds.
max: Maximum clock speeds.
"""
current: ClockInfos
max: ClockInfos
class UtilizationRates(NamedTuple): # in percentage # pylint: disable=missing-class-docstring
class UtilizationRates(NamedTuple): # in percentage
"""Utilization rates in percentage.
Attributes:
gpu: Percent of time over the past sample period during which one or more kernels was executing on the GPU.
memory: Percent of time over the past sample period during which global (device) memory was being read or written.
encoder: Video encoder utilization rate.
decoder: Video decoder utilization rate.
""" # pylint: disable=line-too-long
gpu: int | NaType
memory: int | NaType
encoder: int | NaType
decoder: int | NaType
class ThroughputInfo(NamedTuple): # in KiB/s # pylint: disable=missing-class-docstring
class ThroughputInfo(NamedTuple): # in KiB/s
"""Throughput information in KiB/s.
Attributes:
tx: Transmit throughput in KiB/s.
rx: Receive throughput in KiB/s.
"""
tx: int | NaType
rx: int | NaType
@@ -925,18 +967,37 @@ class Device:  # pylint: disable=too-many-instance-attributes,too-many-public-me
def memory_info(self) -> MemoryInfo: # in bytes
"""Return a named tuple with memory information (in bytes) for the device.
Returns: MemoryInfo(total, free, used)
Returns: MemoryInfo(total, free, used, reserved)
A named tuple with memory information, the item could be :const:`nvitop.NA` when not applicable.
"""
if self._handle is not None:
memory_info = libnvml.nvmlQuery('nvmlDeviceGetMemoryInfo', self._handle)
if libnvml.nvmlCheckReturn(memory_info):
return MemoryInfo(
total=memory_info.total,
free=memory_info.free,
used=memory_info.used,
has_unified_memory = False
try:
memory_info = libnvml.nvmlQuery(
'nvmlDeviceGetMemoryInfo',
self._handle,
ignore_errors=False,
)
return MemoryInfo(total=NA, free=NA, used=NA)
except libnvml.NVMLError_NotSupported:
has_unified_memory = True
memory_info = NA
except libnvml.NVMLError:
memory_info = NA
if libnvml.nvmlCheckReturn(memory_info):
if memory_info.total > 0:
return MemoryInfo(
total=memory_info.total,
free=memory_info.free,
used=memory_info.used,
reserved=getattr(memory_info, 'reserved', NA),
)
has_unified_memory = True
if has_unified_memory:
# Device with unified memory
# Use system virtual memory as these devices share host memory
vm = host.virtual_memory()
return MemoryInfo(total=vm.total, free=vm.free, used=vm.used, reserved=NA)
return MemoryInfo(total=NA, free=NA, used=NA, reserved=NA)
def memory_total(self) -> int | NaType: # in bytes
"""Total installed GPU memory in bytes.
@@ -1014,8 +1075,8 @@ class Device:  # pylint: disable=too-many-instance-attributes,too-many-public-me
Returns: Union[float, NaType]
The percentage of used memory over total memory, or :const:`nvitop.NA` when not applicable.
"""
total, _, used = self.memory_info()
if libnvml.nvmlCheckReturn(used, int) and libnvml.nvmlCheckReturn(total, int):
total, _, used, _ = self.memory_info()
if libnvml.nvmlCheckReturn(used, int) and libnvml.nvmlCheckReturn(total, int) and total > 0:
return round(100.0 * used / total, 1)
return NA
@@ -1098,8 +1159,8 @@ class Device:  # pylint: disable=too-many-instance-attributes,too-many-public-me
Returns: Union[float, NaType]
The percentage of used BAR1 memory over total BAR1 memory, or :const:`nvitop.NA` when not applicable.
""" # pylint: disable=line-too-long
total, _, used = self.bar1_memory_info()
if libnvml.nvmlCheckReturn(used, int) and libnvml.nvmlCheckReturn(total, int):
total, _, used, _ = self.bar1_memory_info()
if libnvml.nvmlCheckReturn(used, int) and libnvml.nvmlCheckReturn(total, int) and total > 0:
return round(100.0 * used / total, 1)
return NA

View file

@@ -639,7 +639,11 @@ class GpuProcess:  # pylint: disable=too-many-instance-attributes,too-many-publi
self._gpu_memory_human = bytes2human(self.gpu_memory())
memory_total = self.device.memory_total()
gpu_memory_percent = NA
if libnvml.nvmlCheckReturn(memory_used, int) and libnvml.nvmlCheckReturn(memory_total, int):
if (
libnvml.nvmlCheckReturn(memory_used, int)
and libnvml.nvmlCheckReturn(memory_total, int)
and memory_total > 0
):
gpu_memory_percent = round(100.0 * memory_used / memory_total, 1) # type: ignore[assignment]
self._gpu_memory_percent = gpu_memory_percent

View file

@@ -125,8 +125,15 @@ def make_bar_chart(
else:
text = f'{min(round(percent), 100):d}%'.replace('100%', 'MAX') # type: ignore[arg-type]
else:
bar_chart += '░' * (width - len(bar_chart) - 4)
text = 'N/A'
if (
extra_text
and 'N/A' not in extra_text.upper()
and swap_text
and len(bar_chart) + len(extra_text) + 2 <= width
):
text, extra_text = extra_text, ''
bar_chart += '░' * (width - len(bar_chart) - len(text) - 1)
if extra_text:
if len(f'{bar_chart} {text} {extra_blank}{extra_text}') <= width:
if swap_text:

View file

@@ -486,7 +486,7 @@ class DevicePanel(BasePanel):  # pylint: disable=too-many-instance-attributes
prefix,
utilization,
width=width,
extra_text=extra_text,
extra_text=extra_text if 'N/A' not in extra_text else '',
swap_text=not extra_text.endswith('MHz'),
extra_blank=' ',
)
@@ -665,7 +665,7 @@ class DevicePanel(BasePanel):  # pylint: disable=too-many-instance-attributes
prefix,
utilization,
width=width,
extra_text=extra_text,
extra_text=extra_text if 'N/A' not in extra_text else '',
swap_text=not extra_text.endswith('MHz'),
extra_blank=' ',
)