feat(api/device): support devices with unified memory (#195)

This commit is contained in:
Xuehai Pan 2025-12-08 11:41:35 +08:00 committed by GitHub
parent 506dba0b49
commit 4d948e2c9d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 97 additions and 25 deletions

View file

@@ -29,7 +29,7 @@ repos:
args: [--ignore-case]
files: ^docs/source/spelling_wordlist\.txt$
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.14.6
rev: v0.14.8
hooks:
- id: ruff-check
args: [--fix, --exit-non-zero-on-fix]
@@ -41,7 +41,7 @@ repos:
- id: codespell
additional_dependencies: [".[toml]"]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.18.2
rev: v1.19.0
hooks:
- id: mypy
exclude: |

View file

@@ -13,7 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
-
- Support devices with unified memory (e.g., NVIDIA Spark) by [@XuehaiPan](https://github.com/XuehaiPan) in [#195](https://github.com/XuehaiPan/nvitop/pull/195). Issued by [@FlorinAndrei](https://github.com/FlorinAndrei).
### Changed

View file

@@ -117,7 +117,7 @@ import time
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple, overload
from nvitop.api import libcuda, libcudart, libnvml
from nvitop.api import host, libcuda, libcudart, libnvml
from nvitop.api.process import GpuProcess
from nvitop.api.utils import (
NA,
@@ -148,32 +148,74 @@ __all__ = [
# Class definitions ################################################################################
class MemoryInfo(NamedTuple): # in bytes # pylint: disable=missing-class-docstring
class MemoryInfo(NamedTuple): # in bytes
"""Device memory information in bytes.
Attributes:
total: Total device memory.
free: Unallocated device memory.
used: Allocated device memory.
reserved: Memory reserved for system use (default: NA).
"""
total: int | NaType
free: int | NaType
used: int | NaType
reserved: int | NaType = NA
class ClockInfos(NamedTuple): # in MHz # pylint: disable=missing-class-docstring
class ClockInfos(NamedTuple): # in MHz
"""Clock speeds information in MHz.
Attributes:
graphics: Graphics clock speed.
sm: SM (streaming multiprocessor) clock speed.
memory: Memory clock speed.
video: Video encoder/decoder clock speed.
"""
graphics: int | NaType
sm: int | NaType
memory: int | NaType
video: int | NaType
class ClockSpeedInfos(NamedTuple): # pylint: disable=missing-class-docstring
class ClockSpeedInfos(NamedTuple):
"""Clock speeds information in MHz.
Attributes:
current: Current clock speeds.
max: Maximum clock speeds.
"""
current: ClockInfos
max: ClockInfos
class UtilizationRates(NamedTuple): # in percentage # pylint: disable=missing-class-docstring
class UtilizationRates(NamedTuple): # in percentage
"""Utilization rates in percentage.
Attributes:
gpu: Percent of time over the past sample period during which one or more kernels was executing on the GPU.
memory: Percent of time over the past sample period during which global (device) memory was being read or written.
encoder: Video encoder utilization rate.
decoder: Video decoder utilization rate.
""" # pylint: disable=line-too-long
gpu: int | NaType
memory: int | NaType
encoder: int | NaType
decoder: int | NaType
class ThroughputInfo(NamedTuple): # in KiB/s # pylint: disable=missing-class-docstring
class ThroughputInfo(NamedTuple): # in KiB/s
"""Throughput information in KiB/s.
Attributes:
tx: Transmit throughput in KiB/s.
rx: Receive throughput in KiB/s.
"""
tx: int | NaType
rx: int | NaType
@@ -925,18 +967,37 @@ class Device:  # pylint: disable=too-many-instance-attributes,too-many-public-me
def memory_info(self) -> MemoryInfo: # in bytes
"""Return a named tuple with memory information (in bytes) for the device.
Returns: MemoryInfo(total, free, used)
Returns: MemoryInfo(total, free, used, reserved)
A named tuple with memory information, the item could be :const:`nvitop.NA` when not applicable.
"""
if self._handle is not None:
memory_info = libnvml.nvmlQuery('nvmlDeviceGetMemoryInfo', self._handle)
if libnvml.nvmlCheckReturn(memory_info):
return MemoryInfo(
total=memory_info.total,
free=memory_info.free,
used=memory_info.used,
has_unified_memory = False
try:
memory_info = libnvml.nvmlQuery(
'nvmlDeviceGetMemoryInfo',
self._handle,
ignore_errors=False,
)
return MemoryInfo(total=NA, free=NA, used=NA)
except libnvml.NVMLError_NotSupported:
has_unified_memory = True
memory_info = NA
except libnvml.NVMLError:
memory_info = NA
if libnvml.nvmlCheckReturn(memory_info):
if memory_info.total > 0:
return MemoryInfo(
total=memory_info.total,
free=memory_info.free,
used=memory_info.used,
reserved=getattr(memory_info, 'reserved', NA),
)
has_unified_memory = True
if has_unified_memory:
# Device with unified memory
# Use system virtual memory as these devices share host memory
vm = host.virtual_memory()
return MemoryInfo(total=vm.total, free=vm.free, used=vm.used, reserved=NA)
return MemoryInfo(total=NA, free=NA, used=NA, reserved=NA)
def memory_total(self) -> int | NaType: # in bytes
"""Total installed GPU memory in bytes.
@@ -1014,8 +1075,8 @@ class Device:  # pylint: disable=too-many-instance-attributes,too-many-public-me
Returns: Union[float, NaType]
The percentage of used memory over total memory, or :const:`nvitop.NA` when not applicable.
"""
total, _, used = self.memory_info()
if libnvml.nvmlCheckReturn(used, int) and libnvml.nvmlCheckReturn(total, int):
total, _, used, _ = self.memory_info()
if libnvml.nvmlCheckReturn(used, int) and libnvml.nvmlCheckReturn(total, int) and total > 0:
return round(100.0 * used / total, 1)
return NA
@@ -1098,8 +1159,8 @@ class Device:  # pylint: disable=too-many-instance-attributes,too-many-public-me
Returns: Union[float, NaType]
The percentage of used BAR1 memory over total BAR1 memory, or :const:`nvitop.NA` when not applicable.
""" # pylint: disable=line-too-long
total, _, used = self.bar1_memory_info()
if libnvml.nvmlCheckReturn(used, int) and libnvml.nvmlCheckReturn(total, int):
total, _, used, _ = self.bar1_memory_info()
if libnvml.nvmlCheckReturn(used, int) and libnvml.nvmlCheckReturn(total, int) and total > 0:
return round(100.0 * used / total, 1)
return NA

View file

@@ -639,7 +639,11 @@ class GpuProcess:  # pylint: disable=too-many-instance-attributes,too-many-publi
self._gpu_memory_human = bytes2human(self.gpu_memory())
memory_total = self.device.memory_total()
gpu_memory_percent = NA
if libnvml.nvmlCheckReturn(memory_used, int) and libnvml.nvmlCheckReturn(memory_total, int):
if (
libnvml.nvmlCheckReturn(memory_used, int)
and libnvml.nvmlCheckReturn(memory_total, int)
and memory_total > 0
):
gpu_memory_percent = round(100.0 * memory_used / memory_total, 1) # type: ignore[assignment]
self._gpu_memory_percent = gpu_memory_percent

View file

@@ -125,8 +125,15 @@ def make_bar_chart(
else:
text = f'{min(round(percent), 100):d}%'.replace('100%', 'MAX') # type: ignore[arg-type]
else:
bar_chart += '░' * (width - len(bar_chart) - 4)
text = 'N/A'
if (
extra_text
and 'N/A' not in extra_text.upper()
and swap_text
and len(bar_chart) + len(extra_text) + 2 <= width
):
text, extra_text = extra_text, ''
bar_chart += '░' * (width - len(bar_chart) - len(text) - 1)
if extra_text:
if len(f'{bar_chart} {text} {extra_blank}{extra_text}') <= width:
if swap_text:

View file

@@ -486,7 +486,7 @@ class DevicePanel(BasePanel):  # pylint: disable=too-many-instance-attributes
prefix,
utilization,
width=width,
extra_text=extra_text,
extra_text=extra_text if 'N/A' not in extra_text else '',
swap_text=not extra_text.endswith('MHz'),
extra_blank=' ',
)
@@ -665,7 +665,7 @@ class DevicePanel(BasePanel):  # pylint: disable=too-many-instance-attributes
prefix,
utilization,
width=width,
extra_text=extra_text,
extra_text=extra_text if 'N/A' not in extra_text else '',
swap_text=not extra_text.endswith('MHz'),
extra_blank=' ',
)