feat(exporter): add dashboard example (#138)

This commit is contained in:
Xuehai Pan 2024-12-29 21:38:28 +08:00
parent 3461ad9650
commit 1f76ccca37
6 changed files with 2676 additions and 11 deletions

View file

@ -14,7 +14,7 @@ insert_final_newline = true
indent_size = 4
src_paths=nvitop
[*.{yaml,yml}]
[*.{yaml,yml,json,xml}]
indent_size = 2
[*.md]

View file

@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Add Grafana dashboard for `nvitop-exporter` by [@XuehaiPan](https://github.com/XuehaiPan) in [#138](https://github.com/XuehaiPan/nvitop/pull/138).
- Handle exceptions for function `getpass.getuser()` by [@XuehaiPan](https://github.com/XuehaiPan) in [#130](https://github.com/XuehaiPan/nvitop/pull/130). Issued by [@landgraf](https://github.com/landgraf).
### Changed

View file

@ -20,6 +20,15 @@ An interactive NVIDIA-GPU process viewer and beyond, the one-stop solution for G
(TERM: GNOME Terminal / OS: Ubuntu 16.04 LTS (over SSH) / Locale: <code>en_US.UTF-8</code>)
</p>
<p align="center">
<a href="./nvitop-exporter">
<img width="100%" src="https://github.com/user-attachments/assets/c1769a8b-2d06-47c4-8f76-c91dace132e9" alt="Grafana Dashboard">
</a>
<br/>
A Grafana dashboard built on top of <code>nvitop-exporter</code>.
</p>
### Table of Contents <!-- omit in toc --> <!-- markdownlint-disable heading-increment -->
- [Features](#features)

View file

@ -2,10 +2,35 @@
Prometheus exporter built on top of `nvitop`.
## Installation
## Quickstart
Install from PyPI:
Start the exporter with the following command:
```bash
pip3 install --upgrade nvitop-exporter
pipx run nvitop-exporter --bind-address 0.0.0.0 --port 5050
# or
uvx nvitop-exporter --bind-address 0.0.0.0 --port 5050
```
Then you can access the metrics at `http://localhost:5050/metrics`.
You will need to configure Prometheus to scrape the metrics from the exporter.
```yaml
scrape_configs:
- job_name: 'nvitop-exporter'
static_configs:
- targets: ['localhost:5050']
```
## Grafana Dashboard
A Grafana dashboard is provided to visualize the metrics collected by the exporter.
The source of the dashboard is [`dashboard.json`](./dashboard.json).
The Grafana dashboard can also be imported by ID [22589](https://grafana.com/grafana/dashboards/22589-nvitop-dashboard).
<p align="center">
<img width="100%" src="https://github.com/user-attachments/assets/c1769a8b-2d06-47c4-8f76-c91dace132e9" alt="Grafana Dashboard">
<br/>
The Grafana dashboard for the exporter.
</p>

File diff suppressed because it is too large Load diff

View file

@ -405,6 +405,12 @@ class PrometheusExporter: # pylint: disable=too-many-instance-attributes
)
# Create gauges for process metrics
self.process_info = Info(
name='process_info',
documentation='Process information.',
labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
registry=self.registry,
)
self.process_running_time = Gauge(
name='process_running_time',
documentation='Process running time (s).',
@ -592,19 +598,40 @@ class PrometheusExporter: # pylint: disable=too-many-instance-attributes
alive_pids.clear()
with GpuProcess.failsafe():
host_snapshots = {}
for pid, process in device.processes().items():
with process.oneshot():
username = process.username()
running_time = process.running_time()
alive_pids.add((pid, username))
if (pid, username) not in host_snapshots: # noqa: SIM401,RUF100
host_snapshot = host_snapshots[(pid, username)] = process.host_snapshot()
else:
host_snapshot = host_snapshots[(pid, username)]
self.process_info.labels(
hostname=self.hostname,
index=index,
devicename=name,
uuid=uuid,
pid=pid,
username=username,
).info(
{
'status': host_snapshot.status,
'command': host_snapshot.command,
},
)
for gauge, value in (
(
self.process_running_time,
running_time.total_seconds() if running_time else math.nan,
(
host_snapshot.running_time.total_seconds()
if host_snapshot.running_time
else math.nan
),
),
(self.process_cpu_percent, process.cpu_percent()),
(self.process_rss_memory, process.host_memory() / MiB),
(self.process_memory_percent, float(process.memory_percent())),
(self.process_cpu_percent, host_snapshot.cpu_percent),
(self.process_rss_memory, host_snapshot.host_memory / MiB),
(self.process_memory_percent, float(host_snapshot.memory_percent)),
(self.process_gpu_memory, process.gpu_memory() / MiB),
(
self.process_gpu_sm_utilization,
@ -633,7 +660,8 @@ class PrometheusExporter: # pylint: disable=too-many-instance-attributes
).set(value)
for pid, username in previous_alive_pids.difference(alive_pids):
for gauge in (
for collector in (
self.process_info,
self.process_running_time,
self.process_cpu_percent,
self.process_rss_memory,
@ -645,7 +673,7 @@ class PrometheusExporter: # pylint: disable=too-many-instance-attributes
self.process_gpu_decoder_utilization,
):
try:
gauge.remove(
collector.remove(
self.hostname,
index,
name,