mirror of
https://github.com/XuehaiPan/nvitop.git
synced 2026-05-15 06:06:12 -06:00
feat(exporter): add dashboard example (#138)
This commit is contained in:
parent
3461ad9650
commit
1f76ccca37
6 changed files with 2676 additions and 11 deletions
|
|
@ -14,7 +14,7 @@ insert_final_newline = true
|
|||
indent_size = 4
|
||||
src_paths=nvitop
|
||||
|
||||
[*.{yaml,yml}]
|
||||
[*.{yaml,yml,json,xml}]
|
||||
indent_size = 2
|
||||
|
||||
[*.md]
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
|
||||
### Added
|
||||
|
||||
- Add Grafana dashboard for `nvitop-exporter` by [@XuehaiPan](https://github.com/XuehaiPan) in [#138](https://github.com/XuehaiPan/nvitop/pull/138).
|
||||
- Handle exceptions for function `getpass.getuser()` by [@XuehaiPan](https://github.com/XuehaiPan) in [#130](https://github.com/XuehaiPan/nvitop/pull/130). Issued by [@landgraf](https://github.com/landgraf).
|
||||
|
||||
### Changed
|
||||
|
|
|
|||
|
|
@ -20,6 +20,15 @@ An interactive NVIDIA-GPU process viewer and beyond, the one-stop solution for G
|
|||
(TERM: GNOME Terminal / OS: Ubuntu 16.04 LTS (over SSH) / Locale: <code>en_US.UTF-8</code>)
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
<a href="./nvitop-exporter">
|
||||
<img width="100%" src="https://github.com/user-attachments/assets/c1769a8b-2d06-47c4-8f76-c91dace132e9" alt="Grafana Dashboard">
|
||||
</a>
|
||||
<br/>
|
||||
A Grafana dashboard built on top of <code>nvitop-exporter</code>.
|
||||
</p>
|
||||
|
||||
|
||||
### Table of Contents <!-- omit in toc --> <!-- markdownlint-disable heading-increment -->
|
||||
|
||||
- [Features](#features)
|
||||
|
|
|
|||
|
|
@ -2,10 +2,35 @@
|
|||
|
||||
Prometheus exporter built on top of `nvitop`.
|
||||
|
||||
## Installation
|
||||
## Quickstart
|
||||
|
||||
Install from PyPI:
|
||||
Start the exporter with the following command:
|
||||
|
||||
```bash
|
||||
pip3 install --upgrade nvitop-exporter
|
||||
pipx run nvitop-exporter --bind-address 0.0.0.0 --port 5050
|
||||
# or
|
||||
uvx nvitop-exporter --bind-address 0.0.0.0 --port 5050
|
||||
```
|
||||
|
||||
Then you can access the metrics at `http://localhost:5050/metrics`.
|
||||
|
||||
You will need to configure Prometheus to scrape the metrics from the exporter.
|
||||
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: 'nvitop-exporter'
|
||||
static_configs:
|
||||
- targets: ['localhost:5050']
|
||||
```
|
||||
|
||||
## Grafana Dashboard
|
||||
|
||||
A Grafana dashboard is provided to visualize the metrics collected by the exporter.
|
||||
The source of the dashboard is [`dashboard.json`](./dashboard.json).
|
||||
The Grafana dashboard can also be imported by ID [22589](https://grafana.com/grafana/dashboards/22589-nvitop-dashboard).
|
||||
|
||||
<p align="center">
|
||||
<img width="100%" src="https://github.com/user-attachments/assets/c1769a8b-2d06-47c4-8f76-c91dace132e9" alt="Grafana Dashboard">
|
||||
<br/>
|
||||
The Grafana dashboard for the exporter.
|
||||
</p>
|
||||
|
|
|
|||
2602
nvitop-exporter/dashboard.json
Normal file
2602
nvitop-exporter/dashboard.json
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -405,6 +405,12 @@ class PrometheusExporter: # pylint: disable=too-many-instance-attributes
|
|||
)
|
||||
|
||||
# Create gauges for process metrics
|
||||
self.process_info = Info(
|
||||
name='process_info',
|
||||
documentation='Process information.',
|
||||
labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
|
||||
registry=self.registry,
|
||||
)
|
||||
self.process_running_time = Gauge(
|
||||
name='process_running_time',
|
||||
documentation='Process running time (s).',
|
||||
|
|
@ -592,19 +598,40 @@ class PrometheusExporter: # pylint: disable=too-many-instance-attributes
|
|||
alive_pids.clear()
|
||||
|
||||
with GpuProcess.failsafe():
|
||||
host_snapshots = {}
|
||||
for pid, process in device.processes().items():
|
||||
with process.oneshot():
|
||||
username = process.username()
|
||||
running_time = process.running_time()
|
||||
alive_pids.add((pid, username))
|
||||
if (pid, username) not in host_snapshots: # noqa: SIM401,RUF100
|
||||
host_snapshot = host_snapshots[(pid, username)] = process.host_snapshot()
|
||||
else:
|
||||
host_snapshot = host_snapshots[(pid, username)]
|
||||
self.process_info.labels(
|
||||
hostname=self.hostname,
|
||||
index=index,
|
||||
devicename=name,
|
||||
uuid=uuid,
|
||||
pid=pid,
|
||||
username=username,
|
||||
).info(
|
||||
{
|
||||
'status': host_snapshot.status,
|
||||
'command': host_snapshot.command,
|
||||
},
|
||||
)
|
||||
for gauge, value in (
|
||||
(
|
||||
self.process_running_time,
|
||||
running_time.total_seconds() if running_time else math.nan,
|
||||
(
|
||||
host_snapshot.running_time.total_seconds()
|
||||
if host_snapshot.running_time
|
||||
else math.nan
|
||||
),
|
||||
),
|
||||
(self.process_cpu_percent, process.cpu_percent()),
|
||||
(self.process_rss_memory, process.host_memory() / MiB),
|
||||
(self.process_memory_percent, float(process.memory_percent())),
|
||||
(self.process_cpu_percent, host_snapshot.cpu_percent),
|
||||
(self.process_rss_memory, host_snapshot.host_memory / MiB),
|
||||
(self.process_memory_percent, float(host_snapshot.memory_percent)),
|
||||
(self.process_gpu_memory, process.gpu_memory() / MiB),
|
||||
(
|
||||
self.process_gpu_sm_utilization,
|
||||
|
|
@ -633,7 +660,8 @@ class PrometheusExporter: # pylint: disable=too-many-instance-attributes
|
|||
).set(value)
|
||||
|
||||
for pid, username in previous_alive_pids.difference(alive_pids):
|
||||
for gauge in (
|
||||
for collector in (
|
||||
self.process_info,
|
||||
self.process_running_time,
|
||||
self.process_cpu_percent,
|
||||
self.process_rss_memory,
|
||||
|
|
@ -645,7 +673,7 @@ class PrometheusExporter: # pylint: disable=too-many-instance-attributes
|
|||
self.process_gpu_decoder_utilization,
|
||||
):
|
||||
try:
|
||||
gauge.remove(
|
||||
collector.remove(
|
||||
self.hostname,
|
||||
index,
|
||||
name,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue