feat(exporter): support TLS and mutual TLS for the metrics endpoint (#213)

This commit is contained in:
Xuehai Pan 2026-05-06 17:26:32 +08:00 committed by GitHub
parent 1bed33ed1e
commit 4e814c52a6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 147 additions and 7 deletions

View file

@ -15,10 +15,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add `nvidia-ml-py` 13.595.45 to support list.
- Add support for open kernel-module driver packages (e.g., `nvidia-driver-595-open`) in `install-nvidia-driver.sh` with new `--proprietary` and `--open` flags by [@XuehaiPan](https://github.com/XuehaiPan).
- Add TLS and mutual TLS (mTLS) support for `nvitop-exporter` via new `--certfile`, `--keyfile`, `--client-cafile`, `--client-capath`, and `--client-auth-required` CLI flags by [@XuehaiPan](https://github.com/XuehaiPan) in [#213](https://github.com/XuehaiPan/nvitop/pull/213). Issued by [@StefanSander3](https://github.com/StefanSander3) in [#131](https://github.com/XuehaiPan/nvitop/issues/131).
### Changed
-
- Bump minimum `prometheus-client` version to `0.19.0` for `nvitop-exporter` (required for TLS support) by [@XuehaiPan](https://github.com/XuehaiPan) in [#213](https://github.com/XuehaiPan/nvitop/pull/213).
### Fixed

View file

@ -23,6 +23,64 @@ scrape_configs:
- targets: ['localhost:5050']
```
## TLS / mTLS
The exporter can serve metrics over HTTPS, optionally requiring client certificate authentication (mTLS). TLS support is provided by `prometheus_client` (>= 0.19.0) and configured entirely through CLI flags — no config file is involved.
### Plain HTTPS
Provide a server certificate and private key:
```bash
nvitop-exporter --bind-address 0.0.0.0 --port 5050 \
--certfile /path/to/server.crt \
--keyfile /path/to/server.key
```
The metrics endpoint is then served at [`https://localhost:5050/metrics`](https://localhost:5050/metrics). Update the Prometheus scrape config to use the `https` scheme, and point it at the CA that signed your server certificate:
```yaml
scrape_configs:
- job_name: 'nvitop-exporter'
scheme: https
static_configs:
- targets: ['localhost:5050']
tls_config:
ca_file: /path/to/server-ca.crt
```
### Mutual TLS (mTLS)
To require scrapers to present a valid client certificate, pass `--client-auth-required` together with either a CA bundle (`--client-cafile`) or a CA directory (`--client-capath`):
```bash
nvitop-exporter --bind-address 0.0.0.0 --port 5050 \
--certfile /path/to/server.crt \
--keyfile /path/to/server.key \
--client-cafile /path/to/clients-ca.crt \
--client-auth-required
```
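`--client-cafile` / `--client-capath` and `--client-auth-required` must be specified together, and both require `--certfile` and `--keyfile`. Passing a CA without `--client-auth-required` is rejected by the CLI to avoid the silent "trust but don't verify" configuration that the underlying `prometheus_client` API would otherwise allow; passing `--client-auth-required` without a CA is rejected as well, since there would be nothing to verify client certificates against.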
Configure Prometheus to present its client certificate when scraping:
```yaml
scrape_configs:
- job_name: 'nvitop-exporter'
scheme: https
static_configs:
- targets: ['localhost:5050']
tls_config:
ca_file: /path/to/server-ca.crt
cert_file: /path/to/prometheus-client.crt
key_file: /path/to/prometheus-client.key
```
### Authentication beyond mTLS
The exporter does not implement HTTP basic auth, OAuth, or IP allowlisting. Following the standard Prometheus exporter pattern, run the exporter behind a reverse proxy (`NGINX`, `Traefik`, `Caddy`, ...) if any of those are required.
## Grafana Dashboard
A Grafana dashboard is provided to visualize the metrics collected by the exporter.

View file

@ -19,6 +19,7 @@
from __future__ import annotations
import argparse
import os
import sys
from typing import TextIO
@ -118,6 +119,58 @@ def parse_arguments() -> argparse.Namespace:
help='Interval between updates in seconds. (default: %(default)s)',
)
tls_group = parser.add_argument_group('TLS / mTLS options')
tls_group.add_argument(
'--certfile',
dest='certfile',
type=str,
default=None,
metavar='PATH',
help=(
'Path to the TLS certificate file (PEM).\n'
'Enables HTTPS when set together with `--keyfile`.'
),
)
tls_group.add_argument(
'--keyfile',
dest='keyfile',
type=str,
default=None,
metavar='PATH',
help='Path to the TLS private key file (PEM).\nRequired if `--certfile` is set.',
)
tls_group.add_argument(
'--client-cafile',
dest='client_cafile',
type=str,
default=None,
metavar='PATH',
help=(
'Path to a PEM bundle of trusted client CA certificates for mutual TLS.\n'
'Requires `--client-auth-required` to actually verify client certificates.'
),
)
tls_group.add_argument(
'--client-capath',
dest='client_capath',
type=str,
default=None,
metavar='PATH',
help=(
'Path to a directory of trusted client CA certificates for mutual TLS.\n'
'Requires `--client-auth-required` to actually verify client certificates.'
),
)
tls_group.add_argument(
'--client-auth-required',
dest='client_auth_required',
action='store_true',
help=(
'Require clients to present a valid certificate (mutual TLS).\n'
'Requires `--client-cafile` or `--client-capath`.'
),
)
args = parser.parse_args()
if args.interval < 0.25:
parser.error(
@ -125,12 +178,32 @@ def parse_arguments() -> argparse.Namespace:
f'Expected 1/4 or higher.',
)
if (args.certfile is None) != (args.keyfile is None):
parser.error('`--certfile` and `--keyfile` must be specified together.')
if args.certfile is not None and not os.path.isfile(args.certfile):
parser.error(f'`--certfile` not found: {args.certfile}')
if args.keyfile is not None and not os.path.isfile(args.keyfile):
parser.error(f'`--keyfile` not found: {args.keyfile}')
if args.client_cafile is not None and not os.path.isfile(args.client_cafile):
parser.error(f'`--client-cafile` not found: {args.client_cafile}')
if args.client_capath is not None and not os.path.isdir(args.client_capath):
parser.error(f'`--client-capath` not a directory: {args.client_capath}')
ca_provided = args.client_cafile is not None or args.client_capath is not None
if (ca_provided or args.client_auth_required) and args.certfile is None:
parser.error('Mutual TLS options require `--certfile` and `--keyfile`.')
if ca_provided != args.client_auth_required:
parser.error(
'`--client-cafile` / `--client-capath` and `--client-auth-required` must be '
'specified together to enable mutual TLS.',
)
return args
def main() -> int: # pylint: disable=too-many-locals,too-many-statements
"""Main function for ``nvitop-exporter`` CLI."""
args = parse_arguments()
scheme = 'https' if args.certfile is not None else 'http'
try:
device_count = Device.count()
@ -181,7 +254,15 @@ def main() -> int: # pylint: disable=too-many-locals,too-many-statements
exporter = PrometheusExporter(devices, hostname=args.hostname, interval=args.interval)
try:
start_wsgi_server(port=args.port, addr=args.bind_address)
start_wsgi_server(
port=args.port,
addr=args.bind_address,
certfile=args.certfile,
keyfile=args.keyfile,
client_cafile=args.client_cafile,
client_capath=args.client_capath,
client_auth_required=args.client_auth_required,
)
except OSError as ex:
if 'address already in use' in str(ex).lower():
cprint(
@ -190,7 +271,7 @@ def main() -> int: # pylint: disable=too-many-locals,too-many-statements
'Please specify a different port via `--port <PORT>`.'
).format(
colored(
f'http://{args.bind_address}:{args.port}',
f'{scheme}://{args.bind_address}:{args.port}',
color='blue',
attrs=('bold', 'underline'),
),
@ -204,7 +285,7 @@ def main() -> int: # pylint: disable=too-many-locals,too-many-statements
'Please specify a different address via `--bind-address <ADDRESS>`.'
).format(
colored(
f'http://{args.bind_address}:{args.port}',
f'{scheme}://{args.bind_address}:{args.port}',
color='blue',
attrs=('bold', 'underline'),
),
@ -219,7 +300,7 @@ def main() -> int: # pylint: disable=too-many-locals,too-many-statements
'INFO: Start the exporter on {} at {}.'.format(
colored(args.hostname, color='magenta', attrs=('bold',)),
colored(
f'http://{args.bind_address}:{args.port}/metrics',
f'{scheme}://{args.bind_address}:{args.port}/metrics',
color='green',
attrs=('bold', 'underline'),
),

View file

@ -49,7 +49,7 @@ classifiers = [
dependencies = [
# Sync with nvitop/version.py and requirements.txt
"nvitop ~= 1.6.2",
"prometheus-client >= 0.4.0",
"prometheus-client >= 0.19.0",
]
dynamic = ["version"]

View file

@ -1,2 +1,2 @@
nvitop
prometheus-client >= 0.4.0
prometheus-client >= 0.19.0