feat(exporter): add Prometheus exporter (#92)

This commit is contained in:
Xuehai Pan 2023-08-27 01:37:04 +08:00 committed by GitHub
parent 9ff3ec3400
commit daf72c7bf3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
24 changed files with 1475 additions and 37 deletions

View file

@ -72,15 +72,22 @@ jobs:
python -m venv venv &&
(
source venv/bin/activate &&
python -m pip install --upgrade pip setuptools pre-commit pylint[spelling] mypy typing-extensions
python -m pip install --upgrade pip setuptools pre-commit pylint[spelling] mypy typing-extensions &&
python -m pip install -r requirements.txt &&
python -m pip install -r nvitop-exporter/requirements.txt &&
python -m pre_commit install --install-hooks &&
python -m pre_commit run --all-files &&
python -c 'import nvitop' &&
python -m nvitop --version &&
python -m nvitop --help &&
python -m nvitop.select --version &&
python -m nvitop.select --help
python -m nvitop.select --help &&
(
cd nvitop-exporter &&
python -c 'import nvitop_exporter' &&
python -m nvitop_exporter --version &&
python -m nvitop_exporter --help
)
)
- name: Test docker build
@ -92,12 +99,17 @@ jobs:
if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch'
run: |
sed -i -E 's/^__release__\s*=.*$/__release__ = True/' nvitop/version.py
sed -i -E 's/^__release__\s*=.*$/__release__ = True/' nvitop-exporter/nvitop_exporter/version.py
- name: Print version
run: python setup.py --version
run: |
python setup.py --version
python nvitop-exporter/setup.py --version
- name: Build sdist and wheels
run: python -m build
run: |
python -m build --outdir dist .
python -m build --outdir dist nvitop-exporter
- name: List built sdist and wheels
run: ls -lh dist/
@ -135,15 +147,23 @@ jobs:
if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch'
run: |
sed -i -E 's/^__release__\s*=.*$/__release__ = True/' nvitop/version.py
sed -i -E 's/^__release__\s*=.*$/__release__ = True/' nvitop-exporter/nvitop_exporter/version.py
- name: Print version
run: python setup.py --version
run: |
python setup.py --version
python nvitop-exporter/setup.py --version
- name: Check consistency between the package version and release tag
if: startsWith(github.ref, 'refs/tags/')
run: |
PACKAGE_VER="v$(python setup.py --version)"
RELEASE_TAG="${GITHUB_REF#refs/*/}"
PACKAGE_VER="v$(python setup.py --version)"
if [[ "${PACKAGE_VER}" != "${RELEASE_TAG}" ]]; then
echo "package ver. (${PACKAGE_VER}) != release tag. (${RELEASE_TAG})"
exit 1
fi
PACKAGE_VER="v$(python nvitop-exporter/setup.py --version)"
if [[ "${PACKAGE_VER}" != "${RELEASE_TAG}" ]]; then
echo "package ver. (${PACKAGE_VER}) != release tag. (${RELEASE_TAG})"
exit 1
@ -163,10 +183,10 @@ jobs:
with:
user: __token__
password: ${{ secrets.TESTPYPI_UPLOAD_TOKEN }}
repository_url: https://test.pypi.org/legacy/
repository-url: https://test.pypi.org/legacy/
verbose: true
print_hash: true
skip_existing: true
print-hash: true
skip-existing: true
- name: Publish to PyPI
if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch'
@ -175,5 +195,5 @@ jobs:
user: __token__
password: ${{ secrets.PYPI_UPLOAD_TOKEN }}
verbose: true
print_hash: true
skip_existing: true
print-hash: true
skip-existing: true

View file

@ -40,6 +40,10 @@ jobs:
- name: Check syntax (Python 3.7)
run: |
"${{ steps.py37.outputs.python-path }}" -m compileall nvitop
(
cd nvitop-exporter &&
"${{ steps.py37.outputs.python-path }}" -m compileall nvitop_exporter
)
- name: Upgrade pip
run: |
@ -67,6 +71,29 @@ jobs:
"${{ steps.py37.outputs.python-path }}" -m nvitop.select --version
"${{ steps.py37.outputs.python-path }}" -m nvitop.select --help
- name: Install dependencies for nvitop-exporter
run: |
python -m pip install -r nvitop-exporter/requirements.txt
- name: Import tests for nvitop-exporter
run: |
(
cd nvitop-exporter &&
python -c 'import nvitop_exporter' &&
python -m nvitop_exporter --version &&
python -m nvitop_exporter --help
)
- name: Import tests for nvitop-exporter (Python 3.7)
run: |
(
cd nvitop-exporter &&
"${{ steps.py37.outputs.python-path }}" -m pip install -r requirements.txt &&
"${{ steps.py37.outputs.python-path }}" -c 'import nvitop_exporter' &&
"${{ steps.py37.outputs.python-path }}" -m nvitop_exporter --version &&
"${{ steps.py37.outputs.python-path }}" -m nvitop_exporter --help
)
- name: Install linters
run: |
python -m pip install --upgrade pre-commit pylint[spelling] mypy typing-extensions

View file

@ -88,3 +88,7 @@ repos:
language: system
types_or: [python, pyi]
require_serial: true
exclude: |
(?x)(
^nvitop-exporter/setup.py$
)

View file

@ -421,7 +421,8 @@ confidence=HIGH,
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=consider-using-f-string,
duplicate-code
duplicate-code,
wrong-import-order
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option

View file

@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Add Prometheus exporter by [@XuehaiPan](https://github.com/XuehaiPan) in [#92](https://github.com/XuehaiPan/nvitop/pull/92).
- Add device APIs to query PCIe and NVLink throughput by [@XuehaiPan](https://github.com/XuehaiPan) in [#87](https://github.com/XuehaiPan/nvitop/pull/87).
### Changed

View file

@ -151,3 +151,5 @@ tx
rx
ThroughputInfo
pytorch
api
utils

202
nvitop-exporter/LICENSE Normal file
View file

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2021-2023 Xuehai Pan. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View file

@ -0,0 +1 @@
include LICENSE

11
nvitop-exporter/README.md Normal file
View file

@ -0,0 +1,11 @@
# nvitop-exporter
Prometheus exporter built on top of `nvitop`.
## Installation
Install from PyPI:
```bash
pip3 install --upgrade nvitop-exporter
```

View file

@ -0,0 +1,24 @@
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
#
# Copyright 2021-2023 Xuehai Pan. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Prometheus exporter built on top of ``nvitop``."""
from nvitop_exporter.exporter import PrometheusExporter
from nvitop_exporter.utils import get_ip_address
from nvitop_exporter.version import __version__
__all__ = ['PrometheusExporter', 'get_ip_address']

View file

@ -0,0 +1,25 @@
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
#
# Copyright 2021-2023 Xuehai Pan. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Prometheus exporter built on top of ``nvitop``."""
import sys
from nvitop_exporter.cli import main
# When this module is executed as the main program, run the CLI and use the
# integer returned by main() as the process exit status.
if __name__ == '__main__':
sys.exit(main())

View file

@ -0,0 +1,240 @@
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
#
# Copyright 2021-2023 Xuehai Pan. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Prometheus exporter built on top of ``nvitop``."""
from __future__ import annotations
import argparse
import sys
from typing import TextIO
from prometheus_client import start_wsgi_server
import nvitop
from nvitop import Device, colored, libnvml
from nvitop_exporter.exporter import PrometheusExporter
from nvitop_exporter.utils import get_ip_address
from nvitop_exporter.version import __version__
def cprint(text: str = '', *, file: TextIO | None = None) -> None:
    """Print ``text`` to ``file`` with its leading severity tag highlighted.

    If ``text`` begins with one of the known tags (``INFO: ``, ``WARNING: ``,
    ``ERROR: `` or ``NVML ERROR: ``), the first occurrence of that tag (without
    its trailing space) is wrapped with ``colored`` in bold before printing.
    Text without a known tag is printed unchanged.

    Args:
        text: The message to print. Defaults to an empty line.
        file: Destination stream forwarded to ``print``.
    """
    severity_colors = (
        ('INFO: ', 'yellow'),
        ('WARNING: ', 'yellow'),
        ('ERROR: ', 'red'),
        ('NVML ERROR: ', 'red'),
    )
    for tag, tag_color in severity_colors:
        if not text.startswith(tag):
            continue
        # Only the tag itself is highlighted, not the space that follows it.
        label = tag.rstrip()
        highlighted = colored(label, color=tag_color, attrs=('bold',))
        text = text.replace(label, highlighted, 1)
    print(text, file=file)
def parse_arguments() -> argparse.Namespace:
    """Parse command-line arguments for ``nvitop-exporter``.

    Returns:
        The parsed namespace with the attributes ``hostname``, ``bind_address``,
        ``port`` and ``interval``.

    Exits through ``parser.error`` when the interval is shorter than 0.25
    seconds; ``--help`` and ``--version`` print their output and exit as well.
    """

    def posfloat(argstring: str) -> float:
        """Convert ``argstring`` to a finite, strictly positive float.

        Raises:
            ValueError: If the text is not a number, or the number is zero,
                negative, NaN or infinite. argparse turns this into an
                "invalid positive float value" usage error.
        """
        num = float(argstring)
        # Chained comparison on purpose: NaN compares false against every bound,
        # so a plain ``num <= 0`` test would let ``nan`` (and ``inf``) through
        # this check and the minimum-interval check below.
        if not 0.0 < num < float('inf'):
            raise ValueError
        return num

    # argparse uses the converter's ``__name__`` in its error message.
    posfloat.__name__ = 'positive float'

    parser = argparse.ArgumentParser(
        prog='nvitop-exporter',
        description='Prometheus exporter built on top of `nvitop`.',
        formatter_class=argparse.RawTextHelpFormatter,
        add_help=False,
    )
    # The help option is registered manually (``add_help=False`` above) so that
    # its flags order and help text can be customized.
    parser.add_argument(
        '--help',
        '-h',
        dest='help',
        action='help',
        default=argparse.SUPPRESS,
        help='Show this help message and exit.',
    )
    parser.add_argument(
        '--version',
        '-V',
        dest='version',
        action='version',
        version=f'%(prog)s {__version__} (nvitop {nvitop.__version__})',
        help="Show %(prog)s's version number and exit.",
    )

    parser.add_argument(
        '--hostname',
        '--host',
        '-H',
        dest='hostname',
        type=str,
        # Evaluated once, while the parser is being built.
        default=get_ip_address(),
        metavar='HOSTNAME',
        help='Hostname to display in the exporter. (default: %(default)s)',
    )
    parser.add_argument(
        '--bind-address',
        '--bind',
        '-B',
        dest='bind_address',
        type=str,
        default='127.0.0.1',
        metavar='ADDRESS',
        help='Local address to bind to. (default: %(default)s)',
    )
    parser.add_argument(
        '--port',
        '-p',
        type=int,
        default=8000,
        help='Port to listen on. (default: %(default)d)',
    )
    parser.add_argument(
        '--interval',
        dest='interval',
        type=posfloat,
        default=1.0,
        metavar='SEC',
        help='Interval between updates in seconds. (default: %(default)s)',
    )

    args = parser.parse_args()
    # Positive but very short intervals are rejected separately so the user
    # gets a dedicated explanation rather than a generic type error.
    if args.interval < 0.25:
        parser.error(
            f'the interval {args.interval:0.2g}s is too short, which may cause performance issues. '
            f'Expected 1/4 or higher.',
        )

    return args
def main() -> int:  # pylint: disable=too-many-locals,too-many-statements
    """Run the ``nvitop-exporter`` CLI: discover devices, serve metrics, collect.

    Steps, in order: parse the command line, count the devices, list the
    physical and MIG devices on stderr, build a ``PrometheusExporter``, start
    the Prometheus WSGI server on the requested address and port, and finally
    call ``exporter.collect()`` until it returns or the user interrupts it.

    Returns:
        ``1`` if the device count cannot be obtained, no device is found, or
        the HTTP server cannot be started; ``0`` otherwise, including when the
        collection is interrupted from the keyboard.
    """
    import errno  # pylint: disable=import-outside-toplevel

    args = parse_arguments()

    try:
        device_count = Device.count()
    except libnvml.NVMLError_LibraryNotFound:
        # NOTE(review): exits without printing anything here; presumably the
        # missing-library case is already reported by nvitop itself - confirm.
        return 1
    except libnvml.NVMLError as ex:
        cprint(f'NVML ERROR: {ex}', file=sys.stderr)
        return 1

    if device_count == 0:
        cprint('NVML ERROR: No NVIDIA devices found.', file=sys.stderr)
        return 1

    physical_devices = Device.from_indices(range(device_count))
    mig_devices = []
    for device in physical_devices:
        mig_devices.extend(device.mig_devices())

    cprint(
        'INFO: Found {}{}.'.format(
            colored(str(device_count), color='green', attrs=('bold',)),
            (
                ' physical device(s) and {} MIG device(s)'.format(
                    colored(str(len(mig_devices)), color='blue', attrs=('bold',)),
                )
                if mig_devices
                else ' device(s)'
            ),
        ),
        file=sys.stderr,
    )

    # Integer indices are wrapped into 1-tuples so they can be ordered together
    # with non-integer indices (presumably the tuple indices of MIG devices).
    devices = sorted(
        physical_devices + mig_devices,  # type: ignore[operator]
        key=lambda d: (d.index,) if isinstance(d.index, int) else d.index,
    )
    for device in devices:
        name = device.name()
        uuid = device.uuid()
        if device.is_mig_device():
            # Keep only the last space-separated word of the MIG device name.
            name = name.rpartition(' ')[-1]
            cprint(
                f'INFO: MIG {name:<11} Device {device.mig_index:>2d}: (UUID: {uuid})',
                file=sys.stderr,
            )
        else:
            cprint(f'INFO: GPU {device.index}: {name} (UUID: {uuid})', file=sys.stderr)

    exporter = PrometheusExporter(devices, hostname=args.hostname, interval=args.interval)

    try:
        start_wsgi_server(port=args.port, addr=args.bind_address)
    except OSError as ex:
        message = str(ex).lower()
        endpoint = colored(
            f'http://{args.bind_address}:{args.port}',
            color='blue',
            attrs=('bold', 'underline'),
        )
        # Classify by errno first: the human-readable text of an OSError depends
        # on the platform and locale, so matching the English message alone can
        # miss these cases. The text match is kept as a fallback.
        if ex.errno == errno.EADDRINUSE or 'address already in use' in message:
            cprint(
                (
                    'ERROR: Address {} is already in use. '
                    'Please specify a different port via `--port <PORT>`.'
                ).format(endpoint),
                file=sys.stderr,
            )
        elif ex.errno == errno.EADDRNOTAVAIL or 'cannot assign requested address' in message:
            cprint(
                (
                    'ERROR: Cannot assign requested address at {}. '
                    'Please specify a different address via `--bind-address <ADDRESS>`.'
                ).format(endpoint),
                file=sys.stderr,
            )
        else:
            cprint(f'ERROR: {ex}', file=sys.stderr)
        return 1

    cprint(
        'INFO: Start the exporter on {} at {}.'.format(
            colored(args.hostname, color='magenta', attrs=('bold',)),
            colored(
                f'http://{args.bind_address}:{args.port}/metrics',
                color='green',
                attrs=('bold', 'underline'),
            ),
        ),
        file=sys.stderr,
    )

    try:
        exporter.collect()
    except KeyboardInterrupt:
        # Emit an empty line first so the message starts on a fresh line.
        cprint(file=sys.stderr)
        cprint('INFO: Interrupted by user.', file=sys.stderr)

    return 0
# Allow running this module directly: the integer returned by main() becomes
# the process exit status.
if __name__ == '__main__':
sys.exit(main())

View file

@ -0,0 +1,608 @@
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
#
# Copyright 2021-2023 Xuehai Pan. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Prometheus exporter built on top of ``nvitop``."""
from __future__ import annotations
import math
import time
from typing import Sequence
from prometheus_client import REGISTRY, CollectorRegistry, Gauge, Info
from nvitop import Device, MiB, MigDevice, PhysicalDevice, host
from nvitop.api.process import GpuProcess
from nvitop_exporter.utils import get_ip_address
class PrometheusExporter: # pylint: disable=too-many-instance-attributes
"""Prometheus exporter built on top of ``nvitop``."""
def __init__( # pylint: disable=too-many-statements
self,
devices: Sequence[Device],
hostname: str | None = None,
*,
registry: CollectorRegistry = REGISTRY,
interval: float = 1.0,
) -> None:
"""Initialize the Prometheus exporter."""
if not isinstance(devices, (list, tuple)):
raise TypeError(f'Expected a list or tuple of devices, got {type(devices)}')
devices = list(devices)
for device in devices:
if not isinstance(device, (PhysicalDevice, MigDevice)):
raise TypeError(f'Expected a PhysicalDevice or MigDevice, got {type(device)}')
self.devices = devices
self.hostname = hostname or get_ip_address()
self.registry = registry
self.interval = interval
self.info = Info(
'nvitop',
documentation='NVITOP.',
labelnames=['hostname'],
registry=self.registry,
)
self.info.labels(hostname=self.hostname).info(
{
'device_count': str(Device.count()),
'driver_version': Device.driver_version(),
'cuda_driver_version': Device.cuda_driver_version(),
},
)
# Create gauges for host metrics
self.host_uptime = Gauge(
name='host_uptime',
documentation='Host uptime (s).',
unit='Second',
labelnames=['hostname'],
registry=self.registry,
)
self.host_cpu_percent = Gauge(
name='host_cpu_percent',
documentation='Host CPU percent (%).',
unit='Percentage',
labelnames=['hostname'],
registry=self.registry,
)
self.host_virtual_memory_total = Gauge(
name='host_virtual_memory_total',
documentation='Host virtual memory total (MiB).',
unit='MiB',
labelnames=['hostname'],
registry=self.registry,
)
self.host_virtual_memory_used = Gauge(
name='host_virtual_memory_used',
documentation='Host virtual memory used (MiB).',
unit='MiB',
labelnames=['hostname'],
registry=self.registry,
)
self.host_virtual_memory_free = Gauge(
name='host_virtual_memory_free',
documentation='Host virtual memory free (MiB).',
unit='MiB',
labelnames=['hostname'],
registry=self.registry,
)
self.host_virtual_memory_percent = Gauge(
name='host_virtual_memory_percent',
documentation='Host virtual memory percent (%).',
unit='Percentage',
labelnames=['hostname'],
registry=self.registry,
)
self.host_swap_memory_total = Gauge(
name='host_swap_memory_total',
documentation='Host swap total (MiB).',
unit='MiB',
labelnames=['hostname'],
registry=self.registry,
)
self.host_swap_memory_used = Gauge(
name='host_swap_memory_used',
documentation='Host swap used (MiB).',
unit='MiB',
labelnames=['hostname'],
registry=self.registry,
)
self.host_swap_memory_free = Gauge(
name='host_swap_memory_free',
documentation='Host swap free (MiB).',
unit='MiB',
labelnames=['hostname'],
registry=self.registry,
)
self.host_swap_memory_percent = Gauge(
name='host_swap_memory_percent',
documentation='Host swap percent (%).',
unit='Percentage',
labelnames=['hostname'],
registry=self.registry,
)
self.host_load_average_1m = Gauge(
name='host_load_average_1m',
documentation='Host load average for the last minute.',
unit='Percentage',
labelnames=['hostname'],
registry=self.registry,
)
self.host_load_average_5m = Gauge(
name='host_load_average_5m',
documentation='Host load average for the last 5 minutes.',
unit='Percentage',
labelnames=['hostname'],
registry=self.registry,
)
self.host_load_average_15m = Gauge(
name='host_load_average_15m',
documentation='Host load average for the last 15 minutes.',
unit='Percentage',
labelnames=['hostname'],
registry=self.registry,
)
self.host_net_io_tx_data = Gauge(
name='host_net_io_tx_data',
documentation='Host network I/O transmitted data (MiB).',
unit='MiB',
labelnames=['hostname', 'interface'],
registry=self.registry,
)
self.host_net_io_rx_data = Gauge(
name='host_net_io_rx_data',
documentation='Host network I/O received data (MiB).',
unit='MiB',
labelnames=['hostname', 'interface'],
registry=self.registry,
)
self.host_net_io_tx_packets = Gauge(
name='host_net_io_tx_packets',
documentation='Host network I/O transmitted packets.',
unit='Packet',
labelnames=['hostname', 'interface'],
registry=self.registry,
)
self.host_net_io_rx_packets = Gauge(
name='host_net_io_rx_packets',
documentation='Host network I/O received packets.',
unit='Packet',
labelnames=['hostname', 'interface'],
registry=self.registry,
)
self.host_disk_io_read_data = Gauge(
name='host_disk_io_read_data',
documentation='Host disk I/O read data (MiB).',
unit='MiB',
labelnames=['hostname', 'partition'],
registry=self.registry,
)
self.host_disk_io_write_data = Gauge(
name='host_disk_io_write_data',
documentation='Host disk I/O write data (MiB).',
unit='MiB',
labelnames=['hostname', 'partition'],
registry=self.registry,
)
self.host_disk_usage_total = Gauge(
name='host_disk_usage_total',
documentation='Host disk usage total (MiB).',
unit='MiB',
labelnames=['hostname', 'mountpoint'],
registry=self.registry,
)
self.host_disk_usage_used = Gauge(
name='host_disk_usage_used',
documentation='Host disk usage used (MiB).',
unit='MiB',
labelnames=['hostname', 'mountpoint'],
registry=self.registry,
)
self.host_disk_usage_free = Gauge(
name='host_disk_usage_free',
documentation='Host disk usage free (MiB).',
unit='MiB',
labelnames=['hostname', 'mountpoint'],
registry=self.registry,
)
self.host_disk_usage_percent = Gauge(
name='host_disk_usage_percent',
documentation='Host disk usage percent (%).',
unit='Percentage',
labelnames=['hostname', 'mountpoint'],
registry=self.registry,
)
# Create gauges for GPU metrics
self.gpu_utilization = Gauge(
name='gpu_utilization',
documentation='GPU utilization (%).',
unit='Percentage',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_memory_utilization = Gauge(
name='gpu_memory_utilization',
documentation='GPU memory utilization (%).',
unit='Percentage',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_encoder_utilization = Gauge(
name='gpu_encoder_utilization',
documentation='GPU encoder utilization (%).',
unit='Percentage',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_decoder_utilization = Gauge(
name='gpu_decoder_utilization',
documentation='GPU decoder utilization (%).',
unit='Percentage',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_memory_total = Gauge(
name='gpu_memory_total',
documentation='GPU memory total (MiB).',
unit='MiB',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_memory_used = Gauge(
name='gpu_memory_used',
documentation='GPU memory used (MiB).',
unit='MiB',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_memory_free = Gauge(
name='gpu_memory_free',
documentation='GPU memory free (MiB).',
unit='MiB',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_memory_percent = Gauge(
name='gpu_memory_percent',
documentation='GPU memory percent (%).',
unit='Percentage',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_clock_sm = Gauge(
name='gpu_clock_sm',
documentation='GPU SM clock (MHz).',
unit='MHz',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_clock_memory = Gauge(
name='gpu_clock_memory',
documentation='GPU memory clock (MHz).',
unit='MHz',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_clock_graphics = Gauge(
name='gpu_clock_graphics',
documentation='GPU graphics clock (MHz).',
unit='MHz',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_clock_video = Gauge(
name='gpu_clock_video',
documentation='GPU video clock (MHz).',
unit='MHz',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_power_usage = Gauge(
name='gpu_power_usage',
documentation='GPU power usage (W).',
unit='W',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_power_limit = Gauge(
name='gpu_power_limit',
documentation='GPU power limit (W).',
unit='W',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_temperature = Gauge(
name='gpu_temperature',
documentation='GPU temperature (C).',
unit='C',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_fan_speed = Gauge(
name='gpu_fan_speed',
documentation='GPU fan speed (%).',
unit='Percentage',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_pcie_tx_throughput = Gauge(
name='gpu_pcie_tx_throughput',
documentation='GPU PCIe transmit throughput (MiB/s).',
unit='MiBps',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_pcie_rx_throughput = Gauge(
name='gpu_pcie_rx_throughput',
documentation='GPU PCIe receive throughput (MiB/s).',
unit='MiBps',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_nvlink_mean_tx_throughput = Gauge(
name='gpu_nvlink_mean_tx_throughput',
documentation='GPU mean NVLink transmit throughput (MiB/s).',
unit='MiBps',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_nvlink_mean_rx_throughput = Gauge(
name='gpu_nvlink_mean_rx_throughput',
documentation='GPU mean NVLink receive throughput (MiB/s).',
unit='MiBps',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_nvlink_tx_throughput = Gauge(
name='gpu_nvlink_tx_throughput',
documentation='GPU NVLink transmit throughput (MiB/s).',
unit='MiBps',
labelnames=['hostname', 'index', 'devicename', 'uuid', 'link'],
registry=self.registry,
)
self.gpu_nvlink_rx_throughput = Gauge(
name='gpu_nvlink_rx_throughput',
documentation='GPU NVLink receive throughput (MiB/s).',
unit='MiBps',
labelnames=['hostname', 'index', 'devicename', 'uuid', 'link'],
registry=self.registry,
)
# Create gauges for process metrics
self.process_running_time = Gauge(
name='process_running_time',
documentation='Process running time (s).',
unit='Second',
labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
registry=self.registry,
)
self.process_cpu_percent = Gauge(
name='process_cpu_percent',
documentation='Process CPU percent (%).',
unit='Percentage',
labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
registry=self.registry,
)
self.process_rss_memory = Gauge(
name='process_rss_memory',
documentation='Process memory resident set size (MiB).',
unit='MiB',
labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
registry=self.registry,
)
self.process_memory_percent = Gauge(
name='process_memory_percent',
documentation='Process memory percent (%).',
unit='Percentage',
labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
registry=self.registry,
)
self.process_gpu_memory = Gauge(
name='process_gpu_memory',
documentation='Process GPU memory (MiB).',
unit='MiB',
labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
registry=self.registry,
)
self.process_gpu_sm_utilization = Gauge(
name='process_gpu_sm_utilization',
documentation='Process GPU SM utilization (%).',
unit='Percentage',
labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
registry=self.registry,
)
self.process_gpu_memory_utilization = Gauge(
name='process_gpu_memory_utilization',
documentation='Process GPU memory utilization (%).',
unit='Percentage',
labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
registry=self.registry,
)
self.process_gpu_encoder_utilization = Gauge(
name='process_gpu_encoder_utilization',
documentation='Process GPU encoder utilization (%).',
unit='Percentage',
labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
registry=self.registry,
)
self.process_gpu_decoder_utilization = Gauge(
name='process_gpu_decoder_utilization',
documentation='Process GPU decoder utilization (%).',
unit='Percentage',
labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
registry=self.registry,
)
def collect(self) -> None:
    """Refresh host and device metrics forever, once per ``self.interval`` seconds.

    Each round updates the host gauges first and then every device in
    ``self.devices``. The time spent updating is deducted from the pause
    that follows, so rounds start roughly ``self.interval`` seconds apart.
    This method never returns on its own.
    """
    while True:
        # Fix the deadline before doing any work so slow updates shorten the pause.
        deadline = time.monotonic() + self.interval

        self.update_host()
        for gpu in self.devices:
            self.update_device(gpu)

        remaining = deadline - time.monotonic()
        # Never hand a negative duration to sleep() when a round overran.
        time.sleep(max(0.0, remaining))
def update_host(self) -> None:
    """Refresh every host-level gauge from a fresh set of ``host`` readings.

    Published metrics:

    - uptime, CPU percent, virtual/swap memory and load averages, labelled by
      hostname only;
    - network I/O counters, labelled by hostname and interface;
    - disk I/O counters, labelled by hostname and partition;
    - disk usage, labelled by hostname and mount point.

    Byte quantities are divided by ``MiB`` before being published. Mount
    points whose usage query raises ``OSError`` or ``host.PsutilError`` are
    skipped.
    """
    hostname = self.hostname

    loads = host.load_average()
    if loads is None:
        # No load average reported: publish zeros instead.
        loads = (0.0, 0.0, 0.0)  # type: ignore[unreachable]
    vmem = host.virtual_memory()
    swap = host.swap_memory()
    nic_counters = host.net_io_counters(pernic=True)  # type: ignore[attr-defined]
    disk_counters = host.disk_io_counters(perdisk=True)  # type: ignore[attr-defined]

    # Host-wide metrics carry the hostname as their single (positional) label.
    host_readings = [
        (self.host_uptime, host.uptime()),
        (self.host_cpu_percent, host.cpu_percent()),
        (self.host_virtual_memory_total, vmem.total / MiB),
        (self.host_virtual_memory_used, vmem.used / MiB),
        (self.host_virtual_memory_free, vmem.free / MiB),
        (self.host_virtual_memory_percent, vmem.percent),
        (self.host_swap_memory_total, swap.total / MiB),
        (self.host_swap_memory_used, swap.used / MiB),
        (self.host_swap_memory_free, swap.free / MiB),
        (self.host_swap_memory_percent, swap.percent),
        (self.host_load_average_1m, loads[0]),
        (self.host_load_average_5m, loads[1]),
        (self.host_load_average_15m, loads[2]),
    ]
    for metric, reading in host_readings:
        metric.labels(hostname).set(reading)

    # Network I/O: one time series per interface.
    for nic, nic_io in nic_counters.items():
        nic_labels = {'hostname': hostname, 'interface': nic}
        self.host_net_io_tx_data.labels(**nic_labels).set(nic_io.bytes_sent / MiB)
        self.host_net_io_rx_data.labels(**nic_labels).set(nic_io.bytes_recv / MiB)
        self.host_net_io_tx_packets.labels(**nic_labels).set(nic_io.packets_sent)
        self.host_net_io_rx_packets.labels(**nic_labels).set(nic_io.packets_recv)

    # Disk I/O: one time series per entry returned by ``disk_io_counters``.
    for disk, disk_io in disk_counters.items():
        disk_labels = {'hostname': hostname, 'partition': disk}
        self.host_disk_io_read_data.labels(**disk_labels).set(disk_io.read_bytes / MiB)
        self.host_disk_io_write_data.labels(**disk_labels).set(disk_io.write_bytes / MiB)

    # Disk usage: one time series per mount point that can be queried.
    for mounted in host.disk_partitions():  # type: ignore[attr-defined]
        try:
            usage = host.disk_usage(mounted.mountpoint)  # type: ignore[attr-defined]
        except (OSError, host.PsutilError):
            continue

        mount_labels = {'hostname': hostname, 'mountpoint': mounted.mountpoint}
        self.host_disk_usage_total.labels(**mount_labels).set(usage.total / MiB)
        self.host_disk_usage_used.labels(**mount_labels).set(usage.used / MiB)
        self.host_disk_usage_free.labels(**mount_labels).set(usage.free / MiB)
        self.host_disk_usage_percent.labels(**mount_labels).set(usage.percent)
def update_device(self, device: Device) -> None:
"""Update metrics for a single device.

Publishes the device-level gauges (utilization, memory, clocks, power,
temperature, fan speed, PCIe and mean NVLink throughput), the per-link
NVLink throughput gauges, and the per-process gauges for every process
returned by ``device.processes()``.

All samples are labelled with ``self.hostname`` and the device's index,
name and UUID; per-link samples add ``link`` and per-process samples add
``pid`` and ``username``.
"""
# An integer index is used as-is; any other index is treated as an iterable
# whose items are joined with ':'.
index = (
str(device.index) if isinstance(device.index, int) else ':'.join(map(str, device.index))
)
name = device.name()
uuid = device.uuid()
with device.oneshot():
# Every value in this table is evaluated before the first gauge is set.
for gauge, value in (
(self.gpu_utilization, float(device.gpu_utilization())),
(self.gpu_memory_utilization, float(device.memory_utilization())),
(self.gpu_encoder_utilization, float(device.encoder_utilization())),
(self.gpu_decoder_utilization, float(device.decoder_utilization())),
# Memory sizes are published in MiB.
(self.gpu_memory_total, device.memory_total() / MiB),
(self.gpu_memory_used, device.memory_used() / MiB),
(self.gpu_memory_free, device.memory_free() / MiB),
(self.gpu_memory_percent, float(device.memory_percent())),
(self.gpu_clock_sm, float(device.clock_infos().sm)),
(self.gpu_clock_memory, float(device.clock_infos().memory)),
(self.gpu_clock_graphics, float(device.clock_infos().graphics)),
(self.gpu_clock_video, float(device.clock_infos().video)),
# The power gauges are documented in W; presumably the device reports mW -- TODO confirm.
(self.gpu_power_usage, device.power_usage() / 1000.0),
(self.gpu_power_limit, device.power_limit() / 1000.0),
(self.gpu_temperature, float(device.temperature())),
(self.gpu_fan_speed, float(device.fan_speed())),
# The throughput gauges are documented in MiB/s; presumably the device reports KiB/s -- TODO confirm.
(self.gpu_pcie_tx_throughput, device.pcie_tx_throughput() / 1024.0),
(self.gpu_pcie_rx_throughput, device.pcie_rx_throughput() / 1024.0),
(self.gpu_nvlink_mean_tx_throughput, device.nvlink_mean_tx_throughput() / 1024.0),
(self.gpu_nvlink_mean_rx_throughput, device.nvlink_mean_rx_throughput() / 1024.0),
):
gauge.labels(
hostname=self.hostname,
index=index,
devicename=name,
uuid=uuid,
).set(value)
# Per-link NVLink throughput: the position in the returned sequence becomes the ``link`` label.
for gauge, nvlink_throughput in (
(self.gpu_nvlink_tx_throughput, device.nvlink_tx_throughput()),
(self.gpu_nvlink_rx_throughput, device.nvlink_rx_throughput()),
):
for link, throughput in enumerate(nvlink_throughput):
gauge.labels(
hostname=self.hostname,
index=index,
devicename=name,
uuid=uuid,
link=link,
).set(throughput / 1024.0)
# Per-process metrics for every process currently reported on this device.
# NOTE(review): ``GpuProcess.failsafe()`` presumably makes queries on exited
# processes return fallback values instead of raising -- confirm.
with GpuProcess.failsafe():
for pid, process in device.processes().items():
with process.oneshot():
username = process.username()
running_time = process.running_time()
for gauge, value in (
(
self.process_running_time,
# A falsy running time is published as NaN.
running_time.total_seconds() if running_time else math.nan,
),
(self.process_cpu_percent, process.cpu_percent()),
(self.process_rss_memory, process.host_memory() / MiB),
(self.process_memory_percent, float(process.memory_percent())),
(self.process_gpu_memory, process.gpu_memory() / MiB),
(
self.process_gpu_sm_utilization,
float(process.gpu_sm_utilization()),
),
(
self.process_gpu_memory_utilization,
float(process.gpu_memory_utilization()),
),
(
self.process_gpu_encoder_utilization,
float(process.gpu_encoder_utilization()),
),
(
self.process_gpu_decoder_utilization,
float(process.gpu_decoder_utilization()),
),
):
gauge.labels(
hostname=self.hostname,
index=index,
devicename=name,
uuid=uuid,
pid=pid,
username=username,
).set(value)

View file

@ -0,0 +1,38 @@
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
#
# Copyright 2021-2023 Xuehai Pan. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility functions for ``nvitop-exporter``."""
import socket
__all__ = ['get_ip_address']
# Reference: https://stackoverflow.com/a/28950776
def get_ip_address() -> str:
    """Return the local IPv4 address chosen for outbound traffic.

    A UDP socket is "connected" to a fixed private address and the local
    address bound to the socket is returned.

    Returns:
        The local address as a dotted-quad string, or ``'127.0.0.1'`` if the
        lookup raises for any reason.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
        sock.settimeout(0.0)
        try:
            # Doesn't even have to be reachable
            sock.connect(('10.254.254.254', 1))
            return sock.getsockname()[0]
        except Exception:  # noqa: BLE001 # pylint: disable=broad-except
            return '127.0.0.1'

View file

@ -0,0 +1,54 @@
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
#
# Copyright 2021-2023 Xuehai Pan. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Prometheus exporter built on top of ``nvitop``."""
# Static fallback version. For non-release builds it is refined below from
# ``git describe`` when that information is available.
__version__ = '1.3.0'
__license__ = 'Apache-2.0'
__author__ = __maintainer__ = 'Xuehai Pan'
__email__ = 'XuehaiPan@pku.edu.cn'
__release__ = False

if not __release__:
    import os
    import subprocess

    try:
        # ``git describe`` prints ``<tag>`` on a tagged commit, otherwise
        # ``<tag>-<commits since tag>-g<sha>``. The chain below turns
        # ``v1.2.3-4-gabcdef0`` into ``('1.2.3', '.dev', '4+gabcdef0')``.
        prefix, sep, suffix = (
            subprocess.check_output(
                ['git', 'describe', '--abbrev=7'],  # noqa: S603,S607
                cwd=os.path.dirname(os.path.abspath(__file__)),
                stderr=subprocess.DEVNULL,
                text=True,
            )
            .strip()
            .lstrip('v')
            .replace('-', '.dev', 1)
            .replace('-', '+', 1)
            .partition('.dev')
        )
        if sep:
            # Commits after a tag are a development version of the NEXT
            # release, so bump the last numeric component of the tag.
            version_prefix, dot, version_tail = prefix.rpartition('.')
            prefix = f'{version_prefix}{dot}{int(version_tail) + 1}'
            __version__ = sep.join((prefix, suffix))
            del version_prefix, dot, version_tail
        else:
            __version__ = prefix
        del prefix, sep, suffix
    except (OSError, subprocess.CalledProcessError, ValueError):
        # Version discovery is best effort. Keep the static version when git
        # is missing (OSError), the directory is not a usable checkout
        # (CalledProcessError), or the nearest tag does not end in a numeric
        # component (ValueError from ``int()``), e.g. when the package lives
        # inside an unrelated git repository. Importing must never fail here.
        pass

    del os, subprocess

View file

@ -0,0 +1,83 @@
# Packaging metadata and tool configuration for the ``nvitop-exporter`` distribution.

[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[project]
name = "nvitop-exporter"
description = "Prometheus exporter built on top of `nvitop`."
readme = "README.md"
requires-python = ">= 3.7"
authors = [{ name = "Xuehai Pan", email = "XuehaiPan@pku.edu.cn" }]
license = { text = "Apache License, Version 2.0 (Apache-2.0)" }
keywords = [
    "nvidia",
    "nvidia-smi",
    "NVIDIA",
    "NVML",
    "CUDA",
    "GPU",
    "top",
    "monitoring",
    "prometheus",
    "Prometheus",
    "grafana",
    "Grafana",
]
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "License :: OSI Approved :: Apache Software License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.7",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Operating System :: Microsoft :: Windows",
    "Operating System :: POSIX :: Linux",
    "Environment :: GPU",
    "Environment :: GPU :: NVIDIA CUDA",
    "Intended Audience :: Developers",
    "Intended Audience :: End Users/Desktop",
    "Intended Audience :: System Administrators",
    "Topic :: System :: Hardware",
    "Topic :: System :: Monitoring",
    "Topic :: System :: Systems Administration",
    "Topic :: Utilities",
]
dependencies = [
    # Sync with nvitop/version.py and requirements.txt
    "nvitop == 1.3.0",
    "prometheus-client >= 0.4.0",
]
# The version is supplied at build time by setup.py (from nvitop_exporter/version.py).
dynamic = ["version"]

[project.scripts]
nvitop-exporter = "nvitop_exporter.cli:main"

[project.urls]
Homepage = "https://github.com/XuehaiPan/nvitop"
Repository = "https://github.com/XuehaiPan/nvitop"
Documentation = "https://nvitop.readthedocs.io"
"Bug Report" = "https://github.com/XuehaiPan/nvitop/issues"

[tool.setuptools.packages.find]
include = ["nvitop_exporter", "nvitop_exporter.*"]

[tool.black]
safe = true
line-length = 100
skip-string-normalization = true
target-version = ["py37", "py38", "py39", "py310", "py311"]

[tool.isort]
atomic = true
profile = "black"
src_paths = ["nvitop_exporter"]
known_first_party = ["nvitop", "nvitop_exporter"]
indent = 4
line_length = 100
lines_after_imports = 2
multi_line_output = 3

[tool.ruff]
# Reuse the lint configuration in the parent directory's pyproject.toml.
extend = "../pyproject.toml"

View file

@ -0,0 +1,2 @@
nvitop
prometheus-client >= 0.4.0

44
nvitop-exporter/setup.py Executable file
View file

@ -0,0 +1,44 @@
#!/usr/bin/env python3

"""Setup script for ``nvitop-exporter``.

For non-release builds the version computed by ``nvitop_exporter/version.py``
is temporarily written into that file as a literal while ``setup()`` runs,
and the original file content is written back afterwards.
"""

import pathlib
import re
import sys

from setuptools import setup


HERE = pathlib.Path(__file__).absolute().parent
VERSION_FILE = HERE / 'nvitop_exporter' / 'version.py'

# Import ``version.py`` as a top-level module by putting its directory first on
# the module search path, rather than importing it through the package.
sys.path.insert(0, str(VERSION_FILE.parent))
# pylint: disable-next=import-error,wrong-import-position
import version  # noqa


# Original text of ``version.py``; stays ``None`` unless the file was rewritten.
VERSION_CONTENT = None

try:
    if not version.__release__:
        try:
            VERSION_CONTENT = VERSION_FILE.read_text(encoding='utf-8')
            # Replace the quoted ``__version__`` literal with the computed version.
            version_literal = re.compile(r"""__version__\s*=\s*('[^']+'|"[^"]+")""")
            pinned_source = version_literal.sub(
                f'__version__ = {version.__version__!r}',
                VERSION_CONTENT,
            )
            VERSION_FILE.write_text(data=pinned_source, encoding='utf-8')
        except OSError:
            # The file could not be read or rewritten: build without pinning
            # and skip the restore step below.
            VERSION_CONTENT = None

    setup(
        name='nvitop-exporter',
        version=version.__version__,
    )
finally:
    # Put the original source back, whether or not ``setup()`` succeeded.
    if VERSION_CONTENT is not None:
        with VERSION_FILE.open(mode='wt', encoding='utf-8', newline='') as file:
            file.write(VERSION_CONTENT)

View file

@ -29,18 +29,37 @@ from nvitop.api.device import (
)
from nvitop.api.libnvml import NVMLError, nvmlCheckReturn
from nvitop.api.process import GpuProcess, HostProcess, command_join
from nvitop.api.utils import * # noqa: F403
from nvitop.api.utils import ( # explicitly export these to appease mypy
NA,
SIZE_UNITS,
UINT_MAX,
ULONGLONG_MAX,
GiB,
KiB,
MiB,
NaType,
NotApplicable,
NotApplicableType,
PiB,
Snapshot,
TiB,
boolify,
bytes2human,
colored,
human2bytes,
set_color,
timedelta2human,
utilization2string,
)
__all__ = [
'take_snapshots',
'collect_in_background',
'ResourceMetricCollector',
'libnvml',
'nvmlCheckReturn',
'NVMLError',
'nvmlCheckReturn',
'libnvml',
'libcuda',
'libcudart',
# nvitop.api.device
'Device',
'PhysicalDevice',
'MigDevice',
@ -48,9 +67,34 @@ __all__ = [
'CudaMigDevice',
'parse_cuda_visible_devices',
'normalize_cuda_visible_devices',
# nvitop.api.process
'host',
'HostProcess',
'GpuProcess',
'command_join',
*utils.__all__,
# nvitop.api.collector
'take_snapshots',
'collect_in_background',
'ResourceMetricCollector',
# nvitop.api.utils
'NA',
'NaType',
'NotApplicable',
'NotApplicableType',
'UINT_MAX',
'ULONGLONG_MAX',
'KiB',
'MiB',
'GiB',
'TiB',
'PiB',
'SIZE_UNITS',
'bytes2human',
'human2bytes',
'timedelta2human',
'utilization2string',
'colored',
'set_color',
'boolify',
'Snapshot',
]

View file

@ -1154,7 +1154,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
gpu_percent = gpu_utilization # in percentage
def memory_utilization(self) -> float | NaType: # in percentage
def memory_utilization(self) -> int | NaType: # in percentage
"""Percent of time over the past sample period during which global (device) memory was being read or written.
The sample period may be between 1 second and 1/6 second depending on the product.
@ -1170,7 +1170,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
""" # pylint: disable=line-too-long
return self.utilization_rates().memory
def encoder_utilization(self) -> float | NaType: # in percentage
def encoder_utilization(self) -> int | NaType: # in percentage
"""The encoder utilization rate in percentage.
Returns: Union[int, NaType]
@ -1178,7 +1178,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
"""
return self.utilization_rates().encoder
def decoder_utilization(self) -> float | NaType: # in percentage\
def decoder_utilization(self) -> int | NaType: # in percentage
"""The decoder utilization rate in percentage.
Returns: Union[int, NaType]
@ -2120,8 +2120,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
self.handle,
# Only utilization samples that were recorded after this timestamp will be returned.
# The CPU timestamp, i.e. absolute Unix epoch timestamp (in microseconds), is used.
# Here we use the timestamp 1/4 second ago to ensure the record buffer is not empty.
time.time_ns() // 1000 - 250_000,
# Here we use the timestamp 1 second ago to ensure the record buffer is not empty.
time.time_ns() // 1000 - 1000_000,
default=(),
)
for s in sorted(samples, key=lambda s: s.timeStamp):

View file

@ -730,10 +730,11 @@ def memoize_when_activated(method: Method) -> Method:
"""
@functools.wraps(method)
def wrapped(self, *args, **kwargs): # noqa: ANN001,ANN002,ANN003,ANN202
def wrapped(self: object, *args: Any, **kwargs: Any) -> Any:
try:
# case 1: we previously entered oneshot() ctx
ret = self._cache[method] # pylint: disable=protected-access
# pylint: disable-next=protected-access
ret = self._cache[method] # type: ignore[attr-defined]
except AttributeError:
# case 2: we never entered oneshot() ctx
return method(self, *args, **kwargs)
@ -742,25 +743,28 @@ def memoize_when_activated(method: Method) -> Method:
# for this entry yet
ret = method(self, *args, **kwargs)
try:
self._cache[method] = ret # pylint: disable=protected-access
# pylint: disable-next=protected-access
self._cache[method] = ret # type: ignore[attr-defined]
except AttributeError:
# multi-threading race condition, see:
# https://github.com/giampaolo/psutil/issues/1948
pass
return ret
def cache_activate(self): # noqa: ANN001,ANN202
def cache_activate(self: object) -> None:
"""Activate cache.
Expects an instance. Cache will be stored as a "_cache" instance attribute.
"""
if not hasattr(self, '_cache'):
self._cache = {} # pylint: disable=protected-access
# pylint: disable-next=protected-access
self._cache = {} # type: ignore[attr-defined]
def cache_deactivate(self): # noqa: ANN001,ANN202
def cache_deactivate(self: object) -> None:
"""Deactivate and clear cache."""
try:
del self._cache # pylint: disable=protected-access
# pylint: disable-next=protected-access
del self._cache # type: ignore[attr-defined]
except AttributeError:
pass

View file

@ -24,7 +24,7 @@ NVITOP_MONITOR_MODE = set(
# pylint: disable=too-many-branches,too-many-statements
def parse_arguments() -> argparse.Namespace:
"""Parse command-line arguments for ``nvtiop``."""
"""Parse command-line arguments for ``nvitop``."""
coloring_rules = '{} < th1 %% <= {} < th2 %% <= {}'.format(
colored('light', 'green'),
colored('moderate', 'yellow'),

View file

@ -17,7 +17,7 @@
"""An interactive NVIDIA-GPU process viewer and beyond, the one-stop solution for GPU process management."""
__version__ = '1.2.0'
__license__ = 'GPLv3'
__license__ = 'GPL-3.0-only AND Apache-2.0'
__author__ = __maintainer__ = 'Xuehai Pan'
__email__ = 'XuehaiPan@pku.edu.cn'
__release__ = False

View file

@ -76,7 +76,8 @@ target-version = ["py37", "py38", "py39", "py310", "py311"]
[tool.isort]
atomic = true
profile = "black"
src_paths = ["nvitop"]
src_paths = ["nvitop", "nvitop-exporter/nvitop_exporter"]
known_first_party = ["nvitop", "nvitop_exporter"]
indent = 4
line_length = 100
lines_after_imports = 2
@ -85,14 +86,16 @@ multi_line_output = 3
[tool.mypy]
# Sync with requires-python
python_version = 3.8 # appease mypy for syntax errors in numpy stubs
mypy_path = [".", "nvitop-exporter"]
exclude = ["nvitop-exporter/setup.py"]
pretty = true
show_error_codes = true
show_error_context = true
show_traceback = true
allow_redefinition = true
check_untyped_defs = true
disallow_incomplete_defs = false
disallow_untyped_defs = false
disallow_incomplete_defs = true
disallow_untyped_defs = true
ignore_missing_imports = true
no_implicit_optional = true
strict_equality = true
@ -119,7 +122,7 @@ ignore-words = "docs/source/spelling_wordlist.txt"
target-version = "py37"
line-length = 100
show-source = true
src = ["nvitop"]
src = ["nvitop", "nvitop-exporter/nvitop_exporter"]
select = [
"E", "W", # pycodestyle
"F", # pyflakes