mirror of
https://github.com/XuehaiPan/nvitop.git
synced 2026-05-15 14:15:55 -06:00
feat(exporter): add Prometheus exporter (#92)
This commit is contained in:
parent
9ff3ec3400
commit
daf72c7bf3
24 changed files with 1475 additions and 37 deletions
42
.github/workflows/build.yaml
vendored
42
.github/workflows/build.yaml
vendored
|
|
@ -72,15 +72,22 @@ jobs:
|
|||
python -m venv venv &&
|
||||
(
|
||||
source venv/bin/activate &&
|
||||
python -m pip install --upgrade pip setuptools pre-commit pylint[spelling] mypy typing-extensions
|
||||
python -m pip install --upgrade pip setuptools pre-commit pylint[spelling] mypy typing-extensions &&
|
||||
python -m pip install -r requirements.txt &&
|
||||
python -m pip install -r nvitop-exporter/requirements.txt &&
|
||||
python -m pre_commit install --install-hooks &&
|
||||
python -m pre_commit run --all-files &&
|
||||
python -c 'import nvitop' &&
|
||||
python -m nvitop --version &&
|
||||
python -m nvitop --help &&
|
||||
python -m nvitop.select --version &&
|
||||
python -m nvitop.select --help
|
||||
python -m nvitop.select --help &&
|
||||
(
|
||||
cd nvitop-exporter &&
|
||||
python -c 'import nvitop_exporter' &&
|
||||
python -m nvitop_exporter --version &&
|
||||
python -m nvitop_exporter --help
|
||||
)
|
||||
)
|
||||
|
||||
- name: Test docker build
|
||||
|
|
@ -92,12 +99,17 @@ jobs:
|
|||
if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch'
|
||||
run: |
|
||||
sed -i -E 's/^__release__\s*=.*$/__release__ = True/' nvitop/version.py
|
||||
sed -i -E 's/^__release__\s*=.*$/__release__ = True/' nvitop-exporter/nvitop_exporter/version.py
|
||||
|
||||
- name: Print version
|
||||
run: python setup.py --version
|
||||
run: |
|
||||
python setup.py --version
|
||||
python nvitop-exporter/setup.py --version
|
||||
|
||||
- name: Build sdist and wheels
|
||||
run: python -m build
|
||||
run: |
|
||||
python -m build --outdir dist .
|
||||
python -m build --outdir dist nvitop-exporter
|
||||
|
||||
- name: List built sdist and wheels
|
||||
run: ls -lh dist/
|
||||
|
|
@ -135,15 +147,23 @@ jobs:
|
|||
if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch'
|
||||
run: |
|
||||
sed -i -E 's/^__release__\s*=.*$/__release__ = True/' nvitop/version.py
|
||||
sed -i -E 's/^__release__\s*=.*$/__release__ = True/' nvitop-exporter/nvitop_exporter/version.py
|
||||
|
||||
- name: Print version
|
||||
run: python setup.py --version
|
||||
run: |
|
||||
python setup.py --version
|
||||
python nvitop-exporter/setup.py --version
|
||||
|
||||
- name: Check consistency between the package version and release tag
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
run: |
|
||||
PACKAGE_VER="v$(python setup.py --version)"
|
||||
RELEASE_TAG="${GITHUB_REF#refs/*/}"
|
||||
PACKAGE_VER="v$(python setup.py --version)"
|
||||
if [[ "${PACKAGE_VER}" != "${RELEASE_TAG}" ]]; then
|
||||
echo "package ver. (${PACKAGE_VER}) != release tag. (${RELEASE_TAG})"
|
||||
exit 1
|
||||
fi
|
||||
PACKAGE_VER="v$(python nvitop-exporter/setup.py --version)"
|
||||
if [[ "${PACKAGE_VER}" != "${RELEASE_TAG}" ]]; then
|
||||
echo "package ver. (${PACKAGE_VER}) != release tag. (${RELEASE_TAG})"
|
||||
exit 1
|
||||
|
|
@ -163,10 +183,10 @@ jobs:
|
|||
with:
|
||||
user: __token__
|
||||
password: ${{ secrets.TESTPYPI_UPLOAD_TOKEN }}
|
||||
repository_url: https://test.pypi.org/legacy/
|
||||
repository-url: https://test.pypi.org/legacy/
|
||||
verbose: true
|
||||
print_hash: true
|
||||
skip_existing: true
|
||||
print-hash: true
|
||||
skip-existing: true
|
||||
|
||||
- name: Publish to PyPI
|
||||
if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch'
|
||||
|
|
@ -175,5 +195,5 @@ jobs:
|
|||
user: __token__
|
||||
password: ${{ secrets.PYPI_UPLOAD_TOKEN }}
|
||||
verbose: true
|
||||
print_hash: true
|
||||
skip_existing: true
|
||||
print-hash: true
|
||||
skip-existing: true
|
||||
|
|
|
|||
27
.github/workflows/lint.yaml
vendored
27
.github/workflows/lint.yaml
vendored
|
|
@ -40,6 +40,10 @@ jobs:
|
|||
- name: Check syntax (Python 3.7)
|
||||
run: |
|
||||
"${{ steps.py37.outputs.python-path }}" -m compileall nvitop
|
||||
(
|
||||
cd nvitop-exporter &&
|
||||
"${{ steps.py37.outputs.python-path }}" -m compileall nvitop_exporter
|
||||
)
|
||||
|
||||
- name: Upgrade pip
|
||||
run: |
|
||||
|
|
@ -67,6 +71,29 @@ jobs:
|
|||
"${{ steps.py37.outputs.python-path }}" -m nvitop.select --version
|
||||
"${{ steps.py37.outputs.python-path }}" -m nvitop.select --help
|
||||
|
||||
- name: Install dependencies for nvitop-exporter
|
||||
run: |
|
||||
python -m pip install -r nvitop-exporter/requirements.txt
|
||||
|
||||
- name: Import tests for nvitop-exporter
|
||||
run: |
|
||||
(
|
||||
cd nvitop-exporter &&
|
||||
python -c 'import nvitop_exporter' &&
|
||||
python -m nvitop_exporter --version &&
|
||||
python -m nvitop_exporter --help
|
||||
)
|
||||
|
||||
- name: Import tests for nvitop-exporter (Python 3.7)
|
||||
run: |
|
||||
(
|
||||
cd nvitop-exporter &&
|
||||
"${{ steps.py37.outputs.python-path }}" -m pip install -r requirements.txt &&
|
||||
"${{ steps.py37.outputs.python-path }}" -c 'import nvitop_exporter' &&
|
||||
"${{ steps.py37.outputs.python-path }}" -m nvitop_exporter --version &&
|
||||
"${{ steps.py37.outputs.python-path }}" -m nvitop_exporter --help
|
||||
)
|
||||
|
||||
- name: Install linters
|
||||
run: |
|
||||
python -m pip install --upgrade pre-commit pylint[spelling] mypy typing-extensions
|
||||
|
|
|
|||
|
|
@ -88,3 +88,7 @@ repos:
|
|||
language: system
|
||||
types_or: [python, pyi]
|
||||
require_serial: true
|
||||
exclude: |
|
||||
(?x)(
|
||||
^nvitop-exporter/setup.py$
|
||||
)
|
||||
|
|
|
|||
|
|
@ -421,7 +421,8 @@ confidence=HIGH,
|
|||
# no Warning level messages displayed, use "--disable=all --enable=classes
|
||||
# --disable=W".
|
||||
disable=consider-using-f-string,
|
||||
duplicate-code
|
||||
duplicate-code,
|
||||
wrong-import-order
|
||||
|
||||
# Enable the message, report, category or checker with the given id(s). You can
|
||||
# either give multiple identifier separated by comma (,) or put this option
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
|
||||
### Added
|
||||
|
||||
- Add Prometheus exporter by [@XuehaiPan](https://github.com/XuehaiPan) in [#92](https://github.com/XuehaiPan/nvitop/pull/92).
|
||||
- Add device APIs to query PCIe and NVLink throughput by [@XuehaiPan](https://github.com/XuehaiPan) in [#87](https://github.com/XuehaiPan/nvitop/pull/87).
|
||||
|
||||
### Changed
|
||||
|
|
|
|||
|
|
@ -151,3 +151,5 @@ tx
|
|||
rx
|
||||
ThroughputInfo
|
||||
pytorch
|
||||
api
|
||||
utils
|
||||
|
|
|
|||
202
nvitop-exporter/LICENSE
Normal file
202
nvitop-exporter/LICENSE
Normal file
|
|
@ -0,0 +1,202 @@
|
|||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright 2021-2023 Xuehai Pan. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
1
nvitop-exporter/MANIFEST.in
Normal file
1
nvitop-exporter/MANIFEST.in
Normal file
|
|
@ -0,0 +1 @@
|
|||
include LICENSE
|
||||
11
nvitop-exporter/README.md
Normal file
11
nvitop-exporter/README.md
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
# nvitop-exporter
|
||||
|
||||
Prometheus exporter built on top of `nvitop`.
|
||||
|
||||
## Installation
|
||||
|
||||
Install from PyPI:
|
||||
|
||||
```bash
|
||||
pip3 install --upgrade nvitop-exporter
|
||||
```
|
||||
24
nvitop-exporter/nvitop_exporter/__init__.py
Normal file
24
nvitop-exporter/nvitop_exporter/__init__.py
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
|
||||
#
|
||||
# Copyright 2021-2023 Xuehai Pan. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Prometheus exporter built on top of ``nvitop``."""
|
||||
|
||||
from nvitop_exporter.exporter import PrometheusExporter
|
||||
from nvitop_exporter.utils import get_ip_address
|
||||
from nvitop_exporter.version import __version__
|
||||
|
||||
|
||||
__all__ = ['PrometheusExporter', 'get_ip_address']
|
||||
25
nvitop-exporter/nvitop_exporter/__main__.py
Normal file
25
nvitop-exporter/nvitop_exporter/__main__.py
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
|
||||
#
|
||||
# Copyright 2021-2023 Xuehai Pan. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Prometheus exporter built on top of ``nvitop``."""
|
||||
|
||||
import sys
|
||||
|
||||
from nvitop_exporter.cli import main
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
240
nvitop-exporter/nvitop_exporter/cli.py
Normal file
240
nvitop-exporter/nvitop_exporter/cli.py
Normal file
|
|
@ -0,0 +1,240 @@
|
|||
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
|
||||
#
|
||||
# Copyright 2021-2023 Xuehai Pan. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Prometheus exporter built on top of ``nvitop``."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from typing import TextIO
|
||||
|
||||
from prometheus_client import start_wsgi_server
|
||||
|
||||
import nvitop
|
||||
from nvitop import Device, colored, libnvml
|
||||
from nvitop_exporter.exporter import PrometheusExporter
|
||||
from nvitop_exporter.utils import get_ip_address
|
||||
from nvitop_exporter.version import __version__
|
||||
|
||||
|
||||
def cprint(text: str = '', *, file: TextIO | None = None) -> None:
    """Print ``text`` to ``file`` with a leading severity tag rendered in bold color.

    If ``text`` begins with one of the known tags (``INFO: ``, ``WARNING: ``,
    ``ERROR: `` or ``NVML ERROR: ``), the first occurrence of that tag (without
    its trailing space) is wrapped with ``colored``. Any other text is printed
    unchanged.

    Args:
        text: The message to print. Defaults to an empty string.
        file: The stream handed to :func:`print`; ``None`` lets :func:`print`
            pick its default stream.
    """
    # Insertion order matters: the tags are tested in exactly this sequence.
    severity_colors = {
        'INFO: ': 'yellow',
        'WARNING: ': 'yellow',
        'ERROR: ': 'red',
        'NVML ERROR: ': 'red',
    }
    for tag, tag_color in severity_colors.items():
        if not text.startswith(tag):
            continue
        label = tag.rstrip()
        highlighted = colored(label, color=tag_color, attrs=('bold',))
        # Only the leading tag is highlighted, hence the replacement count of 1.
        text = text.replace(label, highlighted, 1)
    print(text, file=file)
|
||||
|
||||
|
||||
def parse_arguments() -> argparse.Namespace:
    """Parse command-line arguments for ``nvitop-exporter``.

    Besides the per-option type conversion done by :mod:`argparse`, two
    cross-checks are applied after parsing, each reported via ``parser.error``:

    - ``--interval`` must be at least ``0.25`` seconds.
    - ``--port`` must lie in ``[0, 65535]``; values outside that range cannot be
      bound by a socket and would otherwise surface later as an uncaught
      ``OverflowError`` instead of a usage error.

    Returns:
        The parsed namespace with the attributes ``hostname``, ``bind_address``,
        ``port`` and ``interval``.
    """

    def posfloat(argstring: str) -> float:
        # A ``ValueError`` raised by an argparse ``type`` callable is turned
        # into an "invalid ... value" usage error by argparse itself.
        num = float(argstring)
        if num <= 0:
            raise ValueError
        return num

    # argparse uses the callable's ``__name__`` in its error message.
    posfloat.__name__ = 'positive float'

    parser = argparse.ArgumentParser(
        prog='nvitop-exporter',
        description='Prometheus exporter built on top of `nvitop`.',
        formatter_class=argparse.RawTextHelpFormatter,
        add_help=False,
    )
    parser.add_argument(
        '--help',
        '-h',
        dest='help',
        action='help',
        default=argparse.SUPPRESS,
        help='Show this help message and exit.',
    )
    parser.add_argument(
        '--version',
        '-V',
        dest='version',
        action='version',
        version=f'%(prog)s {__version__} (nvitop {nvitop.__version__})',
        help="Show %(prog)s's version number and exit.",
    )

    parser.add_argument(
        '--hostname',
        '--host',
        '-H',
        dest='hostname',
        type=str,
        default=get_ip_address(),
        metavar='HOSTNAME',
        help='Hostname to display in the exporter. (default: %(default)s)',
    )
    parser.add_argument(
        '--bind-address',
        '--bind',
        '-B',
        dest='bind_address',
        type=str,
        default='127.0.0.1',
        metavar='ADDRESS',
        help='Local address to bind to. (default: %(default)s)',
    )
    parser.add_argument(
        '--port',
        '-p',
        type=int,
        default=8000,
        help='Port to listen on. (default: %(default)d)',
    )
    parser.add_argument(
        '--interval',
        dest='interval',
        type=posfloat,
        default=1.0,
        metavar='SEC',
        help='Interval between updates in seconds. (default: %(default)s)',
    )

    args = parser.parse_args()
    if args.interval < 0.25:
        parser.error(
            f'the interval {args.interval:0.2g}s is too short, which may cause performance issues. '
            f'Expected 1/4 or higher.',
        )
    # Reject ports a socket can never bind to, so the user gets a usage error
    # rather than a traceback when the server is started.
    if not 0 <= args.port <= 65535:
        parser.error(
            f'the port {args.port} is out of range. Expected an integer between 0 and 65535.',
        )

    return args
|
||||
|
||||
|
||||
def main() -> int:  # pylint: disable=too-many-locals,too-many-statements
    """Main function for ``nvitop-exporter`` CLI.

    Parses the command line, enumerates the physical and MIG devices, prints a
    device inventory to ``stderr``, starts the Prometheus WSGI server and then
    calls ``exporter.collect()`` until it returns or the user interrupts it.

    Returns:
        ``0`` once ``exporter.collect()`` has returned or was interrupted with
        ``KeyboardInterrupt``; ``1`` if NVML could not be queried, no device was
        found, or the HTTP server could not be started.
    """
    # Local import so this function does not depend on the module's import block.
    import errno  # pylint: disable=import-outside-toplevel

    args = parse_arguments()

    try:
        device_count = Device.count()
    except libnvml.NVMLError_LibraryNotFound:
        # NOTE(review): nothing is printed here; presumably the NVML wrapper
        # already reports the missing library itself -- confirm.
        return 1
    except libnvml.NVMLError as ex:
        cprint(f'NVML ERROR: {ex}', file=sys.stderr)
        return 1

    if device_count == 0:
        cprint('NVML ERROR: No NVIDIA devices found.', file=sys.stderr)
        return 1

    physical_devices = Device.from_indices(range(device_count))
    mig_devices = []
    for device in physical_devices:
        mig_devices.extend(device.mig_devices())
    cprint(
        'INFO: Found {}{}.'.format(
            colored(str(device_count), color='green', attrs=('bold',)),
            (
                ' physical device(s) and {} MIG device(s)'.format(
                    colored(str(len(mig_devices)), color='blue', attrs=('bold',)),
                )
                if mig_devices
                else ' device(s)'
            ),
        ),
        file=sys.stderr,
    )

    # Integer indices are wrapped in a 1-tuple so they can be ordered together
    # with non-integer indices (presumably tuples for MIG devices -- confirm).
    devices = sorted(
        physical_devices + mig_devices,  # type: ignore[operator]
        key=lambda d: (d.index,) if isinstance(d.index, int) else d.index,
    )
    for device in devices:
        name = device.name()
        uuid = device.uuid()
        if device.is_mig_device():
            # Keep only the last whitespace-separated token of the MIG device name.
            name = name.rpartition(' ')[-1]
            cprint(
                f'INFO: MIG {name:<11} Device {device.mig_index:>2d}: (UUID: {uuid})',
                file=sys.stderr,
            )
        else:
            cprint(f'INFO: GPU {device.index}: {name} (UUID: {uuid})', file=sys.stderr)

    exporter = PrometheusExporter(devices, hostname=args.hostname, interval=args.interval)

    try:
        start_wsgi_server(port=args.port, addr=args.bind_address)
    except OSError as ex:
        # Classify by errno first: the OS error text is platform dependent, so
        # substring matching alone misses e.g. Windows socket errors. The
        # substring checks are kept as a fallback for errors without an errno.
        message = str(ex).lower()
        if ex.errno == errno.EADDRINUSE or 'address already in use' in message:
            cprint(
                (
                    'ERROR: Address {} is already in use. '
                    'Please specify a different port via `--port <PORT>`.'
                ).format(
                    colored(
                        f'http://{args.bind_address}:{args.port}',
                        color='blue',
                        attrs=('bold', 'underline'),
                    ),
                ),
                file=sys.stderr,
            )
        elif ex.errno == errno.EADDRNOTAVAIL or 'cannot assign requested address' in message:
            cprint(
                (
                    'ERROR: Cannot assign requested address at {}. '
                    'Please specify a different address via `--bind-address <ADDRESS>`.'
                ).format(
                    colored(
                        f'http://{args.bind_address}:{args.port}',
                        color='blue',
                        attrs=('bold', 'underline'),
                    ),
                ),
                file=sys.stderr,
            )
        else:
            cprint(f'ERROR: {ex}', file=sys.stderr)
        return 1

    cprint(
        'INFO: Start the exporter on {} at {}.'.format(
            colored(args.hostname, color='magenta', attrs=('bold',)),
            colored(
                f'http://{args.bind_address}:{args.port}/metrics',
                color='green',
                attrs=('bold', 'underline'),
            ),
        ),
        file=sys.stderr,
    )

    try:
        exporter.collect()
    except KeyboardInterrupt:
        # Emit a blank line first so the message does not share a line with
        # whatever the terminal echoed for the interrupt.
        cprint(file=sys.stderr)
        cprint('INFO: Interrupted by user.', file=sys.stderr)

    return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
608
nvitop-exporter/nvitop_exporter/exporter.py
Normal file
608
nvitop-exporter/nvitop_exporter/exporter.py
Normal file
|
|
@ -0,0 +1,608 @@
|
|||
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
|
||||
#
|
||||
# Copyright 2021-2023 Xuehai Pan. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Prometheus exporter built on top of ``nvitop``."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
import time
|
||||
from typing import Sequence
|
||||
|
||||
from prometheus_client import REGISTRY, CollectorRegistry, Gauge, Info
|
||||
|
||||
from nvitop import Device, MiB, MigDevice, PhysicalDevice, host
|
||||
from nvitop.api.process import GpuProcess
|
||||
from nvitop_exporter.utils import get_ip_address
|
||||
|
||||
|
||||
class PrometheusExporter:  # pylint: disable=too-many-instance-attributes
    """Prometheus exporter built on top of ``nvitop``.

    Constructing an exporter registers one ``Info`` metric and a set of ``Gauge`` metrics (host,
    GPU and per-process) in the given registry. :meth:`collect` then refreshes the gauges in an
    endless loop. Every gauge carries a ``hostname`` label; GPU gauges add ``index``,
    ``devicename`` and ``uuid``; process gauges additionally add ``pid`` and ``username``.
    """

    def __init__(  # pylint: disable=too-many-statements
        self,
        devices: Sequence[Device],
        hostname: str | None = None,
        *,
        registry: CollectorRegistry = REGISTRY,
        interval: float = 1.0,
    ) -> None:
        """Initialize the Prometheus exporter.

        Args:
            devices: A list or tuple of ``PhysicalDevice`` / ``MigDevice`` instances to export.
            hostname: Value of the ``hostname`` label. Falls back to ``get_ip_address()`` when
                it is ``None`` or empty.
            registry: The Prometheus registry in which all metrics are registered.
            interval: Target period in seconds between two updates in :meth:`collect`.

        Raises:
            TypeError: If ``devices`` is not a list or tuple, or if any element is neither a
                ``PhysicalDevice`` nor a ``MigDevice``.
        """
        if not isinstance(devices, (list, tuple)):
            raise TypeError(f'Expected a list or tuple of devices, got {type(devices)}')
        devices = list(devices)

        for device in devices:
            if not isinstance(device, (PhysicalDevice, MigDevice)):
                raise TypeError(f'Expected a PhysicalDevice or MigDevice, got {type(device)}')

        self.devices = devices
        self.hostname = hostname or get_ip_address()
        self.registry = registry
        self.interval = interval

        # Label sets of the process gauges that were populated during the previous update of each
        # device (keyed by the device index label). Used to drop the series of exited processes.
        self._alive_process_labels: dict[str, set[tuple[object, ...]]] = {}

        self.info = Info(
            'nvitop',
            documentation='NVITOP.',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.info.labels(hostname=self.hostname).info(
            {
                'device_count': str(Device.count()),
                'driver_version': Device.driver_version(),
                'cuda_driver_version': Device.cuda_driver_version(),
            },
        )

        # Create gauges for host metrics
        self.host_uptime = Gauge(
            name='host_uptime',
            documentation='Host uptime (s).',
            unit='Second',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_cpu_percent = Gauge(
            name='host_cpu_percent',
            documentation='Host CPU percent (%).',
            unit='Percentage',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_virtual_memory_total = Gauge(
            name='host_virtual_memory_total',
            documentation='Host virtual memory total (MiB).',
            unit='MiB',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_virtual_memory_used = Gauge(
            name='host_virtual_memory_used',
            documentation='Host virtual memory used (MiB).',
            unit='MiB',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_virtual_memory_free = Gauge(
            name='host_virtual_memory_free',
            documentation='Host virtual memory free (MiB).',
            unit='MiB',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_virtual_memory_percent = Gauge(
            name='host_virtual_memory_percent',
            documentation='Host virtual memory percent (%).',
            unit='Percentage',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_swap_memory_total = Gauge(
            name='host_swap_memory_total',
            documentation='Host swap total (MiB).',
            unit='MiB',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_swap_memory_used = Gauge(
            name='host_swap_memory_used',
            documentation='Host swap used (MiB).',
            unit='MiB',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_swap_memory_free = Gauge(
            name='host_swap_memory_free',
            documentation='Host swap free (MiB).',
            unit='MiB',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_swap_memory_percent = Gauge(
            name='host_swap_memory_percent',
            documentation='Host swap percent (%).',
            unit='Percentage',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_load_average_1m = Gauge(
            name='host_load_average_1m',
            documentation='Host load average for the last minute.',
            unit='Percentage',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_load_average_5m = Gauge(
            name='host_load_average_5m',
            documentation='Host load average for the last 5 minutes.',
            unit='Percentage',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_load_average_15m = Gauge(
            name='host_load_average_15m',
            documentation='Host load average for the last 15 minutes.',
            unit='Percentage',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_net_io_tx_data = Gauge(
            name='host_net_io_tx_data',
            documentation='Host network I/O transmitted data (MiB).',
            unit='MiB',
            labelnames=['hostname', 'interface'],
            registry=self.registry,
        )
        self.host_net_io_rx_data = Gauge(
            name='host_net_io_rx_data',
            documentation='Host network I/O received data (MiB).',
            unit='MiB',
            labelnames=['hostname', 'interface'],
            registry=self.registry,
        )
        self.host_net_io_tx_packets = Gauge(
            name='host_net_io_tx_packets',
            documentation='Host network I/O transmitted packets.',
            unit='Packet',
            labelnames=['hostname', 'interface'],
            registry=self.registry,
        )
        self.host_net_io_rx_packets = Gauge(
            name='host_net_io_rx_packets',
            documentation='Host network I/O received packets.',
            unit='Packet',
            labelnames=['hostname', 'interface'],
            registry=self.registry,
        )
        self.host_disk_io_read_data = Gauge(
            name='host_disk_io_read_data',
            documentation='Host disk I/O read data (MiB).',
            unit='MiB',
            labelnames=['hostname', 'partition'],
            registry=self.registry,
        )
        self.host_disk_io_write_data = Gauge(
            name='host_disk_io_write_data',
            documentation='Host disk I/O write data (MiB).',
            unit='MiB',
            labelnames=['hostname', 'partition'],
            registry=self.registry,
        )
        self.host_disk_usage_total = Gauge(
            name='host_disk_usage_total',
            documentation='Host disk usage total (MiB).',
            unit='MiB',
            labelnames=['hostname', 'mountpoint'],
            registry=self.registry,
        )
        self.host_disk_usage_used = Gauge(
            name='host_disk_usage_used',
            documentation='Host disk usage used (MiB).',
            unit='MiB',
            labelnames=['hostname', 'mountpoint'],
            registry=self.registry,
        )
        self.host_disk_usage_free = Gauge(
            name='host_disk_usage_free',
            documentation='Host disk usage free (MiB).',
            unit='MiB',
            labelnames=['hostname', 'mountpoint'],
            registry=self.registry,
        )
        self.host_disk_usage_percent = Gauge(
            name='host_disk_usage_percent',
            documentation='Host disk usage percent (%).',
            unit='Percentage',
            labelnames=['hostname', 'mountpoint'],
            registry=self.registry,
        )

        # Create gauges for GPU metrics
        self.gpu_utilization = Gauge(
            name='gpu_utilization',
            documentation='GPU utilization (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_memory_utilization = Gauge(
            name='gpu_memory_utilization',
            documentation='GPU memory utilization (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_encoder_utilization = Gauge(
            name='gpu_encoder_utilization',
            documentation='GPU encoder utilization (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_decoder_utilization = Gauge(
            name='gpu_decoder_utilization',
            documentation='GPU decoder utilization (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_memory_total = Gauge(
            name='gpu_memory_total',
            documentation='GPU memory total (MiB).',
            unit='MiB',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_memory_used = Gauge(
            name='gpu_memory_used',
            documentation='GPU memory used (MiB).',
            unit='MiB',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_memory_free = Gauge(
            name='gpu_memory_free',
            documentation='GPU memory free (MiB).',
            unit='MiB',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_memory_percent = Gauge(
            name='gpu_memory_percent',
            documentation='GPU memory percent (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_clock_sm = Gauge(
            name='gpu_clock_sm',
            documentation='GPU SM clock (MHz).',
            unit='MHz',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_clock_memory = Gauge(
            name='gpu_clock_memory',
            documentation='GPU memory clock (MHz).',
            unit='MHz',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_clock_graphics = Gauge(
            name='gpu_clock_graphics',
            documentation='GPU graphics clock (MHz).',
            unit='MHz',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_clock_video = Gauge(
            name='gpu_clock_video',
            documentation='GPU video clock (MHz).',
            unit='MHz',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_power_usage = Gauge(
            name='gpu_power_usage',
            documentation='GPU power usage (W).',
            unit='W',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_power_limit = Gauge(
            name='gpu_power_limit',
            documentation='GPU power limit (W).',
            unit='W',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_temperature = Gauge(
            name='gpu_temperature',
            documentation='GPU temperature (C).',
            unit='C',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_fan_speed = Gauge(
            name='gpu_fan_speed',
            documentation='GPU fan speed (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_pcie_tx_throughput = Gauge(
            name='gpu_pcie_tx_throughput',
            documentation='GPU PCIe transmit throughput (MiB/s).',
            unit='MiBps',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_pcie_rx_throughput = Gauge(
            name='gpu_pcie_rx_throughput',
            documentation='GPU PCIe receive throughput (MiB/s).',
            unit='MiBps',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_nvlink_mean_tx_throughput = Gauge(
            name='gpu_nvlink_mean_tx_throughput',
            documentation='GPU mean NVLink transmit throughput (MiB/s).',
            unit='MiBps',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_nvlink_mean_rx_throughput = Gauge(
            name='gpu_nvlink_mean_rx_throughput',
            documentation='GPU mean NVLink receive throughput (MiB/s).',
            unit='MiBps',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_nvlink_tx_throughput = Gauge(
            name='gpu_nvlink_tx_throughput',
            documentation='GPU NVLink transmit throughput (MiB/s).',
            unit='MiBps',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'link'],
            registry=self.registry,
        )
        self.gpu_nvlink_rx_throughput = Gauge(
            name='gpu_nvlink_rx_throughput',
            documentation='GPU NVLink receive throughput (MiB/s).',
            unit='MiBps',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'link'],
            registry=self.registry,
        )

        # Create gauges for process metrics
        self.process_running_time = Gauge(
            name='process_running_time',
            documentation='Process running time (s).',
            unit='Second',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
            registry=self.registry,
        )
        self.process_cpu_percent = Gauge(
            name='process_cpu_percent',
            documentation='Process CPU percent (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
            registry=self.registry,
        )
        self.process_rss_memory = Gauge(
            name='process_rss_memory',
            documentation='Process memory resident set size (MiB).',
            unit='MiB',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
            registry=self.registry,
        )
        self.process_memory_percent = Gauge(
            name='process_memory_percent',
            documentation='Process memory percent (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
            registry=self.registry,
        )
        self.process_gpu_memory = Gauge(
            name='process_gpu_memory',
            documentation='Process GPU memory (MiB).',
            unit='MiB',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
            registry=self.registry,
        )
        self.process_gpu_sm_utilization = Gauge(
            name='process_gpu_sm_utilization',
            documentation='Process GPU SM utilization (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
            registry=self.registry,
        )
        self.process_gpu_memory_utilization = Gauge(
            name='process_gpu_memory_utilization',
            documentation='Process GPU memory utilization (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
            registry=self.registry,
        )
        self.process_gpu_encoder_utilization = Gauge(
            name='process_gpu_encoder_utilization',
            documentation='Process GPU encoder utilization (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
            registry=self.registry,
        )
        self.process_gpu_decoder_utilization = Gauge(
            name='process_gpu_decoder_utilization',
            documentation='Process GPU decoder utilization (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
            registry=self.registry,
        )

    def collect(self) -> None:
        """Collect metrics forever.

        Each iteration refreshes the host gauges and the gauges of every device, then sleeps for
        whatever is left of ``self.interval``. This method never returns on its own.
        """
        while True:
            # Schedule against a monotonic deadline so the update cost is not added to the period.
            next_update_time = time.monotonic() + self.interval
            self.update_host()
            for device in self.devices:
                self.update_device(device)
            time.sleep(max(0.0, next_update_time - time.monotonic()))

    def update_host(self) -> None:
        """Update metrics for the host."""
        load_average = host.load_average()
        if load_average is None:
            # Export zeros when no load average is available.
            load_average = (0.0, 0.0, 0.0)  # type: ignore[unreachable]
        virtual_memory = host.virtual_memory()
        swap_memory = host.swap_memory()
        net_io_counters = host.net_io_counters(pernic=True)  # type: ignore[attr-defined]
        disk_io_counters = host.disk_io_counters(perdisk=True)  # type: ignore[attr-defined]

        for gauge, value in (
            (self.host_uptime, host.uptime()),
            (self.host_cpu_percent, host.cpu_percent()),
            (self.host_virtual_memory_total, virtual_memory.total / MiB),
            (self.host_virtual_memory_used, virtual_memory.used / MiB),
            (self.host_virtual_memory_free, virtual_memory.free / MiB),
            (self.host_virtual_memory_percent, virtual_memory.percent),
            (self.host_swap_memory_total, swap_memory.total / MiB),
            (self.host_swap_memory_used, swap_memory.used / MiB),
            (self.host_swap_memory_free, swap_memory.free / MiB),
            (self.host_swap_memory_percent, swap_memory.percent),
            (self.host_load_average_1m, load_average[0]),
            (self.host_load_average_5m, load_average[1]),
            (self.host_load_average_15m, load_average[2]),
        ):
            gauge.labels(self.hostname).set(value)

        for interface, net_io_counter in net_io_counters.items():
            for gauge, value in (
                (self.host_net_io_tx_data, net_io_counter.bytes_sent / MiB),
                (self.host_net_io_rx_data, net_io_counter.bytes_recv / MiB),
                (self.host_net_io_tx_packets, net_io_counter.packets_sent),
                (self.host_net_io_rx_packets, net_io_counter.packets_recv),
            ):
                gauge.labels(hostname=self.hostname, interface=interface).set(value)

        for partition, disk_io_counter in disk_io_counters.items():
            for gauge, value in (
                (self.host_disk_io_read_data, disk_io_counter.read_bytes / MiB),
                (self.host_disk_io_write_data, disk_io_counter.write_bytes / MiB),
            ):
                gauge.labels(hostname=self.hostname, partition=partition).set(value)
        for partition in host.disk_partitions():  # type: ignore[attr-defined]
            try:
                partition_usage = host.disk_usage(partition.mountpoint)  # type: ignore[attr-defined]
            except (OSError, host.PsutilError):
                # Skip mount points whose usage cannot be queried.
                continue
            for gauge, value in (
                (self.host_disk_usage_total, partition_usage.total / MiB),
                (self.host_disk_usage_used, partition_usage.used / MiB),
                (self.host_disk_usage_free, partition_usage.free / MiB),
                (self.host_disk_usage_percent, partition_usage.percent),
            ):
                gauge.labels(hostname=self.hostname, mountpoint=partition.mountpoint).set(value)

    def update_device(self, device: Device) -> None:
        """Update metrics for a single device.

        Refreshes the GPU gauges and the per-link NVLink gauges of ``device``, then the gauges of
        every process returned by ``device.processes()``. Process series that were exported on the
        previous call for this device but whose process is no longer listed are removed, so that
        exited processes do not keep reporting their last values.
        """
        # An integer index is exported as-is; otherwise the index parts are joined with ':'.
        index = (
            str(device.index) if isinstance(device.index, int) else ':'.join(map(str, device.index))
        )
        name = device.name()
        uuid = device.uuid()

        with device.oneshot():
            for gauge, value in (
                (self.gpu_utilization, float(device.gpu_utilization())),
                (self.gpu_memory_utilization, float(device.memory_utilization())),
                (self.gpu_encoder_utilization, float(device.encoder_utilization())),
                (self.gpu_decoder_utilization, float(device.decoder_utilization())),
                (self.gpu_memory_total, device.memory_total() / MiB),
                (self.gpu_memory_used, device.memory_used() / MiB),
                (self.gpu_memory_free, device.memory_free() / MiB),
                (self.gpu_memory_percent, float(device.memory_percent())),
                (self.gpu_clock_sm, float(device.clock_infos().sm)),
                (self.gpu_clock_memory, float(device.clock_infos().memory)),
                (self.gpu_clock_graphics, float(device.clock_infos().graphics)),
                (self.gpu_clock_video, float(device.clock_infos().video)),
                # NOTE(review): presumably the device reports power in mW; scaled to match 'W'.
                (self.gpu_power_usage, device.power_usage() / 1000.0),
                (self.gpu_power_limit, device.power_limit() / 1000.0),
                (self.gpu_temperature, float(device.temperature())),
                (self.gpu_fan_speed, float(device.fan_speed())),
                # NOTE(review): presumably throughputs are in KiB/s; scaled to match 'MiBps'.
                (self.gpu_pcie_tx_throughput, device.pcie_tx_throughput() / 1024.0),
                (self.gpu_pcie_rx_throughput, device.pcie_rx_throughput() / 1024.0),
                (self.gpu_nvlink_mean_tx_throughput, device.nvlink_mean_tx_throughput() / 1024.0),
                (self.gpu_nvlink_mean_rx_throughput, device.nvlink_mean_rx_throughput() / 1024.0),
            ):
                gauge.labels(
                    hostname=self.hostname,
                    index=index,
                    devicename=name,
                    uuid=uuid,
                ).set(value)

        for gauge, nvlink_throughput in (
            (self.gpu_nvlink_tx_throughput, device.nvlink_tx_throughput()),
            (self.gpu_nvlink_rx_throughput, device.nvlink_rx_throughput()),
        ):
            for link, throughput in enumerate(nvlink_throughput):
                gauge.labels(
                    hostname=self.hostname,
                    index=index,
                    devicename=name,
                    uuid=uuid,
                    link=link,
                ).set(throughput / 1024.0)

        # Full label sets (in gauge label order) of the processes seen during this update.
        alive_process_labels: set[tuple[object, ...]] = set()

        with GpuProcess.failsafe():
            for pid, process in device.processes().items():
                with process.oneshot():
                    username = process.username()
                    running_time = process.running_time()
                    alive_process_labels.add((self.hostname, index, name, uuid, pid, username))
                    for gauge, value in (
                        (
                            self.process_running_time,
                            running_time.total_seconds() if running_time else math.nan,
                        ),
                        (self.process_cpu_percent, process.cpu_percent()),
                        (self.process_rss_memory, process.host_memory() / MiB),
                        (self.process_memory_percent, float(process.memory_percent())),
                        (self.process_gpu_memory, process.gpu_memory() / MiB),
                        (
                            self.process_gpu_sm_utilization,
                            float(process.gpu_sm_utilization()),
                        ),
                        (
                            self.process_gpu_memory_utilization,
                            float(process.gpu_memory_utilization()),
                        ),
                        (
                            self.process_gpu_encoder_utilization,
                            float(process.gpu_encoder_utilization()),
                        ),
                        (
                            self.process_gpu_decoder_utilization,
                            float(process.gpu_decoder_utilization()),
                        ),
                    ):
                        gauge.labels(
                            hostname=self.hostname,
                            index=index,
                            devicename=name,
                            uuid=uuid,
                            pid=pid,
                            username=username,
                        ).set(value)

        # Drop the series of processes that have disappeared since the previous update; otherwise
        # their last values would be exported forever and the number of series would only grow.
        stale_process_labels = self._alive_process_labels.get(index, set()) - alive_process_labels
        for labelvalues in stale_process_labels:
            for gauge in (
                self.process_running_time,
                self.process_cpu_percent,
                self.process_rss_memory,
                self.process_memory_percent,
                self.process_gpu_memory,
                self.process_gpu_sm_utilization,
                self.process_gpu_memory_utilization,
                self.process_gpu_encoder_utilization,
                self.process_gpu_decoder_utilization,
            ):
                try:
                    gauge.remove(*labelvalues)
                except KeyError:
                    # The series is already gone; nothing to clean up.
                    pass
        self._alive_process_labels[index] = alive_process_labels
|
||||
38
nvitop-exporter/nvitop_exporter/utils.py
Normal file
38
nvitop-exporter/nvitop_exporter/utils.py
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
|
||||
#
|
||||
# Copyright 2021-2023 Xuehai Pan. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Utility functions for ``nvitop-exporter``."""
|
||||
|
||||
import socket
|
||||
|
||||
|
||||
__all__ = ['get_ip_address']
|
||||
|
||||
|
||||
# Reference: https://stackoverflow.com/a/28950776
|
||||
def get_ip_address() -> str:
    """Return the local IP address of this machine as a string.

    A UDP socket is connected towards ``10.254.254.254`` and the local address the socket ends up
    with is returned. If anything goes wrong, ``'127.0.0.1'`` is returned instead.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
        sock.settimeout(0.0)
        try:
            # The target does not even have to be reachable
            sock.connect(('10.254.254.254', 1))
            return sock.getsockname()[0]
        except Exception:  # noqa: BLE001 # pylint: disable=broad-except
            return '127.0.0.1'
|
||||
54
nvitop-exporter/nvitop_exporter/version.py
Normal file
54
nvitop-exporter/nvitop_exporter/version.py
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
|
||||
#
|
||||
# Copyright 2021-2023 Xuehai Pan. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Prometheus exporter built on top of ``nvitop``."""
|
||||
|
||||
# Base version of the package. NOTE: setup.py substitutes this literal with a regular expression
# for non-release builds, so it must stay a plain quoted string assignment.
__version__ = '1.3.0'
__license__ = 'Apache-2.0'
__author__ = __maintainer__ = 'Xuehai Pan'
__email__ = 'XuehaiPan@pku.edu.cn'
# Release switch. The build workflow sets it to True with `sed` when publishing, which disables
# the git-based version computation below.
__release__ = False

if not __release__:
    import os
    import subprocess

    try:
        # Turn the output of `git describe` into a development version string. For example,
        # 'v1.3.0-5-gabc1234' becomes '1.3.0.dev5+gabc1234' and is split around '.dev' into
        # ('1.3.0', '.dev', '5+gabc1234'). When the output has no '-' (e.g. 'v1.3.0'), the
        # separator is empty and the prefix is the whole version.
        prefix, sep, suffix = (
            subprocess.check_output(
                ['git', 'describe', '--abbrev=7'],  # noqa: S603,S607
                cwd=os.path.dirname(os.path.abspath(__file__)),
                stderr=subprocess.DEVNULL,
                text=True,
            )
            .strip()
            .lstrip('v')
            .replace('-', '.dev', 1)
            .replace('-', '+', 1)
            .partition('.dev')
        )
        if sep:
            # Commits after a tag: bump the last numeric component of the tag so that the
            # development version is '1.3.1.dev5+gabc1234' in the example above.
            version_prefix, dot, version_tail = prefix.rpartition('.')
            prefix = f'{version_prefix}{dot}{int(version_tail) + 1}'
            __version__ = sep.join((prefix, suffix))
            del version_prefix, dot, version_tail
        else:
            __version__ = prefix
        del prefix, sep, suffix
    except (OSError, subprocess.CalledProcessError):
        # git is unavailable or `git describe` failed: keep the base version.
        pass

    # Do not leave the helper modules in the namespace of this module.
    del os, subprocess
|
||||
83
nvitop-exporter/pyproject.toml
Normal file
83
nvitop-exporter/pyproject.toml
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[project]
name = "nvitop-exporter"
description = "Prometheus exporter built on top of `nvitop`."
readme = "README.md"
requires-python = ">= 3.7"
authors = [{ name = "Xuehai Pan", email = "XuehaiPan@pku.edu.cn" }]
license = { text = "Apache License, Version 2.0 (Apache-2.0)" }
keywords = [
    "nvidia",
    "nvidia-smi",
    "NVIDIA",
    "NVML",
    "CUDA",
    "GPU",
    "top",
    "monitoring",
    "prometheus",
    "Prometheus",
    "grafana",
    "Grafana",
]
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "License :: OSI Approved :: Apache Software License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.7",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Operating System :: Microsoft :: Windows",
    "Operating System :: POSIX :: Linux",
    "Environment :: GPU",
    "Environment :: GPU :: NVIDIA CUDA",
    "Intended Audience :: Developers",
    "Intended Audience :: End Users/Desktop",
    "Intended Audience :: System Administrators",
    "Topic :: System :: Hardware",
    "Topic :: System :: Monitoring",
    "Topic :: System :: Systems Administration",
    "Topic :: Utilities",
]
dependencies = [
    # Sync with nvitop/version.py and requirements.txt
    # The exporter is pinned to the exact `nvitop` release it ships with.
    "nvitop == 1.3.0",
    "prometheus-client >= 0.4.0",
]
# The version is not declared statically; setup.py passes it in from nvitop_exporter/version.py.
dynamic = ["version"]

# Console command `nvitop-exporter`, implemented by `main` in nvitop_exporter/cli.py.
[project.scripts]
nvitop-exporter = "nvitop_exporter.cli:main"

[project.urls]
Homepage = "https://github.com/XuehaiPan/nvitop"
Repository = "https://github.com/XuehaiPan/nvitop"
Documentation = "https://nvitop.readthedocs.io"
"Bug Report" = "https://github.com/XuehaiPan/nvitop/issues"

[tool.setuptools.packages.find]
include = ["nvitop_exporter", "nvitop_exporter.*"]

[tool.black]
safe = true
line-length = 100
skip-string-normalization = true
target-version = ["py37", "py38", "py39", "py310", "py311"]

[tool.isort]
atomic = true
profile = "black"
src_paths = ["nvitop_exporter"]
known_first_party = ["nvitop", "nvitop_exporter"]
indent = 4
line_length = 100
lines_after_imports = 2
multi_line_output = 3

# Reuse the Ruff configuration from the pyproject.toml one directory up.
[tool.ruff]
extend = "../pyproject.toml"
|
||||
2
nvitop-exporter/requirements.txt
Normal file
2
nvitop-exporter/requirements.txt
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
nvitop
|
||||
prometheus-client >= 0.4.0
|
||||
44
nvitop-exporter/setup.py
Executable file
44
nvitop-exporter/setup.py
Executable file
|
|
@ -0,0 +1,44 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
"""Setup script for ``nvitop-exporter``."""
|
||||
|
||||
import pathlib
|
||||
import re
|
||||
import sys
|
||||
|
||||
from setuptools import setup
|
||||
|
||||
|
||||
# Directory containing this script and the version module of the package next to it.
HERE = pathlib.Path(__file__).absolute().parent
VERSION_FILE = HERE / 'nvitop_exporter' / 'version.py'

# Import version.py directly as the top-level module `version` by putting its directory on the
# module search path.
# NOTE(review): presumably done to avoid importing the `nvitop_exporter` package (and its
# dependencies) at build time -- confirm.
sys.path.insert(0, str(VERSION_FILE.parent))
# pylint: disable-next=import-error,wrong-import-position
import version  # noqa


# Original text of version.py while it is temporarily rewritten; None means nothing to restore.
VERSION_CONTENT = None

try:
    if not version.__release__:
        # Non-release build: replace the version literal in version.py with the version computed
        # at import time for the duration of the setup() call.
        try:
            VERSION_CONTENT = VERSION_FILE.read_text(encoding='utf-8')
            VERSION_FILE.write_text(
                data=re.sub(
                    r"""__version__\s*=\s*('[^']+'|"[^"]+")""",
                    f'__version__ = {version.__version__!r}',
                    string=VERSION_CONTENT,
                ),
                encoding='utf-8',
            )
        except OSError:
            # The file could not be read or written: build anyway and skip the restore step.
            VERSION_CONTENT = None

    setup(
        name='nvitop-exporter',
        version=version.__version__,
    )
finally:
    # Put the original version.py back, whether or not setup() succeeded.
    if VERSION_CONTENT is not None:
        with VERSION_FILE.open(mode='wt', encoding='utf-8', newline='') as file:
            file.write(VERSION_CONTENT)
|
||||
|
|
@ -29,18 +29,37 @@ from nvitop.api.device import (
|
|||
)
|
||||
from nvitop.api.libnvml import NVMLError, nvmlCheckReturn
|
||||
from nvitop.api.process import GpuProcess, HostProcess, command_join
|
||||
from nvitop.api.utils import * # noqa: F403
|
||||
from nvitop.api.utils import ( # explicitly export these to appease mypy
|
||||
NA,
|
||||
SIZE_UNITS,
|
||||
UINT_MAX,
|
||||
ULONGLONG_MAX,
|
||||
GiB,
|
||||
KiB,
|
||||
MiB,
|
||||
NaType,
|
||||
NotApplicable,
|
||||
NotApplicableType,
|
||||
PiB,
|
||||
Snapshot,
|
||||
TiB,
|
||||
boolify,
|
||||
bytes2human,
|
||||
colored,
|
||||
human2bytes,
|
||||
set_color,
|
||||
timedelta2human,
|
||||
utilization2string,
|
||||
)
|
||||
|
||||
|
||||
__all__ = [
|
||||
'take_snapshots',
|
||||
'collect_in_background',
|
||||
'ResourceMetricCollector',
|
||||
'libnvml',
|
||||
'nvmlCheckReturn',
|
||||
'NVMLError',
|
||||
'nvmlCheckReturn',
|
||||
'libnvml',
|
||||
'libcuda',
|
||||
'libcudart',
|
||||
# nvitop.api.device
|
||||
'Device',
|
||||
'PhysicalDevice',
|
||||
'MigDevice',
|
||||
|
|
@ -48,9 +67,34 @@ __all__ = [
|
|||
'CudaMigDevice',
|
||||
'parse_cuda_visible_devices',
|
||||
'normalize_cuda_visible_devices',
|
||||
# nvitop.api.process
|
||||
'host',
|
||||
'HostProcess',
|
||||
'GpuProcess',
|
||||
'command_join',
|
||||
*utils.__all__,
|
||||
# nvitop.api.collector
|
||||
'take_snapshots',
|
||||
'collect_in_background',
|
||||
'ResourceMetricCollector',
|
||||
# nvitop.api.utils
|
||||
'NA',
|
||||
'NaType',
|
||||
'NotApplicable',
|
||||
'NotApplicableType',
|
||||
'UINT_MAX',
|
||||
'ULONGLONG_MAX',
|
||||
'KiB',
|
||||
'MiB',
|
||||
'GiB',
|
||||
'TiB',
|
||||
'PiB',
|
||||
'SIZE_UNITS',
|
||||
'bytes2human',
|
||||
'human2bytes',
|
||||
'timedelta2human',
|
||||
'utilization2string',
|
||||
'colored',
|
||||
'set_color',
|
||||
'boolify',
|
||||
'Snapshot',
|
||||
]
|
||||
|
|
|
|||
|
|
@ -1154,7 +1154,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
|
||||
gpu_percent = gpu_utilization # in percentage
|
||||
|
||||
def memory_utilization(self) -> float | NaType: # in percentage
|
||||
def memory_utilization(self) -> int | NaType: # in percentage
|
||||
"""Percent of time over the past sample period during which global (device) memory was being read or written.
|
||||
|
||||
The sample period may be between 1 second and 1/6 second depending on the product.
|
||||
|
|
@ -1170,7 +1170,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
""" # pylint: disable=line-too-long
|
||||
return self.utilization_rates().memory
|
||||
|
||||
def encoder_utilization(self) -> float | NaType: # in percentage
|
||||
def encoder_utilization(self) -> int | NaType: # in percentage
|
||||
"""The encoder utilization rate in percentage.
|
||||
|
||||
Returns: Union[int, NaType]
|
||||
|
|
@ -1178,7 +1178,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
"""
|
||||
return self.utilization_rates().encoder
|
||||
|
||||
def decoder_utilization(self) -> float | NaType: # in percentage\
|
||||
def decoder_utilization(self) -> int | NaType: # in percentage
|
||||
"""The decoder utilization rate in percentage.
|
||||
|
||||
Returns: Union[int, NaType]
|
||||
|
|
@ -2120,8 +2120,8 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me
|
|||
self.handle,
|
||||
# Only utilization samples that were recorded after this timestamp will be returned.
|
||||
# The CPU timestamp, i.e. absolute Unix epoch timestamp (in microseconds), is used.
|
||||
# Here we use the timestamp 1/4 second ago to ensure the record buffer is not empty.
|
||||
time.time_ns() // 1000 - 250_000,
|
||||
# Here we use the timestamp 1 second ago to ensure the record buffer is not empty.
|
||||
time.time_ns() // 1000 - 1000_000,
|
||||
default=(),
|
||||
)
|
||||
for s in sorted(samples, key=lambda s: s.timeStamp):
|
||||
|
|
|
|||
|
|
@ -730,10 +730,11 @@ def memoize_when_activated(method: Method) -> Method:
|
|||
"""
|
||||
|
||||
@functools.wraps(method)
|
||||
def wrapped(self, *args, **kwargs): # noqa: ANN001,ANN002,ANN003,ANN202
|
||||
def wrapped(self: object, *args: Any, **kwargs: Any) -> Any:
|
||||
try:
|
||||
# case 1: we previously entered oneshot() ctx
|
||||
ret = self._cache[method] # pylint: disable=protected-access
|
||||
# pylint: disable-next=protected-access
|
||||
ret = self._cache[method] # type: ignore[attr-defined]
|
||||
except AttributeError:
|
||||
# case 2: we never entered oneshot() ctx
|
||||
return method(self, *args, **kwargs)
|
||||
|
|
@ -742,25 +743,28 @@ def memoize_when_activated(method: Method) -> Method:
|
|||
# for this entry yet
|
||||
ret = method(self, *args, **kwargs)
|
||||
try:
|
||||
self._cache[method] = ret # pylint: disable=protected-access
|
||||
# pylint: disable-next=protected-access
|
||||
self._cache[method] = ret # type: ignore[attr-defined]
|
||||
except AttributeError:
|
||||
# multi-threading race condition, see:
|
||||
# https://github.com/giampaolo/psutil/issues/1948
|
||||
pass
|
||||
return ret
|
||||
|
||||
def cache_activate(self): # noqa: ANN001,ANN202
|
||||
def cache_activate(self: object) -> None:
|
||||
"""Activate cache.
|
||||
|
||||
Expects an instance. Cache will be stored as a "_cache" instance attribute.
|
||||
"""
|
||||
if not hasattr(self, '_cache'):
|
||||
self._cache = {} # pylint: disable=protected-access
|
||||
# pylint: disable-next=protected-access
|
||||
self._cache = {} # type: ignore[attr-defined]
|
||||
|
||||
def cache_deactivate(self): # noqa: ANN001,ANN202
|
||||
def cache_deactivate(self: object) -> None:
|
||||
"""Deactivate and clear cache."""
|
||||
try:
|
||||
del self._cache # pylint: disable=protected-access
|
||||
# pylint: disable-next=protected-access
|
||||
del self._cache # type: ignore[attr-defined]
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ NVITOP_MONITOR_MODE = set(
|
|||
|
||||
# pylint: disable=too-many-branches,too-many-statements
|
||||
def parse_arguments() -> argparse.Namespace:
|
||||
"""Parse command-line arguments for ``nvtiop``."""
|
||||
"""Parse command-line arguments for ``nvitop``."""
|
||||
coloring_rules = '{} < th1 %% <= {} < th2 %% <= {}'.format(
|
||||
colored('light', 'green'),
|
||||
colored('moderate', 'yellow'),
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@
|
|||
"""An interactive NVIDIA-GPU process viewer and beyond, the one-stop solution for GPU process management."""
|
||||
|
||||
__version__ = '1.2.0'
|
||||
__license__ = 'GPLv3'
|
||||
__license__ = 'GPL-3.0-only AND Apache-2.0'
|
||||
__author__ = __maintainer__ = 'Xuehai Pan'
|
||||
__email__ = 'XuehaiPan@pku.edu.cn'
|
||||
__release__ = False
|
||||
|
|
|
|||
|
|
@ -76,7 +76,8 @@ target-version = ["py37", "py38", "py39", "py310", "py311"]
|
|||
[tool.isort]
|
||||
atomic = true
|
||||
profile = "black"
|
||||
src_paths = ["nvitop"]
|
||||
src_paths = ["nvitop", "nvitop-exporter/nvitop_exporter"]
|
||||
known_first_party = ["nvitop", "nvitop_exporter"]
|
||||
indent = 4
|
||||
line_length = 100
|
||||
lines_after_imports = 2
|
||||
|
|
@ -85,14 +86,16 @@ multi_line_output = 3
|
|||
[tool.mypy]
|
||||
# Sync with requires-python
|
||||
python_version = 3.8 # appease mypy for syntax errors in numpy stubs
|
||||
mypy_path = [".", "nvitop-exporter"]
|
||||
exclude = ["nvitop-exporter/setup.py"]
|
||||
pretty = true
|
||||
show_error_codes = true
|
||||
show_error_context = true
|
||||
show_traceback = true
|
||||
allow_redefinition = true
|
||||
check_untyped_defs = true
|
||||
disallow_incomplete_defs = false
|
||||
disallow_untyped_defs = false
|
||||
disallow_incomplete_defs = true
|
||||
disallow_untyped_defs = true
|
||||
ignore_missing_imports = true
|
||||
no_implicit_optional = true
|
||||
strict_equality = true
|
||||
|
|
@ -119,7 +122,7 @@ ignore-words = "docs/source/spelling_wordlist.txt"
|
|||
target-version = "py37"
|
||||
line-length = 100
|
||||
show-source = true
|
||||
src = ["nvitop"]
|
||||
src = ["nvitop", "nvitop-exporter/nvitop_exporter"]
|
||||
select = [
|
||||
"E", "W", # pycodestyle
|
||||
"F", # pyflakes
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue