diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 987b891..dcc823b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -88,3 +88,7 @@ repos: language: system types_or: [python, pyi] require_serial: true + exclude: | + (?x)( + ^nvitop-exporter/setup.py$ + ) diff --git a/.pylintrc b/.pylintrc index ee0583f..8a0628a 100644 --- a/.pylintrc +++ b/.pylintrc @@ -421,7 +421,8 @@ confidence=HIGH, # no Warning level messages displayed, use "--disable=all --enable=classes # --disable=W". disable=consider-using-f-string, - duplicate-code + duplicate-code, + wrong-import-order # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option diff --git a/docs/source/spelling_wordlist.txt b/docs/source/spelling_wordlist.txt index 106df4f..e87a36d 100644 --- a/docs/source/spelling_wordlist.txt +++ b/docs/source/spelling_wordlist.txt @@ -151,3 +151,5 @@ tx rx ThroughputInfo pytorch +api +utils diff --git a/nvitop-exporter/LICENSE b/nvitop-exporter/LICENSE new file mode 100644 index 0000000..1fcc34a --- /dev/null +++ b/nvitop-exporter/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2021-2023 Xuehai Pan. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/nvitop-exporter/MANIFEST.in b/nvitop-exporter/MANIFEST.in new file mode 100644 index 0000000..1aba38f --- /dev/null +++ b/nvitop-exporter/MANIFEST.in @@ -0,0 +1 @@ +include LICENSE diff --git a/nvitop-exporter/README.md b/nvitop-exporter/README.md new file mode 100644 index 0000000..3599f62 --- /dev/null +++ b/nvitop-exporter/README.md @@ -0,0 +1,11 @@ +# nvitop-exporter + +Prometheus exporter built on top of `nvitop`. + +## Installation + +Install from PyPI: + +```bash +pip3 install --upgrade nvitop-exporter +``` diff --git a/nvitop-exporter/nvitop_exporter/__init__.py b/nvitop-exporter/nvitop_exporter/__init__.py new file mode 100644 index 0000000..67ddb81 --- /dev/null +++ b/nvitop-exporter/nvitop_exporter/__init__.py @@ -0,0 +1,24 @@ +# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. +# +# Copyright 2021-2023 Xuehai Pan. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Prometheus exporter built on top of ``nvitop``.""" + +from nvitop_exporter.exporter import PrometheusExporter +from nvitop_exporter.utils import get_ip_address +from nvitop_exporter.version import __version__ + + +__all__ = ['PrometheusExporter', 'get_ip_address'] diff --git a/nvitop-exporter/nvitop_exporter/__main__.py b/nvitop-exporter/nvitop_exporter/__main__.py new file mode 100644 index 0000000..9c76a7f --- /dev/null +++ b/nvitop-exporter/nvitop_exporter/__main__.py @@ -0,0 +1,25 @@ +# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. +# +# Copyright 2021-2023 Xuehai Pan. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Prometheus exporter built on top of ``nvitop``.""" + +import sys + +from nvitop_exporter.cli import main + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/nvitop-exporter/nvitop_exporter/cli.py b/nvitop-exporter/nvitop_exporter/cli.py new file mode 100644 index 0000000..d74834a --- /dev/null +++ b/nvitop-exporter/nvitop_exporter/cli.py @@ -0,0 +1,227 @@ +# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. +# +# Copyright 2021-2023 Xuehai Pan. All Rights Reserved. 
def parse_arguments() -> argparse.Namespace:
    """Build the ``nvitop-exporter`` argument parser and parse the command line.

    Returns:
        The parsed namespace with the ``hostname``, ``bind_address``, ``port`` and
        ``interval`` attributes.

    The parser exits with a usage error when ``--interval`` is not a positive float
    or is shorter than a quarter of a second.
    """

    def positive_float(text: str) -> float:
        value = float(text)
        if value <= 0:
            raise ValueError
        return value

    # The converter is renamed so that it reads as "positive float" wherever its
    # ``__name__`` is displayed.
    positive_float.__name__ = 'positive float'

    parser = argparse.ArgumentParser(
        prog='nvitop-exporter',
        description='Prometheus exporter built on top of `nvitop`.',
        formatter_class=argparse.RawTextHelpFormatter,
        add_help=False,
    )

    # Options are registered in declaration order, which is also the order in
    # which they are listed by ``--help``.
    option_specs = (
        (
            ('--help', '-h'),
            {
                'dest': 'help',
                'action': 'help',
                'default': argparse.SUPPRESS,
                'help': 'Show this help message and exit.',
            },
        ),
        (
            ('--version', '-V'),
            {
                'dest': 'version',
                'action': 'version',
                'version': f'%(prog)s {__version__} (nvitop {nvitop.__version__})',
                'help': "Show %(prog)s's version number and exit.",
            },
        ),
        (
            ('--hostname', '--host', '-H'),
            {
                'dest': 'hostname',
                'type': str,
                'default': get_ip_address(),
                'metavar': 'HOSTNAME',
                'help': 'Hostname to display in the exporter. (default: %(default)s)',
            },
        ),
        (
            ('--bind-address', '--bind', '-B'),
            {
                'dest': 'bind_address',
                'type': str,
                'default': '127.0.0.1',
                'metavar': 'ADDRESS',
                'help': 'Local address to bind to. (default: %(default)s)',
            },
        ),
        (
            ('--port', '-p'),
            {
                'type': int,
                'default': 8000,
                'help': 'Port to listen on. (default: %(default)d)',
            },
        ),
        (
            ('--interval',),
            {
                'dest': 'interval',
                'type': positive_float,
                'default': 1.0,
                'metavar': 'SEC',
                'help': 'Interval between updates in seconds. (default: %(default)s)',
            },
        ),
    )
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)

    namespace = parser.parse_args()
    if namespace.interval < 0.25:
        complaint = (
            f'the interval {namespace.interval:0.2g}s is too short, which may cause performance issues. '
            f'Expected 1/4 or higher.'
        )
        parser.error(complaint)

    return namespace
def _report(label: str, message: object, *, color: str) -> None:
    """Print ``message`` to standard error behind a bold, colored ``label``."""
    print(
        '{} {}'.format(colored(label, color=color, attrs=('bold',)), message),
        file=sys.stderr,
    )


def main() -> int:  # pylint: disable=too-many-locals,too-many-statements
    """Run the ``nvitop-exporter`` command-line interface.

    The function parses the command line, enumerates the NVIDIA devices, creates a
    :class:`PrometheusExporter` for them, starts the Prometheus WSGI server on the
    requested address and port, and then runs the exporter's collection loop until
    the user interrupts it.

    Returns:
        ``0`` once the collection loop has been interrupted with ``Ctrl-C``;
        ``1`` if NVML cannot be queried, no device is found, or the server cannot
        be started on the requested address.
    """
    # Imported here so the function does not rely on a new module-level import.
    import errno  # pylint: disable=import-outside-toplevel

    args = parse_arguments()

    try:
        device_count = Device.count()
    except libnvml.NVMLError_LibraryNotFound:
        # NOTE(review): nothing is printed on this path -- presumably nvitop has
        # already reported the missing NVML library; confirm.
        return 1
    except libnvml.NVMLError as ex:
        _report('NVML ERROR:', ex, color='red')
        return 1

    if device_count == 0:
        _report('NVML ERROR:', 'No NVIDIA devices found.', color='red')
        return 1

    devices = Device.from_indices(range(device_count))
    _report(
        'INFO:',
        'Found {} NVIDIA device(s).'.format(
            colored(str(device_count), color='green', attrs=('bold',)),
        ),
        color='yellow',
    )
    for device in devices:
        _report('INFO:', f'[{device.index}] {device.name()}', color='yellow')

    exporter = PrometheusExporter(devices, hostname=args.hostname, interval=args.interval)

    try:
        start_wsgi_server(port=args.port, addr=args.bind_address)
    except OSError as ex:
        # Classify the failure by errno first: the error text depends on the
        # platform, so matching on the message alone is brittle. The substring
        # checks are kept as a fallback for errors raised without an errno.
        reason = str(ex).lower()
        url = colored(
            f'http://{args.bind_address}:{args.port}',
            color='yellow',
            attrs=('bold', 'underline'),
        )
        message: object
        if ex.errno == errno.EADDRINUSE or 'address already in use' in reason:
            message = f'Address {url} is already in use.'
        elif ex.errno == errno.EADDRNOTAVAIL or 'cannot assign requested address' in reason:
            message = f'Cannot assign requested address at {url}.'
        else:
            message = ex
        _report('ERROR:', message, color='red')
        return 1

    _report(
        'INFO:',
        'Start the exporter on {} at {}.'.format(
            colored(args.hostname, color='magenta', attrs=('bold',)),
            colored(
                f'http://{args.bind_address}:{args.port}/metrics',
                color='green',
                attrs=('bold', 'underline'),
            ),
        ),
        color='yellow',
    )

    try:
        exporter.collect()
    except KeyboardInterrupt:
        # Emit an empty line first so the message starts on a fresh line.
        print(file=sys.stderr)
        _report('INFO:', 'Interrupted by user.', color='yellow')

    return 0
def __init__(  # pylint: disable=too-many-statements
    self,
    devices: Sequence[Device],
    hostname: str | None = None,
    *,
    registry: CollectorRegistry = REGISTRY,
    interval: float = 1.0,
) -> None:
    """Validate the devices and register every metric of the exporter.

    Args:
        devices: A list or tuple of ``PhysicalDevice`` / ``MigDevice`` objects. The
            MIG devices of each physical device are appended right after it.
        hostname: Value of the ``hostname`` label; falls back to
            ``get_ip_address()`` when empty or ``None``.
        registry: Prometheus registry in which all metrics are registered.
        interval: Stored as ``self.interval`` for the collection loop.

    Raises:
        TypeError: If ``devices`` is not a list or tuple, or if one of its items is
            neither a ``PhysicalDevice`` nor a ``MigDevice``.
    """
    if not isinstance(devices, (list, tuple)):
        raise TypeError(f'Expected a list or tuple of devices, got {type(devices)}')

    # Every item is validated before any device is queried for its MIG devices.
    for candidate in devices:
        if not isinstance(candidate, (PhysicalDevice, MigDevice)):
            raise TypeError(f'Expected a PhysicalDevice or MigDevice, got {type(candidate)}')

    expanded = []
    for candidate in devices:
        expanded.append(candidate)
        if isinstance(candidate, PhysicalDevice):
            expanded += candidate.mig_devices()

    self.devices = expanded
    self.hostname = hostname or get_ip_address()
    self.registry = registry
    self.interval = interval

    self.info = Info(
        'nvitop',
        documentation='NVITOP.',
        labelnames=['hostname'],
        registry=self.registry,
    )
    static_info = {
        'device_count': str(Device.count()),
        'driver_version': Device.driver_version(),
        'cuda_driver_version': Device.cuda_driver_version(),
    }
    self.info.labels(hostname=self.hostname).info(static_info)

    def new_gauge(
        name: str,
        documentation: str,
        unit: str,
        labelnames: tuple[str, ...],
    ) -> Gauge:
        # Each gauge receives its own list of label names and the shared registry.
        return Gauge(
            name=name,
            documentation=documentation,
            unit=unit,
            labelnames=list(labelnames),
            registry=self.registry,
        )

    host = ('hostname',)
    nic = ('hostname', 'interface')
    partition = ('hostname', 'partition')
    mountpoint = ('hostname', 'mountpoint')
    gpu = ('hostname', 'index', 'devicename', 'uuid')
    link = gpu + ('link',)
    process = gpu + ('pid', 'username')

    # Create gauges for host metrics
    self.host_uptime = new_gauge('host_uptime', 'Host uptime (s).', 'Second', host)
    self.host_cpu_percent = new_gauge(
        'host_cpu_percent', 'Host CPU percent (%).', 'Percentage', host
    )
    self.host_virtual_memory_total = new_gauge(
        'host_virtual_memory_total', 'Host virtual memory total (MiB).', 'MiB', host
    )
    self.host_virtual_memory_used = new_gauge(
        'host_virtual_memory_used', 'Host virtual memory used (MiB).', 'MiB', host
    )
    self.host_virtual_memory_free = new_gauge(
        'host_virtual_memory_free', 'Host virtual memory free (MiB).', 'MiB', host
    )
    self.host_virtual_memory_percent = new_gauge(
        'host_virtual_memory_percent', 'Host virtual memory percent (%).', 'Percentage', host
    )
    self.host_swap_memory_total = new_gauge(
        'host_swap_memory_total', 'Host swap total (MiB).', 'MiB', host
    )
    self.host_swap_memory_used = new_gauge(
        'host_swap_memory_used', 'Host swap used (MiB).', 'MiB', host
    )
    self.host_swap_memory_free = new_gauge(
        'host_swap_memory_free', 'Host swap free (MiB).', 'MiB', host
    )
    self.host_swap_memory_percent = new_gauge(
        'host_swap_memory_percent', 'Host swap percent (%).', 'Percentage', host
    )
    self.host_load_average_1m = new_gauge(
        'host_load_average_1m', 'Host load average for the last minute.', 'Percentage', host
    )
    self.host_load_average_5m = new_gauge(
        'host_load_average_5m', 'Host load average for the last 5 minutes.', 'Percentage', host
    )
    self.host_load_average_15m = new_gauge(
        'host_load_average_15m', 'Host load average for the last 15 minutes.', 'Percentage', host
    )
    self.host_net_io_tx_data = new_gauge(
        'host_net_io_tx_data', 'Host network I/O transmitted data (MiB).', 'MiB', nic
    )
    self.host_net_io_rx_data = new_gauge(
        'host_net_io_rx_data', 'Host network I/O received data (MiB).', 'MiB', nic
    )
    self.host_net_io_tx_packets = new_gauge(
        'host_net_io_tx_packets', 'Host network I/O transmitted packets.', 'Packet', nic
    )
    self.host_net_io_rx_packets = new_gauge(
        'host_net_io_rx_packets', 'Host network I/O received packets.', 'Packet', nic
    )
    self.host_disk_io_read_data = new_gauge(
        'host_disk_io_read_data', 'Host disk I/O read data (MiB).', 'MiB', partition
    )
    self.host_disk_io_write_data = new_gauge(
        'host_disk_io_write_data', 'Host disk I/O write data (MiB).', 'MiB', partition
    )
    self.host_disk_usage_total = new_gauge(
        'host_disk_usage_total', 'Host disk usage total (MiB).', 'MiB', mountpoint
    )
    self.host_disk_usage_used = new_gauge(
        'host_disk_usage_used', 'Host disk usage used (MiB).', 'MiB', mountpoint
    )
    self.host_disk_usage_free = new_gauge(
        'host_disk_usage_free', 'Host disk usage free (MiB).', 'MiB', mountpoint
    )
    self.host_disk_usage_percent = new_gauge(
        'host_disk_usage_percent', 'Host disk usage percent (%).', 'Percentage', mountpoint
    )

    # Create gauges for GPU metrics
    self.gpu_utilization = new_gauge(
        'gpu_utilization', 'GPU utilization (%).', 'Percentage', gpu
    )
    self.gpu_memory_utilization = new_gauge(
        'gpu_memory_utilization', 'GPU memory utilization (%).', 'Percentage', gpu
    )
    self.gpu_encoder_utilization = new_gauge(
        'gpu_encoder_utilization', 'GPU encoder utilization (%).', 'Percentage', gpu
    )
    self.gpu_decoder_utilization = new_gauge(
        'gpu_decoder_utilization', 'GPU decoder utilization (%).', 'Percentage', gpu
    )
    self.gpu_memory_total = new_gauge(
        'gpu_memory_total', 'GPU memory total (MiB).', 'MiB', gpu
    )
    self.gpu_memory_used = new_gauge(
        'gpu_memory_used', 'GPU memory used (MiB).', 'MiB', gpu
    )
    self.gpu_memory_free = new_gauge(
        'gpu_memory_free', 'GPU memory free (MiB).', 'MiB', gpu
    )
    self.gpu_memory_percent = new_gauge(
        'gpu_memory_percent', 'GPU memory percent (%).', 'Percentage', gpu
    )
    self.gpu_clock_sm = new_gauge('gpu_clock_sm', 'GPU SM clock (MHz).', 'MHz', gpu)
    self.gpu_clock_memory = new_gauge(
        'gpu_clock_memory', 'GPU memory clock (MHz).', 'MHz', gpu
    )
    self.gpu_clock_graphics = new_gauge(
        'gpu_clock_graphics', 'GPU graphics clock (MHz).', 'MHz', gpu
    )
    self.gpu_clock_video = new_gauge(
        'gpu_clock_video', 'GPU video clock (MHz).', 'MHz', gpu
    )
    self.gpu_power_usage = new_gauge('gpu_power_usage', 'GPU power usage (W).', 'W', gpu)
    self.gpu_power_limit = new_gauge('gpu_power_limit', 'GPU power limit (W).', 'W', gpu)
    self.gpu_temperature = new_gauge('gpu_temperature', 'GPU temperature (C).', 'C', gpu)
    self.gpu_fan_speed = new_gauge('gpu_fan_speed', 'GPU fan speed (%).', 'Percentage', gpu)
    self.gpu_pcie_tx_throughput = new_gauge(
        'gpu_pcie_tx_throughput', 'GPU PCIe transmit throughput (MiB/s).', 'MiBps', gpu
    )
    self.gpu_pcie_rx_throughput = new_gauge(
        'gpu_pcie_rx_throughput', 'GPU PCIe receive throughput (MiB/s).', 'MiBps', gpu
    )
    self.gpu_nvlink_mean_tx_throughput = new_gauge(
        'gpu_nvlink_mean_tx_throughput',
        'GPU mean NVLink transmit throughput (MiB/s).',
        'MiBps',
        gpu,
    )
    self.gpu_nvlink_mean_rx_throughput = new_gauge(
        'gpu_nvlink_mean_rx_throughput',
        'GPU mean NVLink receive throughput (MiB/s).',
        'MiBps',
        gpu,
    )
    self.gpu_nvlink_tx_throughput = new_gauge(
        'gpu_nvlink_tx_throughput', 'GPU NVLink transmit throughput (MiB/s).', 'MiBps', link
    )
    self.gpu_nvlink_rx_throughput = new_gauge(
        'gpu_nvlink_rx_throughput', 'GPU NVLink receive throughput (MiB/s).', 'MiBps', link
    )

    # Create gauges for process metrics
    self.process_running_time = new_gauge(
        'process_running_time', 'Process running time (s).', 'Second', process
    )
    self.process_cpu_percent = new_gauge(
        'process_cpu_percent', 'Process CPU percent (%).', 'Percentage', process
    )
    self.process_rss_memory = new_gauge(
        'process_rss_memory', 'Process memory resident set size (MiB).', 'MiB', process
    )
    self.process_memory_percent = new_gauge(
        'process_memory_percent', 'Process memory percent (%).', 'Percentage', process
    )
    self.process_gpu_memory = new_gauge(
        'process_gpu_memory', 'Process GPU memory (MiB).', 'MiB', process
    )
    self.process_gpu_sm_utilization = new_gauge(
        'process_gpu_sm_utilization', 'Process GPU SM utilization (%).', 'Percentage', process
    )
    self.process_gpu_memory_utilization = new_gauge(
        'process_gpu_memory_utilization',
        'Process GPU memory utilization (%).',
        'Percentage',
        process,
    )
    self.process_gpu_encoder_utilization = new_gauge(
        'process_gpu_encoder_utilization',
        'Process GPU encoder utilization (%).',
        'Percentage',
        process,
    )
    self.process_gpu_decoder_utilization = new_gauge(
        'process_gpu_decoder_utilization',
        'Process GPU decoder utilization (%).',
        'Percentage',
        process,
    )
registry=self.registry, + ) + + def collect(self) -> None: + """Collect metrics.""" + while True: + next_update_time = time.monotonic() + self.interval + self.update_host() + for device in self.devices: + self.update_device(device) + time.sleep(max(0.0, next_update_time - time.monotonic())) + + def update_host(self) -> None: + """Update metrics for the host.""" + load_average = host.load_average() + if load_average is None: + load_average = (0.0, 0.0, 0.0) # type: ignore[unreachable] + virtual_memory = host.virtual_memory() + swap_memory = host.swap_memory() + net_io_counters = host.net_io_counters(pernic=True) # type: ignore[attr-defined] + disk_io_counters = host.disk_io_counters(perdisk=True) # type: ignore[attr-defined] + + for gauge, value in ( + (self.host_uptime, host.uptime()), + (self.host_cpu_percent, host.cpu_percent()), + (self.host_virtual_memory_total, virtual_memory.total / MiB), + (self.host_virtual_memory_used, virtual_memory.used / MiB), + (self.host_virtual_memory_free, virtual_memory.free / MiB), + (self.host_virtual_memory_percent, virtual_memory.percent), + (self.host_swap_memory_total, swap_memory.total / MiB), + (self.host_swap_memory_used, swap_memory.used / MiB), + (self.host_swap_memory_free, swap_memory.free / MiB), + (self.host_swap_memory_percent, swap_memory.percent), + (self.host_load_average_1m, load_average[0]), + (self.host_load_average_5m, load_average[1]), + (self.host_load_average_15m, load_average[2]), + ): + gauge.labels(self.hostname).set(value) + + for interface, net_io_counter in net_io_counters.items(): + for gauge, value in ( + (self.host_net_io_tx_data, net_io_counter.bytes_sent / MiB), + (self.host_net_io_rx_data, net_io_counter.bytes_recv / MiB), + (self.host_net_io_tx_packets, net_io_counter.packets_sent), + (self.host_net_io_rx_packets, net_io_counter.packets_recv), + ): + gauge.labels(hostname=self.hostname, interface=interface).set(value) + + for partition, disk_io_counter in disk_io_counters.items(): + for gauge, 
value in ( + (self.host_disk_io_read_data, disk_io_counter.read_bytes / MiB), + (self.host_disk_io_write_data, disk_io_counter.write_bytes / MiB), + ): + gauge.labels(hostname=self.hostname, partition=partition).set(value) + for partition in host.disk_partitions(): # type: ignore[attr-defined] + try: + partition_usage = host.disk_usage(partition.mountpoint) # type: ignore[attr-defined] + except (OSError, host.PsutilError): + continue + for gauge, value in ( + (self.host_disk_usage_total, partition_usage.total / MiB), + (self.host_disk_usage_used, partition_usage.used / MiB), + (self.host_disk_usage_free, partition_usage.free / MiB), + (self.host_disk_usage_percent, partition_usage.percent), + ): + gauge.labels(hostname=self.hostname, mountpoint=partition.mountpoint).set(value) + + def update_device(self, device: Device) -> None: + """Update metrics for a single device.""" + index = ( + str(device.index) if isinstance(device.index, int) else ':'.join(map(str, device.index)) + ) + name = device.name() + uuid = device.uuid() + + with device.oneshot(): + for gauge, value in ( + (self.gpu_utilization, int(device.gpu_utilization())), + (self.gpu_memory_utilization, int(device.memory_utilization())), + (self.gpu_encoder_utilization, int(device.encoder_utilization())), + (self.gpu_decoder_utilization, int(device.decoder_utilization())), + (self.gpu_memory_total, device.memory_total() / MiB), + (self.gpu_memory_used, device.memory_used() / MiB), + (self.gpu_memory_free, device.memory_free() / MiB), + (self.gpu_memory_percent, float(device.memory_percent())), + (self.gpu_clock_sm, int(device.clock_infos().sm)), + (self.gpu_clock_memory, int(device.clock_infos().memory)), + (self.gpu_clock_graphics, int(device.clock_infos().graphics)), + (self.gpu_clock_video, int(device.clock_infos().video)), + (self.gpu_power_usage, device.power_usage() / 1000.0), + (self.gpu_power_limit, device.power_limit() / 1000.0), + (self.gpu_temperature, int(device.temperature())), + 
(self.gpu_fan_speed, int(device.fan_speed())), + (self.gpu_pcie_tx_throughput, device.pcie_tx_throughput() / 1024.0), + (self.gpu_pcie_rx_throughput, device.pcie_rx_throughput() / 1024.0), + (self.gpu_nvlink_mean_tx_throughput, device.nvlink_mean_tx_throughput() / 1024.0), + (self.gpu_nvlink_mean_rx_throughput, device.nvlink_mean_rx_throughput() / 1024.0), + ): + gauge.labels( + hostname=self.hostname, + index=index, + devicename=name, + uuid=uuid, + ).set(value) + + for gauge, nvlink_throughput in ( + (self.gpu_nvlink_tx_throughput, device.nvlink_tx_throughput()), + (self.gpu_nvlink_rx_throughput, device.nvlink_rx_throughput()), + ): + for link, throughput in enumerate(nvlink_throughput): + gauge.labels( + hostname=self.hostname, + index=index, + devicename=name, + uuid=uuid, + link=link, + ).set(throughput / 1024.0) + + with GpuProcess.failsafe(): + for pid, process in device.processes().items(): + with process.oneshot(): + username = process.username() + running_time = process.running_time() + for gauge, value in ( + ( + self.process_running_time, + running_time.total_seconds() if running_time else 0.0, + ), + (self.process_cpu_percent, process.cpu_percent()), + (self.process_rss_memory, process.host_memory() / MiB), + (self.process_memory_percent, float(process.memory_percent())), + (self.process_gpu_memory, process.gpu_memory() / MiB), + ( + self.process_gpu_sm_utilization, + int(process.gpu_sm_utilization()), + ), + ( + self.process_gpu_memory_utilization, + int(process.gpu_memory_utilization()), + ), + ( + self.process_gpu_encoder_utilization, + int(process.gpu_encoder_utilization()), + ), + ( + self.process_gpu_decoder_utilization, + int(process.gpu_decoder_utilization()), + ), + ): + gauge.labels( + hostname=self.hostname, + index=index, + devicename=name, + uuid=uuid, + pid=pid, + username=username, + ).set(value) diff --git a/nvitop-exporter/nvitop_exporter/utils.py b/nvitop-exporter/nvitop_exporter/utils.py new file mode 100644 index 0000000..1b07fdb 
--- /dev/null +++ b/nvitop-exporter/nvitop_exporter/utils.py @@ -0,0 +1,38 @@ +# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. +# +# Copyright 2021-2023 Xuehai Pan. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utility functions for ``nvitop-exporter``.""" + +import socket + + +__all__ = ['get_ip_address'] + + +# Reference: https://stackoverflow.com/a/28950776 +def get_ip_address() -> str: + """Get the IP address of the current machine.""" + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.settimeout(0.0) + try: + # Doesn't even have to be reachable + s.connect(('10.254.254.254', 1)) + ip_address = s.getsockname()[0] + except Exception: # noqa: BLE001 # pylint: disable=broad-except + ip_address = '127.0.0.1' + finally: + s.close() + return ip_address diff --git a/nvitop-exporter/nvitop_exporter/version.py b/nvitop-exporter/nvitop_exporter/version.py new file mode 100644 index 0000000..4ad76cc --- /dev/null +++ b/nvitop-exporter/nvitop_exporter/version.py @@ -0,0 +1,54 @@ +# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. +# +# Copyright 2021-2023 Xuehai Pan. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Prometheus exporter built on top of ``nvitop``.""" + +__version__ = '1.2.0' +__license__ = 'GPLv3' +__author__ = __maintainer__ = 'Xuehai Pan' +__email__ = 'XuehaiPan@pku.edu.cn' +__release__ = False + +if not __release__: + import os + import subprocess + + try: + prefix, sep, suffix = ( + subprocess.check_output( + ['git', 'describe', '--abbrev=7'], # noqa: S603,S607 + cwd=os.path.dirname(os.path.abspath(__file__)), + stderr=subprocess.DEVNULL, + text=True, + ) + .strip() + .lstrip('v') + .replace('-', '.dev', 1) + .replace('-', '+', 1) + .partition('.dev') + ) + if sep: + version_prefix, dot, version_tail = prefix.rpartition('.') + prefix = f'{version_prefix}{dot}{int(version_tail) + 1}' + __version__ = sep.join((prefix, suffix)) + del version_prefix, dot, version_tail + else: + __version__ = prefix + del prefix, sep, suffix + except (OSError, subprocess.CalledProcessError): + pass + + del os, subprocess diff --git a/nvitop-exporter/pyproject.toml b/nvitop-exporter/pyproject.toml new file mode 100644 index 0000000..205369d --- /dev/null +++ b/nvitop-exporter/pyproject.toml @@ -0,0 +1,86 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "nvitop-exporter" +description = "Prometheus exporter built on top of `nvitop`." 
+readme = "README.md" +requires-python = ">= 3.7" +authors = [{ name = "Xuehai Pan", email = "XuehaiPan@pku.edu.cn" }] +license = { text = "Apache License, Version 2.0 (Apache-2.0)" } +keywords = [ + "nvidia", + "nvidia-smi", + "NVIDIA", + "NVML", + "CUDA", + "GPU", + "top", + "monitoring", + "prometheus", + "Prometheus", + "grafana", + "Grafana", +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "License :: OSI Approved :: Apache Software License", + "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX :: Linux", + "Environment :: GPU", + "Environment :: GPU :: NVIDIA CUDA", + "Environment :: Console", + "Environment :: Console :: Curses", + "Intended Audience :: Developers", + "Intended Audience :: End Users/Desktop", + "Intended Audience :: System Administrators", + "Topic :: System :: Hardware", + "Topic :: System :: Monitoring", + "Topic :: System :: Systems Administration", + "Topic :: Utilities", +] +dependencies = [ + # Sync with nvitop/version.py and requirements.txt + "nvitop == 1.2.0", + "prometheus-client", +] +dynamic = ["version"] + +[project.scripts] +nvitop-exporter = "nvitop_exporter.cli:main" + +[project.urls] +Homepage = "https://github.com/XuehaiPan/nvitop" +Repository = "https://github.com/XuehaiPan/nvitop" +Documentation = "https://nvitop.readthedocs.io" +"Bug Report" = "https://github.com/XuehaiPan/nvitop/issues" + +[tool.setuptools.packages.find] +include = ["nvitop_exporter", "nvitop_exporter.*"] + +[tool.black] +safe = true +line-length = 100 +skip-string-normalization = true +target-version = ["py37", "py38", "py39", "py310", "py311"] + +[tool.isort] +atomic = true +profile = 
"black" +src_paths = ["nvitop_exporter"] +known_first_party = ["nvitop", "nvitop_exporter"] +indent = 4 +line_length = 100 +lines_after_imports = 2 +multi_line_output = 3 + +[tool.ruff] +extend = "../pyproject.toml" diff --git a/nvitop-exporter/requirements.txt b/nvitop-exporter/requirements.txt new file mode 100644 index 0000000..92ff8d6 --- /dev/null +++ b/nvitop-exporter/requirements.txt @@ -0,0 +1,3 @@ +# Sync with pyproject.toml and nvitop/version.py +nvitop == 1.2.0 +prometheus-client diff --git a/nvitop-exporter/setup.py b/nvitop-exporter/setup.py new file mode 100755 index 0000000..9104bc7 --- /dev/null +++ b/nvitop-exporter/setup.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 + +"""Setup script for ``nvitop-exporter``.""" + +import pathlib +import re +import sys + +from setuptools import setup + + +HERE = pathlib.Path(__file__).absolute().parent +VERSION_FILE = HERE / 'nvitop_exporter' / 'version.py' + +sys.path.insert(0, str(VERSION_FILE.parent)) +# pylint: disable-next=import-error,wrong-import-position +import version # noqa + + +VERSION_CONTENT = None + +try: + if not version.__release__: + try: + VERSION_CONTENT = VERSION_FILE.read_text(encoding='utf-8') + VERSION_FILE.write_text( + data=re.sub( + r"""__version__\s*=\s*('[^']+'|"[^"]+")""", + f'__version__ = {version.__version__!r}', + string=VERSION_CONTENT, + ), + encoding='utf-8', + ) + except OSError: + VERSION_CONTENT = None + + setup( + name='nvitop-exporter', + version=version.__version__, + ) +finally: + if VERSION_CONTENT is not None: + with VERSION_FILE.open(mode='wt', encoding='utf-8', newline='') as file: + file.write(VERSION_CONTENT) diff --git a/nvitop/api/__init__.py b/nvitop/api/__init__.py index 25227c2..fd4e814 100644 --- a/nvitop/api/__init__.py +++ b/nvitop/api/__init__.py @@ -29,18 +29,37 @@ from nvitop.api.device import ( ) from nvitop.api.libnvml import NVMLError, nvmlCheckReturn from nvitop.api.process import GpuProcess, HostProcess, command_join -from nvitop.api.utils import * # 
noqa: F403 +from nvitop.api.utils import ( # explicitly export these to appease mypy + NA, + SIZE_UNITS, + UINT_MAX, + ULONGLONG_MAX, + GiB, + KiB, + MiB, + NaType, + NotApplicable, + NotApplicableType, + PiB, + Snapshot, + TiB, + boolify, + bytes2human, + colored, + human2bytes, + set_color, + timedelta2human, + utilization2string, +) __all__ = [ - 'take_snapshots', - 'collect_in_background', - 'ResourceMetricCollector', - 'libnvml', - 'nvmlCheckReturn', 'NVMLError', + 'nvmlCheckReturn', + 'libnvml', 'libcuda', 'libcudart', + # nvitop.api.device 'Device', 'PhysicalDevice', 'MigDevice', @@ -48,9 +67,34 @@ __all__ = [ 'CudaMigDevice', 'parse_cuda_visible_devices', 'normalize_cuda_visible_devices', + # nvitop.api.process 'host', 'HostProcess', 'GpuProcess', 'command_join', - *utils.__all__, + # nvitop.api.collector + 'take_snapshots', + 'collect_in_background', + 'ResourceMetricCollector', + # nvitop.api.utils + 'NA', + 'NaType', + 'NotApplicable', + 'NotApplicableType', + 'UINT_MAX', + 'ULONGLONG_MAX', + 'KiB', + 'MiB', + 'GiB', + 'TiB', + 'PiB', + 'SIZE_UNITS', + 'bytes2human', + 'human2bytes', + 'timedelta2human', + 'utilization2string', + 'colored', + 'set_color', + 'boolify', + 'Snapshot', ] diff --git a/nvitop/api/device.py b/nvitop/api/device.py index 497d8b5..ae41fe3 100644 --- a/nvitop/api/device.py +++ b/nvitop/api/device.py @@ -1154,7 +1154,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me gpu_percent = gpu_utilization # in percentage - def memory_utilization(self) -> float | NaType: # in percentage + def memory_utilization(self) -> int | NaType: # in percentage """Percent of time over the past sample period during which global (device) memory was being read or written. The sample period may be between 1 second and 1/6 second depending on the product. 
@@ -1170,7 +1170,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me """ # pylint: disable=line-too-long return self.utilization_rates().memory - def encoder_utilization(self) -> float | NaType: # in percentage + def encoder_utilization(self) -> int | NaType: # in percentage """The encoder utilization rate in percentage. Returns: Union[int, NaType] @@ -1178,7 +1178,7 @@ class Device: # pylint: disable=too-many-instance-attributes,too-many-public-me """ return self.utilization_rates().encoder - def decoder_utilization(self) -> float | NaType: # in percentage\ + def decoder_utilization(self) -> int | NaType: # in percentage """The decoder utilization rate in percentage. Returns: Union[int, NaType] diff --git a/nvitop/api/utils.py b/nvitop/api/utils.py index 8b5b909..d0ac276 100644 --- a/nvitop/api/utils.py +++ b/nvitop/api/utils.py @@ -730,10 +730,11 @@ def memoize_when_activated(method: Method) -> Method: """ @functools.wraps(method) - def wrapped(self, *args, **kwargs): # noqa: ANN001,ANN002,ANN003,ANN202 + def wrapped(self: object, *args: Any, **kwargs: Any) -> Any: try: # case 1: we previously entered oneshot() ctx - ret = self._cache[method] # pylint: disable=protected-access + # pylint: disable-next=protected-access + ret = self._cache[method] # type: ignore[attr-defined] except AttributeError: # case 2: we never entered oneshot() ctx return method(self, *args, **kwargs) @@ -742,25 +743,28 @@ def memoize_when_activated(method: Method) -> Method: # for this entry yet ret = method(self, *args, **kwargs) try: - self._cache[method] = ret # pylint: disable=protected-access + # pylint: disable-next=protected-access + self._cache[method] = ret # type: ignore[attr-defined] except AttributeError: # multi-threading race condition, see: # https://github.com/giampaolo/psutil/issues/1948 pass return ret - def cache_activate(self): # noqa: ANN001,ANN202 + def cache_activate(self: object) -> None: """Activate cache. Expects an instance. 
Cache will be stored as a "_cache" instance attribute. """ if not hasattr(self, '_cache'): - self._cache = {} # pylint: disable=protected-access + # pylint: disable-next=protected-access + self._cache = {} # type: ignore[attr-defined] - def cache_deactivate(self): # noqa: ANN001,ANN202 + def cache_deactivate(self: object) -> None: """Deactivate and clear cache.""" try: - del self._cache # pylint: disable=protected-access + # pylint: disable-next=protected-access + del self._cache # type: ignore[attr-defined] except AttributeError: pass diff --git a/nvitop/cli.py b/nvitop/cli.py index bbb959d..61d856e 100644 --- a/nvitop/cli.py +++ b/nvitop/cli.py @@ -24,7 +24,7 @@ NVITOP_MONITOR_MODE = set( # pylint: disable=too-many-branches,too-many-statements def parse_arguments() -> argparse.Namespace: - """Parse command-line arguments for ``nvtiop``.""" + """Parse command-line arguments for ``nvitop``.""" coloring_rules = '{} < th1 %% <= {} < th2 %% <= {}'.format( colored('light', 'green'), colored('moderate', 'yellow'), diff --git a/pyproject.toml b/pyproject.toml index f7eec77..9dd8f96 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,8 @@ target-version = ["py37", "py38", "py39", "py310", "py311"] [tool.isort] atomic = true profile = "black" -src_paths = ["nvitop"] +src_paths = ["nvitop", "nvitop-exporter/nvitop_exporter"] +known_first_party = ["nvitop", "nvitop_exporter"] indent = 4 line_length = 100 lines_after_imports = 2 @@ -85,14 +86,16 @@ multi_line_output = 3 [tool.mypy] # Sync with requires-python python_version = 3.8 # appease mypy for syntax errors in numpy stubs +mypy_path = [".", "nvitop-exporter"] +exclude = ["nvitop-exporter/setup.py"] pretty = true show_error_codes = true show_error_context = true show_traceback = true allow_redefinition = true check_untyped_defs = true -disallow_incomplete_defs = false -disallow_untyped_defs = false +disallow_incomplete_defs = true +disallow_untyped_defs = true ignore_missing_imports = true no_implicit_optional = 
true strict_equality = true @@ -119,7 +122,7 @@ ignore-words = "docs/source/spelling_wordlist.txt" target-version = "py37" line-length = 100 show-source = true -src = ["nvitop"] +src = ["nvitop", "nvitop-exporter/nvitop_exporter"] select = [ "E", "W", # pycodestyle "F", # pyflakes