fix(callbacks/lightning): populate callback for lightning (#114)

Xuehai Pan 2023-12-17 19:13:19 +08:00 committed by GitHub
parent b50b83767b
commit bff355bcc4
9 changed files with 203 additions and 17 deletions

.pre-commit-config.yaml

@@ -25,16 +25,16 @@ repos:
       - id: debug-statements
       - id: double-quote-string-fixer
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.7
+    rev: v0.1.8
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
   - repo: https://github.com/PyCQA/isort
-    rev: 5.13.0
+    rev: 5.13.2
     hooks:
       - id: isort
   - repo: https://github.com/psf/black
-    rev: 23.11.0
+    rev: 23.12.0
     hooks:
       - id: black
   - repo: https://github.com/asottile/pyupgrade

CHANGELOG.md

@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Add separate implementation for `GpuStatsLogger` callback for `lightning` by [@XuehaiPan](https://github.com/XuehaiPan) in [#114](https://github.com/XuehaiPan/nvitop/pull/114).
 - Remove metrics if process is gone in `nvitop-exporter` by [@XuehaiPan](https://github.com/XuehaiPan) in [#107](https://github.com/XuehaiPan/nvitop/pull/107).
 
 ### Changed

README.md

@@ -577,11 +577,11 @@ model.fit(.., callbacks=[gpu_stats, tb_callback])
 **NOTE:** Users should assign a `keras.callbacks.TensorBoard` callback or a `keras.callbacks.CSVLogger` callback to the model. And the `GpuStatsLogger` callback should be placed before the `keras.callbacks.TensorBoard` / `keras.callbacks.CSVLogger` callback.
 
-#### Callback for [PyTorch Lightning](https://pytorchlightning.ai)
+#### Callback for [PyTorch Lightning](https://lightning.ai)
 
 ```python
-from pytorch_lightning import Trainer
-from nvitop.callbacks.pytorch_lightning import GpuStatsLogger
+from lightning.pytorch import Trainer
+from nvitop.callbacks.lightning import GpuStatsLogger
 
 gpu_stats = GpuStatsLogger()
 trainer = Trainer(gpus=[..], logger=True, callbacks=[gpu_stats])
 ```
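Note that the README context line above still shows the Lightning 1.x spelling `Trainer(gpus=[..], ...)`; in `lightning.pytorch` 2.x the `gpus=` argument was removed in favor of `accelerator=`/`devices=`. A minimal usage sketch of the new import path (not part of the diff), assuming Lightning >= 2.0; `MyModel` is a hypothetical `LightningModule`:

```python
from lightning.pytorch import Trainer
from nvitop.callbacks.lightning import GpuStatsLogger

gpu_stats = GpuStatsLogger(intra_step_time=True, inter_step_time=True)
trainer = Trainer(
    accelerator='gpu',
    devices=1,  # Lightning >= 2.0 replaces the removed `gpus=` argument
    logger=True,  # required: the callback raises MisconfigurationException otherwise
    callbacks=[gpu_stats],
)
trainer.fit(MyModel())  # `MyModel` is a hypothetical LightningModule
```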

docs/requirements.txt

@@ -4,5 +4,6 @@ sphinx-autobuild
 sphinx-copybutton
 sphinx-rtd-theme
 
+lightning >= 2.0.0, < 3.0.0a0
 pytorch-lightning >= 1.5.0, < 2.0.0a0
 tensorflow-cpu >= 2.0.0, < 2.12.0a0

docs/source/api/callbacks.rst

@@ -12,6 +12,14 @@ nvitop.callbacks.keras module
     :undoc-members:
     :show-inheritance:
 
+nvitop.callbacks.lightning module
+---------------------------------
+
+.. automodule:: nvitop.callbacks.lightning
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 nvitop.callbacks.pytorch\_lightning module
 ------------------------------------------

docs/source/spelling_wordlist.txt

@@ -153,3 +153,4 @@ ThroughputInfo
 pytorch
 api
 utils
+GpuStatsLogger

nvitop/callbacks/keras.py

@@ -23,15 +23,14 @@ from __future__ import annotations
 import re
 import time
 
-from tensorflow.python.keras.callbacks import (  # pylint: disable=import-error,no-name-in-module
-    Callback,
-)
+# pylint: disable-next=import-error,no-name-in-module
+from tensorflow.python.keras.callbacks import Callback
 
 from nvitop.api import libnvml
 from nvitop.callbacks.utils import get_devices_by_logical_ids, get_gpu_stats
 
 
-# Ported version of .pytorch_lightning.GpuStatsLogger for Keras
+# Ported version of nvitop.callbacks.lightning.GpuStatsLogger for Keras
 class GpuStatsLogger(Callback):  # pylint: disable=too-many-instance-attributes
     """Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a callback and
     in order to use it you need to assign a TensorBoard callback or a CSVLogger callback to the model.
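The docstring requirement above exists because Keras runs the callback list in order: `GpuStatsLogger` injects GPU stats into the shared `logs` dict, and only a later `TensorBoard`/`CSVLogger` callback will write them out. A minimal sketch (not part of the diff), assuming a machine with an NVIDIA driver; the model and data are placeholders:

```python
import numpy as np
import tensorflow as tf

from nvitop.callbacks.keras import GpuStatsLogger

# Placeholder model and data; only the callback ordering matters here.
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer='adam', loss='mse')

gpu_stats = GpuStatsLogger()  # adds GPU stats to the `logs` dict
tb_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs')  # writes `logs` out

# GpuStatsLogger must come before TensorBoard/CSVLogger so the stats it adds
# are already present in `logs` when the writer callback runs.
model.fit(np.random.rand(32, 4), np.random.rand(32, 1), callbacks=[gpu_stats, tb_callback])
```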

nvitop/callbacks/lightning.py

@@ -15,7 +15,173 @@
 # limitations under the License.
 # ==============================================================================
 
-# pylint: disable=missing-module-docstring
+# pylint: disable=missing-module-docstring,missing-function-docstring
+# pylint: disable=unused-argument,attribute-defined-outside-init
 
-# pylint: disable-next=unused-import
-from nvitop.callbacks.pytorch_lightning import GpuStatsLogger  # noqa: F401
+from __future__ import annotations
+
+import time
+from typing import Any
+
+import lightning.pytorch as pl  # pylint: disable=import-error
+from lightning.pytorch.callbacks import Callback  # pylint: disable=import-error
+from lightning.pytorch.utilities import rank_zero_only  # pylint: disable=import-error
+from lightning.pytorch.utilities.exceptions import (  # pylint: disable=import-error
+    MisconfigurationException,
+)
+
+from nvitop.api import libnvml
+from nvitop.callbacks.utils import get_devices_by_logical_ids, get_gpu_stats
+
+
+# Modified from pytorch_lightning.callbacks.GPUStatsMonitor
+class GpuStatsLogger(Callback):  # pylint: disable=too-many-instance-attributes
+    """Automatically log GPU stats during the training stage. :class:`GpuStatsLogger` is a
+    callback and in order to use it you need to assign a logger in the ``Trainer``.
+
+    Args:
+        memory_utilization (bool):
+            Set to :data:`True` to log used, free and the percentage of memory utilization at the
+            start and end of each step. Default: :data:`True`.
+        gpu_utilization (bool):
+            Set to :data:`True` to log the percentage of GPU utilization at the start and end of
+            each step. Default: :data:`True`.
+        intra_step_time (bool):
+            Set to :data:`True` to log the time of each step. Default: :data:`False`.
+        inter_step_time (bool):
+            Set to :data:`True` to log the time between the end of one step and the start of the
+            next step. Default: :data:`False`.
+        fan_speed (bool):
+            Set to :data:`True` to log the percentage of fan speed. Default: :data:`False`.
+        temperature (bool):
+            Set to :data:`True` to log the GPU temperature in degrees Celsius.
+            Default: :data:`False`.
+
+    Raises:
+        MisconfigurationException:
+            If the NVIDIA driver is not installed, the ``Trainer`` is not running on GPUs, or the
+            ``Trainer`` has no logger.
+
+    Examples:
+        >>> from lightning.pytorch import Trainer
+        >>> from nvitop.callbacks.lightning import GpuStatsLogger
+        >>> gpu_stats = GpuStatsLogger()
+        >>> trainer = Trainer(accelerator='gpu', devices=[..], logger=True, callbacks=[gpu_stats])
+
+    GPU stats are mainly based on NVML queries. The description of the queries is as follows:
+
+    - **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan
+      is currently intended to run at. It ranges from 0 to 100%. Note: The reported speed is the
+      intended fan speed. If the fan is physically blocked and unable to spin, this output will
+      not match the actual fan speed. Many parts do not report fan speeds because they rely on
+      cooling via fans in the surrounding enclosure.
+    - **memory.used** - Total memory allocated by active contexts, in MiBs.
+    - **memory.free** - Total free memory, in MiBs.
+    - **utilization.gpu** - Percent of time over the past sample period during which one or more
+      kernels were executing on the GPU. The sample period may be between 1 second and 1/6 second
+      depending on the product.
+    - **utilization.memory** - Percent of time over the past sample period during which global
+      (device) memory was being read or written. The sample period may be between 1 second and
+      1/6 second depending on the product.
+    - **temperature** - Core GPU temperature, in degrees C.
+    """
+
+    def __init__(  # pylint: disable=too-many-arguments
+        self,
+        memory_utilization: bool = True,
+        gpu_utilization: bool = True,
+        intra_step_time: bool = False,
+        inter_step_time: bool = False,
+        fan_speed: bool = False,
+        temperature: bool = False,
+    ) -> None:
+        super().__init__()
+
+        try:
+            libnvml.nvmlInit()
+        except libnvml.NVMLError as ex:
+            raise MisconfigurationException(
+                'Cannot use GpuStatsLogger callback because NVIDIA driver is not installed.',
+            ) from ex
+
+        self._memory_utilization = memory_utilization
+        self._gpu_utilization = gpu_utilization
+        self._intra_step_time = intra_step_time
+        self._inter_step_time = inter_step_time
+        self._fan_speed = fan_speed
+        self._temperature = temperature
+
+    def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
+        if not trainer.logger:
+            raise MisconfigurationException(
+                'Cannot use GpuStatsLogger callback with Trainer that has no logger.',
+            )
+
+        if trainer.strategy.root_device.type != 'cuda':
+            raise MisconfigurationException(
+                f'You are using GpuStatsLogger but are not running on GPU. '
+                f'The root device type is {trainer.strategy.root_device.type}.',
+            )
+
+        device_ids = trainer.device_ids
+        try:
+            self._devices = get_devices_by_logical_ids(device_ids, unique=True)
+        except (libnvml.NVMLError, RuntimeError) as ex:
+            raise ValueError(
+                f'Cannot use GpuStatsLogger callback because devices unavailable. '
+                f'Received: `devices={device_ids}`',
+            ) from ex
+
+    def on_train_epoch_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
+        self._snap_intra_step_time = None
+        self._snap_inter_step_time = None
+
+    @rank_zero_only
+    def on_train_batch_start(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        if self._intra_step_time:
+            self._snap_intra_step_time = time.monotonic()
+
+        logs = self._get_gpu_stats()
+
+        if self._inter_step_time and self._snap_inter_step_time:
+            # First log at the beginning of the second step
+            logs['batch_time/inter_step (ms)'] = 1000.0 * (
+                time.monotonic() - self._snap_inter_step_time
+            )
+
+        trainer.logger.log_metrics(logs, step=trainer.global_step)
+
+    @rank_zero_only
+    def on_train_batch_end(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        if self._inter_step_time:
+            self._snap_inter_step_time = time.monotonic()
+
+        logs = self._get_gpu_stats()
+
+        if self._intra_step_time and self._snap_intra_step_time:
+            logs['batch_time/intra_step (ms)'] = 1000.0 * (
+                time.monotonic() - self._snap_intra_step_time
+            )
+
+        trainer.logger.log_metrics(logs, step=trainer.global_step)
+
+    def _get_gpu_stats(self) -> dict[str, float]:
+        """Get the GPU status from NVML queries."""
+        return get_gpu_stats(
+            devices=self._devices,
+            memory_utilization=self._memory_utilization,
+            gpu_utilization=self._gpu_utilization,
+            fan_speed=self._fan_speed,
+            temperature=self._temperature,
+        )
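For orientation, an end-to-end sketch of the new callback (not part of the commit), assuming a CUDA machine with the NVIDIA driver and `lightning >= 2.0`; `RandomDataset` and `BoringModel` are hypothetical stand-ins:

```python
import torch
from torch.utils.data import DataLoader, Dataset

from lightning.pytorch import LightningModule, Trainer
from nvitop.callbacks.lightning import GpuStatsLogger


class RandomDataset(Dataset):  # hypothetical toy dataset
    def __len__(self) -> int:
        return 64

    def __getitem__(self, index: int) -> torch.Tensor:
        return torch.randn(32)


class BoringModel(LightningModule):  # hypothetical toy model
    def __init__(self) -> None:
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        return self.layer(batch).sum()  # scalar loss

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


trainer = Trainer(
    accelerator='gpu',
    devices=1,
    max_epochs=1,
    logger=True,  # the callback raises MisconfigurationException without a logger
    # With the timing flags enabled, `batch_time/intra_step (ms)` and
    # `batch_time/inter_step (ms)` are logged alongside the per-device stats.
    callbacks=[GpuStatsLogger(intra_step_time=True, inter_step_time=True)],
)
trainer.fit(BoringModel(), DataLoader(RandomDataset(), batch_size=8))
```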

nvitop/callbacks/pytorch_lightning.py

@@ -21,7 +21,9 @@
 from __future__ import annotations
 
 import time
+from typing import Any
 
+import pytorch_lightning as pl  # pylint: disable=import-error
 from pytorch_lightning.callbacks import Callback  # pylint: disable=import-error
 from pytorch_lightning.utilities import rank_zero_only  # pylint: disable=import-error
 from pytorch_lightning.utilities.exceptions import (  # pylint: disable=import-error
@@ -107,7 +109,7 @@ class GpuStatsLogger(Callback):  # pylint: disable=too-many-instance-attributes
         self._fan_speed = fan_speed
         self._temperature = temperature
 
-    def on_train_start(self, trainer, pl_module) -> None:
+    def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
         if not trainer.logger:
             raise MisconfigurationException(
                 'Cannot use GpuStatsLogger callback with Trainer that has no logger.',
@@ -132,12 +134,18 @@ class GpuStatsLogger(Callback):  # pylint: disable=too-many-instance-attributes
                 f'Received: `gpus={device_ids}`',
             ) from ex
 
-    def on_train_epoch_start(self, trainer, pl_module) -> None:
+    def on_train_epoch_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
         self._snap_intra_step_time = None
         self._snap_inter_step_time = None
 
     @rank_zero_only
-    def on_train_batch_start(self, trainer, **kwargs) -> None:  # pylint: disable=arguments-differ
+    def on_train_batch_start(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
         if self._intra_step_time:
             self._snap_intra_step_time = time.monotonic()
@@ -152,7 +160,13 @@ class GpuStatsLogger(Callback):  # pylint: disable=too-many-instance-attributes
         trainer.logger.log_metrics(logs, step=trainer.global_step)
 
     @rank_zero_only
-    def on_train_batch_end(self, trainer, **kwargs) -> None:  # pylint: disable=arguments-differ
+    def on_train_batch_end(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
         if self._inter_step_time:
             self._snap_inter_step_time = time.monotonic()
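Background on the widened hook signatures in both files: the trainer dispatches callback hooks positionally, roughly as `hook(trainer, pl_module, batch, batch_idx, ...)`, and the set of trailing positionals varies across PyTorch Lightning versions. Explicit `trainer`/`pl_module` parameters plus `*args`/`**kwargs` wildcards keep the overrides compatible, since `**kwargs` alone cannot absorb positional arguments. A hedged sketch of a compatible override (`StepCounter` is hypothetical):

```python
from typing import Any

import pytorch_lightning as pl
from pytorch_lightning.callbacks import Callback


class StepCounter(Callback):  # hypothetical minimal example
    """Count training batches with hook signatures that work across PL versions."""

    def __init__(self) -> None:
        super().__init__()
        self.count = 0

    def on_train_batch_start(
        self,
        trainer: pl.Trainer,
        pl_module: pl.LightningModule,
        *args: Any,  # absorbs (batch, batch_idx[, dataloader_idx]) positionals
        **kwargs: Any,
    ) -> None:
        self.count += 1
```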