fix(callbacks/lightning): populate callback for lightning (#114)

Xuehai Pan 2023-12-17 19:13:19 +08:00 committed by GitHub
parent b50b83767b
commit bff355bcc4
9 changed files with 203 additions and 17 deletions

.pre-commit-config.yaml

@@ -25,16 +25,16 @@ repos:
       - id: debug-statements
       - id: double-quote-string-fixer
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.7
+    rev: v0.1.8
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
   - repo: https://github.com/PyCQA/isort
-    rev: 5.13.0
+    rev: 5.13.2
     hooks:
       - id: isort
   - repo: https://github.com/psf/black
-    rev: 23.11.0
+    rev: 23.12.0
     hooks:
       - id: black
   - repo: https://github.com/asottile/pyupgrade

CHANGELOG.md

@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Add separate implementation for `GpuStatsLogger` callback for `lightning` by [@XuehaiPan](https://github.com/XuehaiPan) in [#114](https://github.com/XuehaiPan/nvitop/pull/114).
 - Remove metrics if process is gone in `nvitop-exporter` by [@XuehaiPan](https://github.com/XuehaiPan) in [#107](https://github.com/XuehaiPan/nvitop/pull/107).
 
 ### Changed

README.md

@@ -577,11 +577,11 @@ model.fit(.., callbacks=[gpu_stats, tb_callback])
 **NOTE:** Users should assign a `keras.callbacks.TensorBoard` callback or a `keras.callbacks.CSVLogger` callback to the model. And the `GpuStatsLogger` callback should be placed before the `keras.callbacks.TensorBoard` / `keras.callbacks.CSVLogger` callback.
 
-#### Callback for [PyTorch Lightning](https://pytorchlightning.ai)
+#### Callback for [PyTorch Lightning](https://lightning.ai)
 
 ```python
-from pytorch_lightning import Trainer
-from nvitop.callbacks.pytorch_lightning import GpuStatsLogger
+from lightning.pytorch import Trainer
+from nvitop.callbacks.lightning import GpuStatsLogger
 
 gpu_stats = GpuStatsLogger()
 trainer = Trainer(gpus=[..], logger=True, callbacks=[gpu_stats])
 ```
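Note that the README context line above still shows the Lightning 1.x spelling `Trainer(gpus=[..], ...)`; in `lightning.pytorch` 2.x the `gpus=` argument was removed in favor of `accelerator=`/`devices=`. A minimal usage sketch of the new import path (not part of the diff), assuming Lightning >= 2.0; `MyModel` is a hypothetical `LightningModule`:

```python
from lightning.pytorch import Trainer
from nvitop.callbacks.lightning import GpuStatsLogger

gpu_stats = GpuStatsLogger(intra_step_time=True, inter_step_time=True)
trainer = Trainer(
    accelerator='gpu',
    devices=1,  # Lightning >= 2.0 replaces the removed `gpus=` argument
    logger=True,  # required: the callback raises MisconfigurationException otherwise
    callbacks=[gpu_stats],
)
trainer.fit(MyModel())  # `MyModel` is a hypothetical LightningModule
```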

docs/requirements.txt

@@ -4,5 +4,6 @@ sphinx-autobuild
 sphinx-copybutton
 sphinx-rtd-theme
 
+lightning >= 2.0.0, < 3.0.0a0
 pytorch-lightning >= 1.5.0, < 2.0.0a0
 tensorflow-cpu >= 2.0.0, < 2.12.0a0

docs/source/api/callbacks.rst

@@ -12,6 +12,14 @@ nvitop.callbacks.keras module
     :undoc-members:
     :show-inheritance:
 
+nvitop.callbacks.lightning module
+---------------------------------
+
+.. automodule:: nvitop.callbacks.lightning
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 nvitop.callbacks.pytorch\_lightning module
 ------------------------------------------

docs/source/spelling_wordlist.txt

@@ -153,3 +153,4 @@ ThroughputInfo
 pytorch
 api
 utils
+GpuStatsLogger

nvitop/callbacks/keras.py

@@ -23,15 +23,14 @@ from __future__ import annotations
 import re
 import time
 
-from tensorflow.python.keras.callbacks import (  # pylint: disable=import-error,no-name-in-module
-    Callback,
-)
+# pylint: disable-next=import-error,no-name-in-module
+from tensorflow.python.keras.callbacks import Callback
 
 from nvitop.api import libnvml
 from nvitop.callbacks.utils import get_devices_by_logical_ids, get_gpu_stats
 
 
-# Ported version of .pytorch_lightning.GpuStatsLogger for Keras
+# Ported version of nvitop.callbacks.lightning.GpuStatsLogger for Keras
 class GpuStatsLogger(Callback):  # pylint: disable=too-many-instance-attributes
     """Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a callback and
     in order to use it you need to assign a TensorBoard callback or a CSVLogger callback to the model.
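The docstring requirement above exists because Keras runs the callback list in order: `GpuStatsLogger` injects GPU stats into the shared `logs` dict, and only a later `TensorBoard`/`CSVLogger` callback will write them out. A minimal sketch (not part of the diff), assuming a machine with an NVIDIA driver; the model and data are placeholders:

```python
import numpy as np
import tensorflow as tf

from nvitop.callbacks.keras import GpuStatsLogger

# Placeholder model and data; only the callback ordering matters here.
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer='adam', loss='mse')

gpu_stats = GpuStatsLogger()  # adds GPU stats to the `logs` dict
tb_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs')  # writes `logs` out

# GpuStatsLogger must come before TensorBoard/CSVLogger so the stats it adds
# are already present in `logs` when the writer callback runs.
model.fit(np.random.rand(32, 4), np.random.rand(32, 1), callbacks=[gpu_stats, tb_callback])
```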

nvitop/callbacks/lightning.py

@@ -15,7 +15,173 @@
 # limitations under the License.
 # ==============================================================================
 
-# pylint: disable=missing-module-docstring
+# pylint: disable=missing-module-docstring,missing-function-docstring
+# pylint: disable=unused-argument,attribute-defined-outside-init
 
-# pylint: disable-next=unused-import
-from nvitop.callbacks.pytorch_lightning import GpuStatsLogger  # noqa: F401
+from __future__ import annotations
+
+import time
+from typing import Any
+
+import lightning.pytorch as pl  # pylint: disable=import-error
+from lightning.pytorch.callbacks import Callback  # pylint: disable=import-error
+from lightning.pytorch.utilities import rank_zero_only  # pylint: disable=import-error
+from lightning.pytorch.utilities.exceptions import (  # pylint: disable=import-error
+    MisconfigurationException,
+)
+
+from nvitop.api import libnvml
+from nvitop.callbacks.utils import get_devices_by_logical_ids, get_gpu_stats
+
+
+# Modified from pytorch_lightning.callbacks.GPUStatsMonitor
+class GpuStatsLogger(Callback):  # pylint: disable=too-many-instance-attributes
+    """Automatically log GPU stats during the training stage. :class:`GpuStatsLogger` is a
+    callback and in order to use it you need to assign a logger in the ``Trainer``.
+
+    Args:
+        memory_utilization (bool):
+            Set to :data:`True` to log used, free and the percentage of memory utilization at the
+            start and end of each step. Default: :data:`True`.
+        gpu_utilization (bool):
+            Set to :data:`True` to log the percentage of GPU utilization at the start and end of
+            each step. Default: :data:`True`.
+        intra_step_time (bool):
+            Set to :data:`True` to log the time of each step. Default: :data:`False`.
+        inter_step_time (bool):
+            Set to :data:`True` to log the time between the end of one step and the start of the
+            next step. Default: :data:`False`.
+        fan_speed (bool):
+            Set to :data:`True` to log the percentage of fan speed. Default: :data:`False`.
+        temperature (bool):
+            Set to :data:`True` to log the GPU temperature in degrees Celsius.
+            Default: :data:`False`.
+
+    Raises:
+        MisconfigurationException:
+            If the NVIDIA driver is not installed, the ``Trainer`` is not running on GPUs, or the
+            ``Trainer`` has no logger.
+
+    Examples:
+        >>> from lightning.pytorch import Trainer
+        >>> from nvitop.callbacks.lightning import GpuStatsLogger
+        >>> gpu_stats = GpuStatsLogger()
+        >>> trainer = Trainer(accelerator='gpu', devices=[..], logger=True, callbacks=[gpu_stats])
+
+    GPU stats are mainly based on NVML queries. The description of the queries is as follows:
+
+    - **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan
+      is currently intended to run at. It ranges from 0 to 100%. Note: The reported speed is the
+      intended fan speed. If the fan is physically blocked and unable to spin, this output will
+      not match the actual fan speed. Many parts do not report fan speeds because they rely on
+      cooling via fans in the surrounding enclosure.
+    - **memory.used** - Total memory allocated by active contexts, in MiBs.
+    - **memory.free** - Total free memory, in MiBs.
+    - **utilization.gpu** - Percent of time over the past sample period during which one or more
+      kernels were executing on the GPU. The sample period may be between 1 second and 1/6 second
+      depending on the product.
+    - **utilization.memory** - Percent of time over the past sample period during which global
+      (device) memory was being read or written. The sample period may be between 1 second and
+      1/6 second depending on the product.
+    - **temperature** - Core GPU temperature, in degrees C.
+    """
+
+    def __init__(  # pylint: disable=too-many-arguments
+        self,
+        memory_utilization: bool = True,
+        gpu_utilization: bool = True,
+        intra_step_time: bool = False,
+        inter_step_time: bool = False,
+        fan_speed: bool = False,
+        temperature: bool = False,
+    ) -> None:
+        super().__init__()
+
+        try:
+            libnvml.nvmlInit()
+        except libnvml.NVMLError as ex:
+            raise MisconfigurationException(
+                'Cannot use GpuStatsLogger callback because NVIDIA driver is not installed.',
+            ) from ex
+
+        self._memory_utilization = memory_utilization
+        self._gpu_utilization = gpu_utilization
+        self._intra_step_time = intra_step_time
+        self._inter_step_time = inter_step_time
+        self._fan_speed = fan_speed
+        self._temperature = temperature
+
+    def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
+        if not trainer.logger:
+            raise MisconfigurationException(
+                'Cannot use GpuStatsLogger callback with Trainer that has no logger.',
+            )
+
+        if trainer.strategy.root_device.type != 'cuda':
+            raise MisconfigurationException(
+                f'You are using GpuStatsLogger but are not running on GPU. '
+                f'The root device type is {trainer.strategy.root_device.type}.',
+            )
+
+        device_ids = trainer.device_ids
+        try:
+            self._devices = get_devices_by_logical_ids(device_ids, unique=True)
+        except (libnvml.NVMLError, RuntimeError) as ex:
+            raise ValueError(
+                f'Cannot use GpuStatsLogger callback because devices unavailable. '
+                f'Received: `devices={device_ids}`',
+            ) from ex
+
+    def on_train_epoch_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
+        self._snap_intra_step_time = None
+        self._snap_inter_step_time = None
+
+    @rank_zero_only
+    def on_train_batch_start(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        if self._intra_step_time:
+            self._snap_intra_step_time = time.monotonic()
+
+        logs = self._get_gpu_stats()
+
+        if self._inter_step_time and self._snap_inter_step_time:
+            # First log at the beginning of the second step
+            logs['batch_time/inter_step (ms)'] = 1000.0 * (
+                time.monotonic() - self._snap_inter_step_time
+            )
+
+        trainer.logger.log_metrics(logs, step=trainer.global_step)
+
+    @rank_zero_only
+    def on_train_batch_end(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        if self._inter_step_time:
+            self._snap_inter_step_time = time.monotonic()
+
+        logs = self._get_gpu_stats()
+
+        if self._intra_step_time and self._snap_intra_step_time:
+            logs['batch_time/intra_step (ms)'] = 1000.0 * (
+                time.monotonic() - self._snap_intra_step_time
+            )
+
+        trainer.logger.log_metrics(logs, step=trainer.global_step)
+
+    def _get_gpu_stats(self) -> dict[str, float]:
+        """Get the GPU status from NVML queries."""
+        return get_gpu_stats(
+            devices=self._devices,
+            memory_utilization=self._memory_utilization,
+            gpu_utilization=self._gpu_utilization,
+            fan_speed=self._fan_speed,
+            temperature=self._temperature,
+        )
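For orientation, an end-to-end sketch of the new callback (not part of the commit), assuming a CUDA machine with the NVIDIA driver and `lightning >= 2.0`; `RandomDataset` and `BoringModel` are hypothetical stand-ins:

```python
import torch
from torch.utils.data import DataLoader, Dataset

from lightning.pytorch import LightningModule, Trainer
from nvitop.callbacks.lightning import GpuStatsLogger


class RandomDataset(Dataset):  # hypothetical toy dataset
    def __len__(self) -> int:
        return 64

    def __getitem__(self, index: int) -> torch.Tensor:
        return torch.randn(32)


class BoringModel(LightningModule):  # hypothetical toy model
    def __init__(self) -> None:
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        return self.layer(batch).sum()  # scalar loss

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


trainer = Trainer(
    accelerator='gpu',
    devices=1,
    max_epochs=1,
    logger=True,  # the callback raises MisconfigurationException without a logger
    # With the timing flags enabled, `batch_time/intra_step (ms)` and
    # `batch_time/inter_step (ms)` are logged alongside the per-device stats.
    callbacks=[GpuStatsLogger(intra_step_time=True, inter_step_time=True)],
)
trainer.fit(BoringModel(), DataLoader(RandomDataset(), batch_size=8))
```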

nvitop/callbacks/pytorch_lightning.py

@@ -21,7 +21,9 @@
 from __future__ import annotations
 
 import time
+from typing import Any
 
+import pytorch_lightning as pl  # pylint: disable=import-error
 from pytorch_lightning.callbacks import Callback  # pylint: disable=import-error
 from pytorch_lightning.utilities import rank_zero_only  # pylint: disable=import-error
 from pytorch_lightning.utilities.exceptions import (  # pylint: disable=import-error
@@ -107,7 +109,7 @@ class GpuStatsLogger(Callback):  # pylint: disable=too-many-instance-attributes
         self._fan_speed = fan_speed
         self._temperature = temperature
 
-    def on_train_start(self, trainer, pl_module) -> None:
+    def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
         if not trainer.logger:
             raise MisconfigurationException(
                 'Cannot use GpuStatsLogger callback with Trainer that has no logger.',
@@ -132,12 +134,18 @@ class GpuStatsLogger(Callback):  # pylint: disable=too-many-instance-attributes
                 f'Received: `gpus={device_ids}`',
             ) from ex
 
-    def on_train_epoch_start(self, trainer, pl_module) -> None:
+    def on_train_epoch_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
         self._snap_intra_step_time = None
         self._snap_inter_step_time = None
 
     @rank_zero_only
-    def on_train_batch_start(self, trainer, **kwargs) -> None:  # pylint: disable=arguments-differ
+    def on_train_batch_start(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
         if self._intra_step_time:
             self._snap_intra_step_time = time.monotonic()
@@ -152,7 +160,13 @@ class GpuStatsLogger(Callback):  # pylint: disable=too-many-instance-attributes
         trainer.logger.log_metrics(logs, step=trainer.global_step)
 
     @rank_zero_only
-    def on_train_batch_end(self, trainer, **kwargs) -> None:  # pylint: disable=arguments-differ
+    def on_train_batch_end(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
         if self._inter_step_time:
             self._snap_inter_step_time = time.monotonic()
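Background on the widened hook signatures in both files: the trainer dispatches callback hooks positionally, roughly as `hook(trainer, pl_module, batch, batch_idx, ...)`, and the set of trailing positionals varies across PyTorch Lightning versions. Explicit `trainer`/`pl_module` parameters plus `*args`/`**kwargs` wildcards keep the overrides compatible, since `**kwargs` alone cannot absorb positional arguments. A hedged sketch of a compatible override (`StepCounter` is hypothetical):

```python
from typing import Any

import pytorch_lightning as pl
from pytorch_lightning.callbacks import Callback


class StepCounter(Callback):  # hypothetical minimal example
    """Count training batches with hook signatures that work across PL versions."""

    def __init__(self) -> None:
        super().__init__()
        self.count = 0

    def on_train_batch_start(
        self,
        trainer: pl.Trainer,
        pl_module: pl.LightningModule,
        *args: Any,  # absorbs (batch, batch_idx[, dataloader_idx]) positionals
        **kwargs: Any,
    ) -> None:
        self.count += 1
```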