mirror of
https://github.com/XuehaiPan/nvitop.git
synced 2026-05-15 14:15:55 -06:00
fix(callbacks/lightning): populate callback for lightning (#114)
This commit is contained in:
parent
b50b83767b
commit
bff355bcc4
9 changed files with 203 additions and 17 deletions
|
|
@ -25,16 +25,16 @@ repos:
|
|||
- id: debug-statements
|
||||
- id: double-quote-string-fixer
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.1.7
|
||||
rev: v0.1.8
|
||||
hooks:
|
||||
- id: ruff
|
||||
args: [--fix, --exit-non-zero-on-fix]
|
||||
- repo: https://github.com/PyCQA/isort
|
||||
rev: 5.13.0
|
||||
rev: 5.13.2
|
||||
hooks:
|
||||
- id: isort
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 23.11.0
|
||||
rev: 23.12.0
|
||||
hooks:
|
||||
- id: black
|
||||
- repo: https://github.com/asottile/pyupgrade
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
|
||||
### Added
|
||||
|
||||
- Add separate implementation for `GpuStatsLogger` callback for `lightning` by [@XuehaiPan](https://github.com/XuehaiPan) in [#114](https://github.com/XuehaiPan/nvitop/pull/114).
|
||||
- Remove metrics if process is gone in `nvitop-exporter` by [@XuehaiPan](https://github.com/XuehaiPan) in [#107](https://github.com/XuehaiPan/nvitop/pull/107).
|
||||
|
||||
### Changed
|
||||
|
|
|
|||
|
|
@ -577,11 +577,11 @@ model.fit(.., callbacks=[gpu_stats, tb_callback])
|
|||
|
||||
**NOTE:** Users should assign a `keras.callbacks.TensorBoard` callback or a `keras.callbacks.CSVLogger` callback to the model, and the `GpuStatsLogger` callback should be placed before the `keras.callbacks.TensorBoard` / `keras.callbacks.CSVLogger` callback.
|
||||
|
||||
#### Callback for [PyTorch Lightning](https://pytorchlightning.ai)
|
||||
#### Callback for [PyTorch Lightning](https://lightning.ai)
|
||||
|
||||
```python
|
||||
from pytorch_lightning import Trainer
|
||||
from nvitop.callbacks.pytorch_lightning import GpuStatsLogger
|
||||
from lightning.pytorch import Trainer
|
||||
from nvitop.callbacks.lightning import GpuStatsLogger
|
||||
gpu_stats = GpuStatsLogger()
|
||||
trainer = Trainer(accelerator='gpu', devices=[..], logger=True, callbacks=[gpu_stats])
|
||||
```
|
||||
|
|
|
|||
|
|
@ -4,5 +4,6 @@ sphinx-autobuild
|
|||
sphinx-copybutton
|
||||
sphinx-rtd-theme
|
||||
|
||||
lightning >= 2.0.0, < 3.0.0a0
|
||||
pytorch-lightning >= 1.5.0, < 2.0.0a0
|
||||
tensorflow-cpu >= 2.0.0, < 2.12.0a0
|
||||
|
|
|
|||
|
|
@ -12,6 +12,14 @@ nvitop.callbacks.keras module
|
|||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
nvitop.callbacks.lightning module
|
||||
---------------------------------
|
||||
|
||||
.. automodule:: nvitop.callbacks.lightning
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
nvitop.callbacks.pytorch\_lightning module
|
||||
------------------------------------------
|
||||
|
||||
|
|
|
|||
|
|
@ -153,3 +153,4 @@ ThroughputInfo
|
|||
pytorch
|
||||
api
|
||||
utils
|
||||
GpuStatsLogger
|
||||
|
|
|
|||
|
|
@ -23,15 +23,14 @@ from __future__ import annotations
|
|||
import re
|
||||
import time
|
||||
|
||||
from tensorflow.python.keras.callbacks import ( # pylint: disable=import-error,no-name-in-module
|
||||
Callback,
|
||||
)
|
||||
# pylint: disable-next=import-error,no-name-in-module
|
||||
from tensorflow.python.keras.callbacks import Callback
|
||||
|
||||
from nvitop.api import libnvml
|
||||
from nvitop.callbacks.utils import get_devices_by_logical_ids, get_gpu_stats
|
||||
|
||||
|
||||
# Ported version of .pytorch_lightning.GpuStatsLogger for Keras
|
||||
# Ported version of nvitop.callbacks.lightning.GpuStatsLogger for Keras
|
||||
class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
|
||||
"""Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a callback and
|
||||
in order to use it you need to assign a TensorBoard callback or a CSVLogger callback to the model.
|
||||
|
|
|
|||
|
|
@ -15,7 +15,171 @@
|
|||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
|
||||
# pylint: disable=missing-module-docstring
|
||||
# pylint: disable=missing-module-docstring,missing-function-docstring
|
||||
# pylint: disable=unused-argument,attribute-defined-outside-init
|
||||
|
||||
# pylint: disable-next=unused-import
|
||||
from nvitop.callbacks.pytorch_lightning import GpuStatsLogger # noqa: F401
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
import lightning.pytorch as pl # pylint: disable=import-error
|
||||
from lightning.pytorch.callbacks import Callback # pylint: disable=import-error
|
||||
from lightning.pytorch.utilities import rank_zero_only # pylint: disable=import-error
|
||||
from lightning.pytorch.utilities.exceptions import ( # pylint: disable=import-error
|
||||
MisconfigurationException,
|
||||
)
|
||||
|
||||
from nvitop.api import libnvml
|
||||
from nvitop.callbacks.utils import get_devices_by_logical_ids, get_gpu_stats
|
||||
|
||||
|
||||
# Modified from pytorch_lightning.callbacks.GPUStatsMonitor
|
||||
class GpuStatsLogger(Callback):  # pylint: disable=too-many-instance-attributes
    """Automatically log GPU stats during the training stage.

    :class:`GpuStatsLogger` is a callback; in order to use it you need to assign a logger in the
    ``Trainer``.

    Args:
        memory_utilization (bool):
            Set to :data:`True` to log used, free and the percentage of memory utilization at the
            start and end of each step. Default: :data:`True`.
        gpu_utilization (bool):
            Set to :data:`True` to log the percentage of GPU utilization at the start and end of
            each step. Default: :data:`True`.
        intra_step_time (bool):
            Set to :data:`True` to log the time of each step. Default: :data:`False`.
        inter_step_time (bool):
            Set to :data:`True` to log the time between the end of one step and the start of the
            next step. Default: :data:`False`.
        fan_speed (bool):
            Set to :data:`True` to log percentage of fan speed. Default: :data:`False`.
        temperature (bool):
            Set to :data:`True` to log the gpu temperature in degree Celsius. Default: :data:`False`.

    Raises:
        MisconfigurationException:
            If NVIDIA driver is not installed, not running on GPUs, or ``Trainer`` has no logger.
        ValueError:
            If the devices used by the ``Trainer`` cannot be resolved when training starts.

    Examples:
        >>> from lightning.pytorch import Trainer
        >>> from nvitop.callbacks.lightning import GpuStatsLogger
        >>> gpu_stats = GpuStatsLogger()
        >>> trainer = Trainer(accelerator='gpu', devices=[..], logger=True, callbacks=[gpu_stats])

    GPU stats are mainly based on NVML queries. The description of the queries is as follows:

    - **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan is
      currently intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the
      intended fan speed. If the fan is physically blocked and unable to spin, this output will not
      match the actual fan speed. Many parts do not report fan speeds because they rely on cooling
      via fans in the surrounding enclosure.
    - **memory.used** - Total memory allocated by active contexts, in MiBs.
    - **memory.free** - Total free memory, in MiBs.
    - **utilization.gpu** - Percent of time over the past sample period during which one or more
      kernels was executing on the GPU. The sample period may be between 1 second and 1/6 second
      depending on the product.
    - **utilization.memory** - Percent of time over the past sample period during which global
      (device) memory was being read or written. The sample period may be between 1 second and 1/6
      second depending on the product.
    - **temperature** - Core GPU temperature, in degrees C.
    """

    def __init__(  # pylint: disable=too-many-arguments
        self,
        memory_utilization: bool = True,
        gpu_utilization: bool = True,
        intra_step_time: bool = False,
        inter_step_time: bool = False,
        fan_speed: bool = False,
        temperature: bool = False,
    ) -> None:
        super().__init__()

        # Fail early, at construction time, when NVML cannot be initialized.
        try:
            libnvml.nvmlInit()
        except libnvml.NVMLError as ex:
            raise MisconfigurationException(
                'Cannot use GpuStatsLogger callback because NVIDIA driver is not installed.',
            ) from ex

        self._memory_utilization = memory_utilization
        self._gpu_utilization = gpu_utilization
        self._intra_step_time = intra_step_time
        self._inter_step_time = inter_step_time
        self._fan_speed = fan_speed
        self._temperature = temperature

        # Timestamps of the last batch start / batch end. ``None`` means "no snapshot yet".
        # Defined here so the batch hooks never hit a missing attribute, whatever the hook order.
        self._snap_intra_step_time = None
        self._snap_inter_step_time = None

    def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
        """Validate the trainer setup and resolve the devices to monitor.

        Raises:
            MisconfigurationException:
                If the ``Trainer`` has no logger or its root device is not a CUDA device.
            ValueError:
                If the trainer's device IDs cannot be resolved to devices.
        """
        if not trainer.logger:
            raise MisconfigurationException(
                'Cannot use GpuStatsLogger callback with Trainer that has no logger.',
            )

        root_device_type = trainer.strategy.root_device.type
        if root_device_type != 'cuda':
            raise MisconfigurationException(
                f'You are using GpuStatsLogger but are not running on GPU. '
                f'The root device type is {root_device_type}.',
            )

        device_ids = trainer.device_ids

        try:
            self._devices = get_devices_by_logical_ids(device_ids, unique=True)
        except (libnvml.NVMLError, RuntimeError) as ex:
            raise ValueError(
                f'Cannot use GpuStatsLogger callback because devices unavailable. '
                f'Received: `gpus={device_ids}`',
            ) from ex

    def on_train_epoch_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
        """Reset the step-time snapshots so timings never span an epoch boundary."""
        self._snap_intra_step_time = None
        self._snap_inter_step_time = None

    @rank_zero_only
    def on_train_batch_start(  # pylint: disable=arguments-differ
        self,
        trainer: pl.Trainer,
        pl_module: pl.LightningModule,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """Log GPU stats (and optionally the inter-step time) at the start of a batch.

        The remaining hook arguments (batch, batch index, ...) are not used here. They are
        accepted in both positional and keyword form so the hook works however it is invoked.
        """
        if self._intra_step_time:
            self._snap_intra_step_time = time.monotonic()

        logs = self._get_gpu_stats()

        # Compare against ``None`` rather than relying on truthiness: a timestamp is a float and
        # only the absence of a snapshot should suppress the metric.
        if self._inter_step_time and self._snap_inter_step_time is not None:
            # First log at beginning of second step
            logs['batch_time/inter_step (ms)'] = 1000.0 * (
                time.monotonic() - self._snap_inter_step_time
            )

        trainer.logger.log_metrics(logs, step=trainer.global_step)

    @rank_zero_only
    def on_train_batch_end(  # pylint: disable=arguments-differ
        self,
        trainer: pl.Trainer,
        pl_module: pl.LightningModule,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """Log GPU stats (and optionally the intra-step time) at the end of a batch.

        The remaining hook arguments (outputs, batch, batch index, ...) are not used here. They
        are accepted in both positional and keyword form so the hook works however it is invoked.
        """
        if self._inter_step_time:
            self._snap_inter_step_time = time.monotonic()

        logs = self._get_gpu_stats()

        if self._intra_step_time and self._snap_intra_step_time is not None:
            logs['batch_time/intra_step (ms)'] = 1000.0 * (
                time.monotonic() - self._snap_intra_step_time
            )

        trainer.logger.log_metrics(logs, step=trainer.global_step)

    def _get_gpu_stats(self) -> dict[str, float]:
        """Get the gpu status from NVML queries."""
        return get_gpu_stats(
            devices=self._devices,
            memory_utilization=self._memory_utilization,
            gpu_utilization=self._gpu_utilization,
            fan_speed=self._fan_speed,
            temperature=self._temperature,
        )
|
||||
|
|
|
|||
|
|
@ -21,7 +21,9 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
import pytorch_lightning as pl # pylint: disable=import-error
|
||||
from pytorch_lightning.callbacks import Callback # pylint: disable=import-error
|
||||
from pytorch_lightning.utilities import rank_zero_only # pylint: disable=import-error
|
||||
from pytorch_lightning.utilities.exceptions import ( # pylint: disable=import-error
|
||||
|
|
@ -107,7 +109,7 @@ class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
|
|||
self._fan_speed = fan_speed
|
||||
self._temperature = temperature
|
||||
|
||||
def on_train_start(self, trainer, pl_module) -> None:
|
||||
def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
|
||||
if not trainer.logger:
|
||||
raise MisconfigurationException(
|
||||
'Cannot use GpuStatsLogger callback with Trainer that has no logger.',
|
||||
|
|
@ -132,12 +134,17 @@ class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
|
|||
f'Received: `gpus={device_ids}`',
|
||||
) from ex
|
||||
|
||||
def on_train_epoch_start(self, trainer, pl_module) -> None:
|
||||
def on_train_epoch_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
|
||||
self._snap_intra_step_time = None
|
||||
self._snap_inter_step_time = None
|
||||
|
||||
@rank_zero_only
|
||||
def on_train_batch_start(self, trainer, **kwargs) -> None: # pylint: disable=arguments-differ
|
||||
def on_train_batch_start( # pylint: disable=arguments-differ
|
||||
self,
|
||||
trainer: pl.Trainer,
|
||||
pl_module: pl.LightningModule,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
if self._intra_step_time:
|
||||
self._snap_intra_step_time = time.monotonic()
|
||||
|
||||
|
|
@ -152,7 +159,12 @@ class GpuStatsLogger(Callback): # pylint: disable=too-many-instance-attributes
|
|||
trainer.logger.log_metrics(logs, step=trainer.global_step)
|
||||
|
||||
@rank_zero_only
|
||||
def on_train_batch_end(self, trainer, **kwargs) -> None: # pylint: disable=arguments-differ
|
||||
def on_train_batch_end( # pylint: disable=arguments-differ
|
||||
self,
|
||||
trainer: pl.Trainer,
|
||||
pl_module: pl.LightningModule,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
if self._inter_step_time:
|
||||
self._snap_inter_step_time = time.monotonic()
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue