# Source code for cerebras.modelzoo.trainer.callbacks.loss

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This module contains the CheckLoss callback."""

from math import prod

import torch

import cerebras.pytorch as cstorch
from cerebras.modelzoo.common.half_dtype import cb16_to_fp32
from cerebras.modelzoo.trainer.callbacks import Callback


class CheckLoss(Callback):
    """Callback class that checks for NaN or inf loss values.

    It also checks whether the model output contains a scalar loss value.

    All hooks are no-ops when ``outputs`` has no ``"loss"`` entry.
    """

    def on_after_forward(self, trainer, model, outputs, batch):
        """Validate the type and shape of ``outputs["loss"]``, if present.

        Args:
            trainer: Not used by this hook.
            model: Not used by this hook.
            outputs: Container of model outputs; only the ``"loss"`` entry
                is inspected.
            batch: Not used by this hook.

        Raises:
            TypeError: If the loss is not a ``torch.Tensor`` or does not
                hold exactly one element.
        """
        if "loss" not in outputs:
            return

        loss = outputs["loss"]

        if not isinstance(loss, torch.Tensor):
            raise TypeError(
                "Expected loss to be a scalar torch.Tensor, "
                f"but got {type(loss)} instead."
            )

        # math.prod of an empty shape is 1, so 0-dim tensors pass, as do
        # single-element tensors such as shape (1,). Comparing against 1
        # (rather than "> 1") also rejects empty tensors, whose element
        # count is 0 and which are not scalars either.
        if prod(loss.shape) != 1:
            raise TypeError(
                "Expected loss to be a scalar torch.Tensor, "
                f"but got tensor with shape {loss.shape} instead."
            )

    # NOTE(review): wrapped in cstorch.step_closure, so the body presumably
    # runs once the step's tensor values are available -- confirm against
    # the cerebras.pytorch documentation.
    @cstorch.step_closure
    def check_loss(self, loss: torch.Tensor):  # pylint: disable=no-self-use
        """Checks for NaN or inf loss values.

        Args:
            loss: Scalar loss tensor.

        Raises:
            ApplianceNanError: If any element of ``loss`` is NaN or
                infinite.
        """
        msg_postfix = (
            "This could potentially be due to selected hyperparameters "
            "such as the learning rate, batch size, etc. or it could due "
            "an internal error. Please try with different set of "
            "hyperparameters and contact Cerebras Support if the issue "
            "persists."
        )

        # Imported at call time rather than at module import time, as in
        # the original implementation.
        from cerebras.appliance.errors import ApplianceNanError

        if torch.isnan(loss).any().item():
            raise ApplianceNanError(f"NaN loss detected. {msg_postfix}")
        if torch.isinf(loss).any().item():
            raise ApplianceNanError(f"inf loss detected. {msg_postfix}")

    def _check_outputs_loss(self, outputs):
        """Run ``check_loss`` on ``outputs["loss"]`` when that entry exists."""
        if "loss" in outputs:
            # Convert through cb16_to_fp32 before the NaN/inf checks.
            self.check_loss(cb16_to_fp32(outputs["loss"]))

    def on_train_batch_end(self, trainer, model, outputs, batch, batch_idx):
        """Check the training loss for NaN/inf at the end of a batch.

        Only ``outputs`` is used; the remaining arguments are ignored.
        """
        self._check_outputs_loss(outputs)

    def on_validate_batch_end(self, trainer, model, outputs, batch, batch_idx):
        """Check the validation loss for NaN/inf at the end of a batch.

        Only ``outputs`` is used; the remaining arguments are ignored.
        """
        self._check_outputs_loss(outputs)