Source code for cerebras.modelzoo.trainer.trainer

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
The ModelZoo Trainer class is the main entry point for training models in ModelZoo.
It is responsible for setting up the training environment, running the training/validation loop,
and saving the checkpoint.
"""

from __future__ import annotations

import os
import uuid
from collections import Counter, OrderedDict
from contextlib import ExitStack, contextmanager, nullcontext
from copy import copy
from functools import wraps
from logging import Logger as PythonLogger
from pathlib import Path
from typing import Any, Callable, Dict, Generator, List, Optional, Union, final
from warnings import warn
from weakref import finalize

import torch

import cerebras.pytorch as cstorch
from cerebras.modelzoo.trainer.callbacks import (
    GLOBAL_CALLBACK_REGISTRY,
    ArtifactDirCallback,
    BackendCallback,
    Callback,
    Checkpoint,
    CoreCallback,
    DataLoaderCallback,
    GradientAccumulationCallback,
    Logging,
    LoopCallback,
    ModelCallback,
    OptimizerCallback,
    Precision,
    Reproducibility,
    SchedulersCallback,
    SchedulersInput,
    SparsityCallback,
    TrainingLoop,
    ValidationCallback,
    ValidationLoop,
)
from cerebras.modelzoo.trainer.loggers import Logger
from cerebras.modelzoo.trainer.utils import convert_output_to_dict
from cerebras.pytorch.backend import Backend
from cerebras.pytorch.optim import Optimizer
from cerebras.pytorch.sparse import SparsityAlgorithm


[docs]class Trainer:
    """The Trainer class is the main entry point for training models in ModelZoo."""

    @final
    def __init__(
        self,
        device: Optional[str] = None,
        backend: Optional[Backend] = None,
        model_dir: str = ...,
        model: Union[Callable[[], torch.nn.Module], torch.nn.Module] = ...,
        optimizer: Union[
            Optimizer,
            Callable[[torch.nn.Module], Optimizer],
            None,
        ] = None,
        schedulers: SchedulersInput = None,
        precision: Optional[Precision] = None,
        sparsity: Optional[SparsityAlgorithm] = None,
        # Training args
        loop: Optional[LoopCallback] = None,
        checkpoint: Optional[Checkpoint] = None,
        logging: Optional[Logging] = None,
        # Trainer args
        callbacks: Optional[List[Callback]] = None,
        loggers: Optional[List[Logger]] = None,
        seed: Optional[int] = None,
    ):
        """
        Args:
            device: The device to train the model on. It must be one of "CSX",
                "CPU", or "GPU".

            backend: The backend used to train the model. This argument is mutually
                exclusive with `device`.

            model_dir: The directory where the model artifacts are saved.

            model: The model to train. It must be one of the following:

                - If a callable is passed, it is assumed to be a function that
                  takes in no arguments returns a torch.nn.Module.

                - If a torch.nn.Module is passed, it is used as is.

            optimizer: The optimizer used to optimize the model. It must be one of the following:

                - If a :py:class:`~cerebras.pytorch.optim.Optimizer` is passed, it is used as is.

                - If a callable is passed, it is assumed to be a function that
                  takes in a torch.nn.Module and returns a
                  :py:class:`~cerebras.pytorch.optim.Optimizer`.

                - If not passed, then assume that only validation will be run.

            schedulers: The set of optimizer schedulers to be used. Common schedulers include LR
                schedulers. It must be a list of these items:

                - If a cstorch.optim.scheduler.Scheduler is passed, it is used as is.

                - A callable that is assumed to be a function that takes in a
                  :py:class:`~cerebras.pytorch.optim.Optimizer` and returns a
                  cstorch.optim.scheduler.Scheduler.

                - If None, there is no optimizer param group scheduling.

            precision: The Precision callback used during training

            sparsity: The sparsity algorithm used to sparsify weights during training/validation
                It must be one of the following:

                - If a callable is passed, it is assumed to be a function that
                  takes in no arguments returns a
                  :py:class:`~cerebras.pytorch.sparse.SparsityAlgorithm`.

                - If a :py:class:`~cerebras.pytorch.sparse.SparsityAlgorithm` is
                  passed, it is used as is.

            loop: The loop callback to use for training. It must be an instance of LoopCallback.
                If not provided, the default loop is TrainingLoop(num_epochs=1).

            checkpoint: The checkpoint callback to use for saving/loading checkpoints. It must
                be an instance of Checkpoints. If not provided, then no checkpoints are saved.

            logging: The logging callback used to set up python logging. This callback also
                controls when logs are supposed to be logged. If not provided, the default
                logging settings ``Logging(log_steps=1, log_level="INFO")`` are used.

            callbacks: A list of callbacks to used by the trainer. The order in
                which the callbacks are provided is important as it determines
                the order in which the callback's hooks are executed.

            loggers: A list of loggers to use for logging.

            seed: Initial seed for the torch random number generator.

        """
        super().__init__()

        if model_dir is Ellipsis:
            raise ValueError("model_dir is a required argument")

        self.model_dir = Path(model_dir)

        if model is Ellipsis:
            raise ValueError("model is a required argument")
        if not isinstance(model, torch.nn.Module) and isinstance(
            optimizer, cstorch.optim.Optimizer
        ):
            raise ValueError(
                f"Expected optimizer to be a callable that takes in a torch.nn.Module "
                f"and returns a cstorch.optim.Optimizer. Got: {type(optimizer)}"
            )

        # Attributes set by core callbacks
        self.artifact_dir: Path
        self.summary_dir: Path
        self.backend: "cstorch.backend.Backend"
        self.model: torch.nn.Module
        self.compiled_model: Callable
        self.dataloader: cstorch.utils.data.DataLoader
        self.optimizer: Optional[cstorch.optim.Optimizer]
        self.schedulers: Optional[List[cstorch.optim.scheduler.Scheduler]]
        self.executor: Optional[cstorch.utils.data.DataExecutor] = None
        self.global_step: int

        # Other attributes that callbacks may set
        self.activation_steps: Optional[int] = None

        if precision is not None and not isinstance(precision, Precision):
            raise TypeError(
                f"Expected precision to be an instance of Precision. "
                f"Got: {type(precision)}"
            )
        if loop is None:
            loop = TrainingLoop(num_epochs=1)
        elif not isinstance(loop, LoopCallback):
            raise TypeError(
                f"Expected loop to be an instance of LoopCallback."
                f"Got: {type(loop)}"
            )
        if checkpoint is None:
            checkpoint = Checkpoint()
        elif not isinstance(checkpoint, Checkpoint):
            raise TypeError(
                f"Expected checkpoint to be an instance of Checkpoint. "
                f"Got: {type(checkpoint)}"
            )

        if logging is None:
            logging = Logging(log_steps=1, log_level="INFO")
        elif not isinstance(logging, Logging):
            raise TypeError(
                f"Expected logging to be an instance of Logging. "
                f"Got: {type(logging)}"
            )

        # Order of the core callbacks is important and should not be changed
        self.callbacks = OrderedDict(
            {
                "artifact_dir": ArtifactDirCallback(),
                "reproducibility": Reproducibility(seed),
                "backend": BackendCallback(backend, device),
                "model": ModelCallback(model),
                "dataloader": DataLoaderCallback(),
                "optimizer": OptimizerCallback(optimizer),
                "schedulers": SchedulersCallback(schedulers),
                "precision": precision,
                "sparsity": SparsityCallback(sparsity),
                "loop": loop,
                "logging": logging,
                "grad_accum": GradientAccumulationCallback(),
                "checkpoint": checkpoint,
            }
        )
        # This is a sanity check, all callbacks in the above list should be core
        # callbacks and not user-defined callbacks
        for callback in self.callbacks.values():
            if callback is not None and not isinstance(callback, CoreCallback):
                raise TypeError(
                    f"Found non-core callback in core callback list: {type(callback)}"
                )

        user_callbacks = callbacks or []

        for callback in user_callbacks:
            if isinstance(callback, Logger):
                warn(
                    f"Passed logger {type(callback)} as a `callback`. "
                    f"It will not be used by trainer.log_metrics(). "
                    f"To use for logging metrics, pass it as a `logger` "
                    f"instead."
                )

        self.loggers = loggers or []

        for logger in self.loggers:
            if not isinstance(logger, Logger):
                raise TypeError(
                    f"logger must be an instance of Logger. Got: {type(logger)}"
                )

        counter = Counter(self.callbacks.keys())

        def get_name(callback):
            name = type(callback).__name__
            counter[name] += 1
            if counter[name] > 1:
                return f"{name}_{counter[name]}"
            return name

        for callback in self.loggers + user_callbacks:
            if isinstance(callback, CoreCallback):
                raise ValueError(
                    f"Callback {type(callback).__name__} is a core callback "
                    f"and cannot be overridden"
                )

            self.callbacks[get_name(callback)] = callback

        # Checkpoint should be the last callback as saving a checkpoint is slow
        self.callbacks.move_to_end("checkpoint")

        # Whitelist of non-standard hooks that callbacks can implement
        self.non_standard_hooks_whitelist = set()

        # ID map for validation dataloaders
        self._name_scope_stack = []
        self._val_dataloader_id_map = {}
        self._val_dataloader_id = 0

        self.call("pre_setup")
        self.call("setup")

        def callback_finalize(callbacks):
            for callback in callbacks:
                if callback is not None:
                    callback.finalize()

        self._finalizer = finalize(
            self, callback_finalize, self.callbacks.values()
        )

    @property
    def all_callbacks(self) -> Generator[Callback, None, None]:
        """Get all callback objects available to the trainer."""
        yield from self.callbacks.values()
        yield from GLOBAL_CALLBACK_REGISTRY.values()

[docs]    def get_callbacks(
        self, callback_type: type
    ) -> Generator[Callback, None, None]:
        """Get all callbacks of the given type."""
        for callback in self.all_callbacks:
            if isinstance(callback, callback_type):
                yield callback

[docs]    def get_callback(self, callback_type: type) -> Optional[Callback]:
        """Get the first callback of the given type."""
        return next(self.get_callbacks(callback_type), None)

    @property
    def validation_callbacks(self) -> List[ValidationCallback]:
        """Returns all validation callbacks in the Trainer's callback list."""
        return list(self.get_callbacks(ValidationCallback))

[docs]    def call(self, hook_name: str, *args, **kwargs):
        """
        Call the hook with name hook_name for all callbacks
        in the Trainer's callback list as well as the callbacks
        in the global registry.

        The callback's method is passed in the trainer object itself
        as well as any args and kwargs that are passed into this method. e.g.

        .. code: python

            getattr(callback, hook_name)(self, *args, **kwargs)

        Args:
            hook_name: The name of the hook to call. It must be the name of
                a method in the Callback class.
            args: Other positional arguments to forward
                along to the called hook.
            kwargs: Other keyword arguments to forward
                along to the called hook.
        """
        seen = set()

        for callback in self.all_callbacks:
            if callback is None:
                continue

            if callback in seen:
                warn(f"Duplicate callback found in the list: {callback}")
            else:
                seen.add(callback)

            hook = getattr(callback, hook_name, None)
            if hook:
                try:
                    hook(self, *args, **kwargs)
                except Exception as e:
                    # TODO(SW-128935): Add this to the exception notes once
                    #                  we support python 3.11
                    raise RuntimeError(
                        f"Encountered error when calling "
                        f"{type(callback).__name__}.{hook_name}"
                    ) from e
            elif hook_name not in self.non_standard_hooks_whitelist:
                raise AttributeError(
                    f"Callback {type(callback)} does not implement {hook_name}"
                )

    @property
    def precision(self) -> Optional[Precision]:
        """Returns the precision callback instance if it exists."""
        return self.callbacks["precision"]

    @property
    def grad_accum(self) -> GradientAccumulationCallback:
        """Returns the gradient accumulation callback instance."""
        return self.callbacks["grad_accum"]

    @property
    def should_run_optimizer_step(self) -> bool:
        """Returns True if we should run the optimizer step.

        The gradient accumulation callback may set this to False if we are
        accumulating gradients and have not reached the accumulation steps.
        Note, this only applies to CPU/GPU runs.
        """
        return self.grad_accum.should_run_optimizer_step

    @property
    def loop(self) -> LoopCallback:
        """Returns the default loop settings."""
        return self.callbacks["loop"]

    @property
    def checkpoint(self) -> Checkpoint:
        """Returns the checkpoint callback."""
        return self.callbacks["checkpoint"]

    @property
    def logging(self) -> Checkpoint:
        """Returns the logging callback."""
        return self.callbacks["logging"]

    @property
    def logger(self) -> PythonLogger:
        """Returns the Trainer's Python logger object."""
        return self.logging.logger

    @property
    def is_log_step(self) -> bool:
        """Returns True if the current step is a log step."""
        return self.logging.is_log_step(self)

    @property
    def is_first_iteration(self) -> bool:
        """Returns True if the executor is on its first iteration."""
        return self.executor and self.executor.iteration == 0

    @property
    def is_final_iteration(self) -> bool:
        """Returns True if the executor is on its final iteration."""
        return self.executor and self.executor.on_final_iteration

    @final
    @property
    def is_tracing(self) -> bool:
        """Returns True if we are currently tracing the model."""
        return self.backend.is_tracing

[docs]    @final
    @cstorch.step_closure
    def log_metrics_in_step_closure(self, **kwargs):
        """Log the given kwargs inside a step closure."""
        # This is a sanity check and should never assert
        assert not self.is_tracing
        self.log_metrics(**kwargs)

[docs]    @final
    def log_metrics(self, **kwargs):
        """
        Log the given kwargs to all loggers.

        Example usage:

        .. code:: python

            trainer.log_metrics(loss=loss.item())

        Args:
            kwargs: The key-value pairs to log.
        """
        if self.is_tracing:
            # If we are tracing, log the kwargs inside a step closure
            self.log_metrics_in_step_closure(**kwargs)
        else:
            # If we're not inside a logging step and an executor is not
            # active, don't send to loggers. The executor check is to
            # handle the case where a user wants to log something after
            # execution.
            if self.executor is not None and not self.is_log_step:
                return

            if not self.loggers:
                warn(
                    "No loggers are attached to the trainer. "
                    "Call to trainer.log_metrics() will be a no-op."
                )

            from cerebras.modelzoo.trainer.loggers import TensorBoardLogger

            # Don't add prefixes to metric names if the tensorboard logger
            # is configured to output logs to legacy event directories
            if any(
                logger.legacy_event_dirs
                for logger in self.get_callbacks(TensorBoardLogger)
            ):
                prefix = ""
            else:
                prefix = self.name_scope_path

            # Otherwise, log the kwargs directly
            for logger in self.loggers:
                logger.log_metrics(
                    {f"{prefix}{k}": v for k, v in kwargs.items()},
                    step=self.global_step,
                )

[docs]    @final
    @contextmanager
    def name_scope(self, name: str):
        """Append name to the trainer's name scope stack whilst inside the
        context.

        Args:
            name: The name to append to the name scope stack.
        """
        try:
            self._name_scope_stack.append(name)
            yield name
        finally:
            self._name_scope_stack.pop()

    @final
    @property
    def name_scope_path(self) -> str:
        """Returns the current name scope path.

        This is the the name scope stack joined by '/'.
        """
        return os.path.join(*self._name_scope_stack, "")

[docs]    @final
    def get_val_dataloader_scope(self, val_dataloader):
        """Get the name scope for the given val dataloader."""
        if val_dataloader.id not in self._val_dataloader_id_map:
            self._val_dataloader_id_map[val_dataloader.id] = (
                f"validate_{self._val_dataloader_id}"
            )
            self._val_dataloader_id += 1

        return self._val_dataloader_id_map[val_dataloader.id]

[docs]    @final
    @cstorch.trace
    def training_step(self, batch) -> Dict[str, Any]:
        """Run a single training step on the given batch.

        Note that if retrace is off, content of this method will only run on
        the first iteration. So any inputs to this method must either be
        non-changing or torch tensors.

        Args:
            batch: The batch of data to train on.
            batch_idx: The index of the batch in the dataloader.

        Returns:
            A dictionary containing the loss and any other outputs.
        """

        outputs = self.forward(batch)
        self.backward(outputs)

        # Only run the optimizer step if an optimizer was defined
        # and we should run the optimizer step
        if self.optimizer and self.should_run_optimizer_step:
            self.optimizer_step()
            self.optimizer_zero_grad()
            self.schedulers_step()

        return outputs

[docs]    @final
    def forward(self, batch) -> Dict[str, Any]:
        """
        Run the forward pass on the given batch.

        Args:
            batch: The batch of data to run the forward pass on.

        Returns:
            A dictionary containing the loss and any other outputs.
        """
        if self.precision:
            ctx = self.precision.autocast_context_manager()
        else:
            ctx = nullcontext()

        with ctx:
            args = [batch]
            kwargs = {}

            self.call("on_before_forward", self.model, batch, args, kwargs)

            output = self.compiled_model(*args, **kwargs)
            outputs = convert_output_to_dict(output)

            self.call("on_after_forward", self.model, outputs, batch)

            return outputs

[docs]    @final
    def backward(self, outputs: dict):
        """
        Run the backward pass on the given loss.

        Args:
            outputs: The outputs of the model. Expect key 'loss' to be present.
        """
        self.call("on_before_backward", self.model, outputs)

        loss = outputs["loss"]

        if self.precision:
            self.precision.backward(loss)
        else:
            loss.backward()

        self.call("on_after_backward", self.model, outputs)

[docs]    @final
    def optimizer_step(self):
        """Run the optimizer step."""
        self.call("on_before_optimizer_step", self.model, self.optimizer)

        if self.precision:
            self.precision.clip_gradients(self.optimizer)
            self.precision.optimizer_step(self.optimizer)
        else:
            self.optimizer.step()

        self.call("on_after_optimizer_step", self.model, self.optimizer)

[docs]    @final
    def optimizer_zero_grad(self):
        """Zero the gradients of the optimizer."""
        self.call("on_before_optimizer_zero_grad", self.model, self.optimizer)

        self.optimizer.zero_grad()

        self.call("on_after_optimizer_zero_grad", self.model, self.optimizer)

[docs]    @final
    def schedulers_step(self):
        """Step all the schedulers."""
        if not self.schedulers:
            return

        for scheduler in self.schedulers:
            self.call(
                "on_before_scheduler_step",
                self.model,
                self.optimizer,
                scheduler,
            )
            scheduler.step()
            self.call(
                "on_after_scheduler_step",
                self.model,
                self.optimizer,
                scheduler,
            )

[docs]    @contextmanager
    def on_exception(self, hook):
        """Context manager to handle exceptions in the given hook.

        Args:
            hook: The hook to handle exceptions for.
        """
        try:
            yield
        except Exception as e:
            try:
                self.call(f"on_{hook}_exception", e)
            except Exception as e2:
                raise e2 from e
            raise

[docs]    @final
    def fit(
        self,
        train_dataloader: cstorch.utils.data.DataLoader,
        val_dataloader: Union[
            cstorch.utils.data.DataLoader,
            List[cstorch.utils.data.DataLoader],
            None,
        ] = None,
        ckpt_path: Optional[str] = ...,
    ):
        """Complete a full training run on the given train and validation dataloaders.

        Args:
            train_dataloader: The training dataloader.

            val_dataloader: The validation dataloader.

                If provided, validation is run every `eval_frequency` steps as defined
                in the loop callback.

                If not provided, only training is run.

                If a list of dataloaders is provided, then each dataloader is
                validated in sequence.

            ckpt_path: The path to the checkpoint to load before starting training.
                If not provided and `autoload_last_checkpoint` is True,
                then the latest checkpoint is loaded
        """
        loop = self.loop
        if not isinstance(loop, TrainingLoop):
            raise TypeError(
                f"Expected loop to be an instance of TrainingLoop. "
                f"Got: {type(loop)}"
            )

        with ExitStack() as fit_stack:
            self.call(
                "on_enter_fit",
                fit_stack,
                train_dataloader,
                val_dataloader,
                loop,
            )

            self.load_checkpoint(ckpt_path)

            self.call("on_fit_start", train_dataloader, val_dataloader, loop)

            for loop_idx in range(loop.num_trains):
                self._run_train(train_dataloader, loop, loop_idx)

                if loop.eval_frequency is not None and loop.eval_frequency != 0:
                    # Run upstream and downstream validation after each training iteration
                    self._validate_all(
                        val_dataloader,
                        ckpt_paths=None,
                        loop=None,
                        # pylint: disable=cell-var-from-loop
                        run_validation=lambda: self.call(
                            "run_validation",
                            loop_idx=loop_idx,
                            is_last=loop_idx == loop.num_trains - 1,
                        ),
                    )

            self.call("on_fit_end", loop)

    @final
    def _run_train(self, train_dataloader, loop, loop_idx=0):
        if not isinstance(loop, TrainingLoop):
            raise TypeError(
                f"Expected loop to be an instance of TrainingLoop. "
                f"Got: {type(loop)}"
            )

        with ExitStack() as stack:
            self.call("on_enter_train", stack, train_dataloader, loop, loop_idx)

            self.call(
                "on_train_start", self.model, train_dataloader, loop, loop_idx
            )

            self.executor = cstorch.utils.data.DataExecutor(
                train_dataloader,
                num_steps=loop.train_steps,
                checkpoint_steps=loop.checkpoint_steps,
                activation_steps=self.activation_steps,
                profiler_activities=[],  # Don't use data executor's profiler
            )

            for batch_idx, batch in enumerate(self.executor):
                self.call("on_train_batch_start", self.model, batch, batch_idx)

                outputs = self.training_step(batch)

                self.call(
                    "on_train_batch_end", self.model, outputs, batch, batch_idx
                )

            self.call("on_train_end", self.model, loop, loop_idx)

        self.executor = None

    @final
    @cstorch.trace
    @torch.no_grad()
    def validation_step(self, batch: Any) -> Dict[str, Any]:
        """Run a single validation step on the given batch and batch index.

        Note that if retrace is off, content of this method will only run on
        the first iteration. So any inputs to this method must either be
        constant or torch tensors.

        Args:
            batch: The batch of data to validate on.

        Returns:
            A dictionary containing the loss and any other outputs.
        """
        return self.forward(batch)

[docs]    @final
    def validate(
        self,
        val_dataloader: Optional[cstorch.utils.data.DataLoader] = None,
        ckpt_path: Optional[str] = ...,
        loop: Optional[ValidationLoop] = None,
    ):
        """Complete a full validation run on the validation dataloader.

        Args:
            val_dataloader: The validation dataloader.
                If a list of dataloaders is provided, then each dataloader is
                    validated in sequence.

            ckpt_path: The path to the checkpoint to load before starting validation.
                If not provided and `autoload_last_checkpoint` is True,
                then the latest checkpoint is loaded.

            loop: The loop callback to use for validation. If not provided, the default
                loop is used. If provided, it must be an instance of ValidationLoop.
                Note, this should only be provided if the loop callback provided in
                the constructor is not sufficient.
        """
        if not isinstance(val_dataloader, cstorch.utils.data.DataLoader):
            raise TypeError(
                f"val_dataloader must be a cstorch.utils.data.DataLoader. "
                f"Got {type(val_dataloader)}"
            )

        if not loop:
            loop = self.loop

            if isinstance(loop, TrainingLoop):
                loop = loop.val_loop

        if not isinstance(loop, ValidationLoop):
            raise TypeError(
                f"Expected loop to be an instance of ValidationLoop. "
                f"Got: {type(loop)}"
            )

        with ExitStack() as stack:
            stack.enter_context(
                self.name_scope(self.get_val_dataloader_scope(val_dataloader))
            )

            if loop not in self.all_callbacks:
                stack.enter_context(loop)

            self.call("on_enter_validate", stack, val_dataloader, loop)

            self.load_checkpoint(ckpt_path)

            self.call(loop.on_start_hook, self.model, val_dataloader, loop)

            # For every validation run we want to iterate the dataloader from
            # the scratch, so we make a shallow copy of the validation dataloader,
            # so the streamer will treat it as new dataloader.
            self.executor = cstorch.utils.data.DataExecutor(
                copy(val_dataloader),
                num_steps=loop.eval_steps,
                profiler_activities=[],  # Don't use data executor's profiler
            )

            for batch_idx, batch in enumerate(self.executor):
                self.call(
                    loop.on_batch_start_hook, self.model, batch, batch_idx
                )

                outputs = self.validation_step(batch)

                self.call(
                    loop.on_batch_end_hook,
                    self.model,
                    outputs,
                    batch,
                    batch_idx,
                )

            self.call(loop.on_end_hook, self.model, loop)

        self.executor = None

[docs]    @final
    def validate_all(
        self,
        val_dataloaders: Union[
            List[cstorch.utils.data.DataLoader],
            cstorch.utils.data.DataLoader,
            None,
        ] = None,
        ckpt_paths: Union[List[str], str, None] = ...,
        loop: Optional[ValidationLoop] = None,
    ):
        """
        Runs all upstream and downstream validation permutations.

        .. code:: python

            for ckpt_path in ckpt_paths:
                 for val_dataloader in val_dataloaders:
                     trainer.validate(val_dataloader, ckpt_path)

                 # run downstream validation
                 run_validation(...)

        Args:
            val_dataloaders: A list of validation dataloaders to run validation on.
            ckpt_paths: A list of checkpoint paths to run validation on. Each checkpoint
                path must be a path to a checkpoint file, or a glob pattern.
            loop: The validation loop to use for validation. If not provided, then the
                default loop is used.
        """
        self._validate_all(
            val_dataloaders,
            ckpt_paths,
            loop,
            run_validation=lambda: self.call(
                "run_validation", loop_idx=None, is_last=True
            ),
        )

    @final
    @wraps(validate_all)
    def _validate_all(self, val_dataloaders, ckpt_paths, loop, run_validation):
        if not callable(run_validation) and run_validation is not None:
            raise RuntimeError(
                f"Expected run_validation to be a callable or None. "
                f"Got: {run_validation}"
            )

        if val_dataloaders is None:
            val_dataloaders = []
        elif not isinstance(val_dataloaders, (list, tuple)):
            val_dataloaders = [val_dataloaders]

        for i, val_dataloader in enumerate(val_dataloaders):
            if not isinstance(val_dataloader, cstorch.utils.data.DataLoader):
                raise TypeError(
                    f"Expected val_dataloader to be an instance of DataLoader. "
                    f"Got {type(val_dataloader)} for element {i}"
                )

        if ckpt_paths is Ellipsis or ckpt_paths is None:
            ckpt_paths = [ckpt_paths]
        else:
            if not isinstance(ckpt_paths, (list, tuple)):
                ckpt_paths = [ckpt_paths]

            for ckpt_path in ckpt_paths:
                if not isinstance(ckpt_path, (str, Path)):
                    raise ValueError(
                        f"Expected ckpt_path to be a path to a checkpoint file, "
                        f"or a glob pattern. Got: {ckpt_path}"
                    )

                ckpt_path = Path(ckpt_path)
                if not any(ckpt_path.parent.glob(ckpt_path.name)):
                    raise FileNotFoundError(
                        f"Checkpoint file(s) not found at: {ckpt_path}"
                    )

            # Flatten all ckpt paths into a list
            ckpt_paths = (
                checkpoint_file
                for ckpt_path in map(Path, ckpt_paths)
                for checkpoint_file in ckpt_path.parent.glob(ckpt_path.name)
            )

        with ExitStack() as stack:
            self.call("on_enter_validate_all", stack, val_dataloaders, loop)

            for ckpt_path in ckpt_paths:
                # Load the checkpoint
                self.load_checkpoint(ckpt_path)

                # Run upstream validation
                for val_dataloader in val_dataloaders:
                    self.validate(val_dataloader, ckpt_path=None, loop=loop)

                # Run downstream validation
                if run_validation is not None:
                    run_validation()

[docs]    @final
    @cstorch.checkpoint_closure
    def save_checkpoint(self):
        """Save a checkpoint at the current global step.

        The checkpoint state dict is constructed by various callbacks
        that implement the `on_save_checkpoint` method.
        """
        state_dict = {}

        self.call("on_save_checkpoint", state_dict)
        self.call("postprocess_checkpoint", state_dict)

        ckpt_path = self.checkpoint.get_checkpoint_path(
            self.model_dir, self.global_step
        )

        # atomic checkpoint save by first writing to a temp file, then renaming
        tmp_ckpt_path = Path(f"{ckpt_path}.{str(uuid.uuid4())[:8]}.tmp")
        cstorch.save(state_dict, tmp_ckpt_path)
        tmp_ckpt_path.rename(ckpt_path)

        self.call("on_after_save_checkpoint", ckpt_path)

[docs]    @final
    def load_checkpoint(self, ckpt_path: Optional[str] = None):
        """Load a checkpoint from the given path.

        The checkpoint state dict is loaded and processed by various callbacks
        that implement the `on_load_checkpoint` method.

        Args:
            ckpt_path: The path to the checkpoint to load
                If not provided and `autoload_last_checkpoint` is True,
                then the latest checkpoint is loaded
        """
        # Don't load a checkpoint if compile/validate only
        if not self.backend.is_e2e_execution:
            return
        if ckpt_path is Ellipsis and self.checkpoint.autoload_last_checkpoint:
            ckpt_path = self.checkpoint.get_latest_checkpoint(self)
        if not ckpt_path or ckpt_path is Ellipsis:
            self.call("on_before_load_checkpoint", None)
            return

        self.call("on_before_load_checkpoint", ckpt_path)

        state_dict = cstorch.load(ckpt_path)

        self.call("preprocess_checkpoint", state_dict)
        self.call("on_load_checkpoint", state_dict)