# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from typing import Tuple

from cerebras.modelzoo.tools.checkpoint_converters.base_converter import (
    BaseConfigConverter,
    BaseDictionaryConverter,
    ConversionRule,
    FormatVersions,
)


class ConfigConverter_sP_muP(BaseConfigConverter):
    """Transforms a CS muP config to a CS sP config."""

    def __init__(self):
        super().__init__()
        self.rules = [
            ConversionRule(["output_logits_scale"]),
            ConversionRule(["embeddings_scale"]),
            ConversionRule(["scale_qk_dot_by_d"]),
            ConversionRule(
                ["share_embedding_weights"],
                action=self.set_share_embedding_weights,
            ),
            ConversionRule(
                [r".*"], action=self.replaceKey
            ),  # Catch-all for everything else
        ]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        return ("sP", "muP")
    @staticmethod
    def file_formats() -> Tuple[str, str]:
        return ()

    @staticmethod
    def is_mup(config):
        return _is_mup(config)

    def set_share_embedding_weights(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        # When converting from muP (from_index == 1), the checkpoint converter
        # folds output_logits_scale into lm_head and embeddings_scale into the
        # embeddings, so the two weights generally diverge and can no longer
        # be shared; force weight sharing off in that case.
        if from_index == 1 and (
            "output_logits_scale" in old_state_dict
            or "embeddings_scale" in old_state_dict
        ):
            new_state_dict[new_key] = False
        else:
            new_state_dict[new_key] = old_state_dict[old_key]


class Converter_sP_muP(BaseDictionaryConverter):
    """Transforms a CS muP checkpoints into a CS sP checkpoint.
    muP: Maximal Update Parametrization.
    sP: Standard Parametrization.
    """

    def __init__(self):
        super().__init__()
        self.rules = [
            ConversionRule(
                [r".+\.proj_k_dense_layer.*"],
                action=self.scale_k_projection,
            ),
            ConversionRule(
                [r"(?:model\.|)lm_head\.weight"],
                action=self.scale_lm_head,
            ),
            ConversionRule(
                [r"(?:model\.|)embedding_layer\.word_embeddings\.weight"],
                action=self.scale_embeddings,
            ),
            ConversionRule(
                [
                    r"(?:model\.|)embedding_layer\.position_embeddings(?:\.embed)?\.weight"
                ],
                action=self.scale_embeddings,
            ),
            ConversionRule(
                [r"(?:model\.|)embedding_ln_f\.(?:weight|bias)"],
                action=self.scale_embedding_layernorm,
            ),
            ConversionRule(
                [r".*"], action=self.replaceKey
            ),  # Catch-all for everything else
        ]
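
    # How these rules map a muP checkpoint onto sP (summary; the per-flag
    # semantics noted here are the usual muP conventions implied by the
    # scaling methods that follow):
    #   * attention: with ``scale_qk_dot_by_d``, muP scales q @ k.T by
    #     1/d_head instead of sP's 1/sqrt(d_head). Dividing the K projection
    #     by sqrt(d_head) lets the sP model reproduce the muP scores, since
    #         (q @ (k / sqrt(d_head)).T) / sqrt(d_head) == (q @ k.T) / d_head
    #   * embeddings: ``embeddings_scale`` is folded into the word/position
    #     embedding weights, or into the embedding layer norm parameters when
    #     an embedding layer norm is enabled.
    #   * logits: ``output_logits_scale`` is folded into the lm_head weight.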
    def scale_k_projection(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        config = action_fn_args["configs"][1]
        if config["model"].get('scale_qk_dot_by_d', False):
            d_model = config["model"]["hidden_size"]
            n_heads = config["model"]["num_heads"]
            d_sqrt = math.sqrt(d_model // n_heads)
            new_state_dict[new_key] = old_state_dict[old_key] / d_sqrt
        else:
            new_state_dict[new_key] = old_state_dict[old_key]

    def scale_lm_head(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        config = action_fn_args["configs"][1]
        if "output_logits_scale" in config["model"]:
            output_scale = config["model"]["output_logits_scale"]
            new_state_dict[new_key] = old_state_dict[old_key] * output_scale
        else:
            new_state_dict[new_key] = old_state_dict[old_key]

    def scale_embeddings(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        config = action_fn_args["configs"][1]
        # Fold embeddings_scale into word/position embeddings if embedding
        # layer norm *is not* enabled
        if "embeddings_scale" in config["model"] and not config["model"].get(
            "embedding_layer_norm", False
        ):
            emb_scale = config["model"]["embeddings_scale"]
            new_state_dict[new_key] = old_state_dict[old_key] * emb_scale
        else:
            new_state_dict[new_key] = old_state_dict[old_key]

    def scale_embedding_layernorm(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        config = action_fn_args["configs"][1]
        # Fold embeddings_scale into embedding layer norm if embedding
        # layer norm *is* enabled
        if "embeddings_scale" in config["model"] and config["model"].get(
            "embedding_layer_norm", False
        ):
            emb_scale = config["model"]["embeddings_scale"]
            new_state_dict[new_key] = old_state_dict[old_key] * emb_scale
        else:
            new_state_dict[new_key] = old_state_dict[old_key]

    @staticmethod
    def is_mup(config):
        return _is_mup(config.get('model', {}))

    @staticmethod
    def formats():
        return ("sP", "muP") 
def _is_mup(model_config):
    scale_qk_dot_by_d = model_config.get('scale_qk_dot_by_d', False)
    embeddings_scale = model_config.get('embeddings_scale', None)
    output_logits_scale = model_config.get('output_logits_scale', None)
    all_set = scale_qk_dot_by_d and embeddings_scale and output_logits_scale
    any_set = scale_qk_dot_by_d or embeddings_scale or output_logits_scale
    if any_set and not all_set:
        raise ValueError(
            "This looks like an incomplete muP config. Either all of or none of "
            "\"scale_qk_dot_by_d\", \"embeddings_scale\", \"output_logits_scale\" can be "
            "specified, but this config only has some that are specified."
        )
    return all_set
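

# Minimal illustrative check of ``_is_mup`` (the config values here are
# hypothetical and only for demonstration; they are not part of the
# converter itself):
if __name__ == "__main__":
    _example_mup_model_config = {
        "hidden_size": 768,
        "num_heads": 12,
        "scale_qk_dot_by_d": True,
        "embeddings_scale": 10.0,
        "output_logits_scale": 0.23,
    }
    # All three muP-specific keys are set, so the config is treated as muP.
    assert _is_mup(_example_mup_model_config)
    # A config without any of the muP keys is treated as plain sP.
    assert not _is_mup({"hidden_size": 768, "num_heads": 12})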