# Source code for cerebras.modelzoo.tools.checkpoint_converters.mixtral

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import re
from typing import Tuple

import torch

from cerebras.modelzoo.tools.checkpoint_converters.base_converter import (
    BaseCheckpointConverter_HF_CS,
    BaseConfigConverter,
    ConversionRule,
    EquivalentSubkey,
    FormatVersions,
)
from cerebras.modelzoo.tools.checkpoint_converters.helper import (
    Build_HF_CS_Converter_WithOptionalModel,
)
from cerebras.modelzoo.tools.checkpoint_converters.llama import (
    ConfigConverter_LLaMa_HF_CS21,
    Converter_LlamaAttention_HF_CS,
)

# Marker appended to the CS-side "ffn.experts" subkey by the
# `assert_already_converted` rules in `moe_optimized_impl_rules`.
# `assert_already_converted` strips it (together with the expert index that
# follows it) before looking the fused key up in the converted state dict.
MAGIC_STR = "__"


class Converter_MixtralModel_HF_CS(BaseCheckpointConverter_HF_CS):
    def __init__(self):
        """Register the key-conversion rules for the Mixtral backbone.

        Each rule pairs an HF-side key pattern with its CS-side counterpart.
        The relative order of the rules is kept exactly as before.
        NOTE(review): presumably rules are tried in list order -- confirm
        against the base converter.
        """
        super().__init__()

        weight_or_bias = r"\.(?:weight|bias)"
        layer_index = r"\.\d+\."

        def decoder_layers():
            # Every rule gets its own subkey object, as in the literal list.
            return EquivalentSubkey("layers", "transformer_decoder.layers")

        def rename_rule(*segments):
            # A rule whose action is a plain key rename.
            return ConversionRule(list(segments), action=self.replaceKey)

        word_embeddings = rename_rule(
            EquivalentSubkey(
                "embed_tokens", "embedding_layer.word_embeddings"
            ),
            weight_or_bias,
        )
        final_layer_norm = ConversionRule(
            [
                EquivalentSubkey("norm", "transformer_decoder.norm"),
                weight_or_bias,
            ],
            action=self.replace_final_norm,
        )
        # Attention keys are delegated to the LLaMa attention converter.
        attention = ConversionRule(
            [
                decoder_layers(),
                r"\.\d+\.self_attn\.",
                Converter_LlamaAttention_HF_CS(),
            ],
            action=None,
        )
        # Rotary embedding buffer; no action is attached.
        # NOTE(review): exists="left" presumably marks an HF-only key.
        rotary_inv_freq = ConversionRule(
            [r"layers\.\d+\.self_attn\.rotary_emb\.inv_freq"],
            exists="left",
            action=None,
        )
        attention_norm = rename_rule(
            decoder_layers(),
            layer_index,
            EquivalentSubkey("input_layernorm", "norm1"),
            weight_or_bias,
        )
        post_attention_norm = rename_rule(
            decoder_layers(),
            layer_index,
            EquivalentSubkey("post_attention_layernorm", "norm3"),
            weight_or_bias,
        )
        # MoE router (gate) of the feed-forward block.
        moe_gate = rename_rule(
            decoder_layers(),
            layer_index,
            EquivalentSubkey("block_sparse_moe.gate", "ffn.gate"),
            weight_or_bias,
        )
        expert_rules = self.moe_rules()
        # NOTE(review): exists="right" presumably marks CS-only keys.
        lm_head_only = ConversionRule(
            [r"lm_head\.(?:weight|bias)"], exists="right"
        )
        ln_f_only = ConversionRule([r"ln_f\.(?:weight|bias)"], exists="right")

        self.rules = [
            word_embeddings,
            final_layer_norm,
            attention,
            rotary_inv_freq,
            attention_norm,
            post_attention_norm,
            moe_gate,
            *expert_rules,
            lm_head_only,
            ln_f_only,
        ]

    def replace_final_norm(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Copy the final layer-norm tensor to its converted key.

        Args:
            old_key: Key of the tensor in ``old_state_dict``.
            new_key: Converted key to write in ``new_state_dict``.
            old_state_dict: Source state dict.
            new_state_dict: Destination state dict (mutated in place).
            from_index: ``0`` when converting from the first format; in that
                case the tensor is also written under the matching
                ``ln_f.`` key.
            action_fn_args: Unused here.
        """
        new_state_dict[new_key] = old_state_dict[old_key]
        if from_index != 0:
            return

        # CS 1.7 has both "ln_f" and "transformer_decoder.norm", so the
        # original ("ln_f") entry has to be populated as well.
        legacy_key = re.sub(
            r"transformer_decoder\.norm\.", "ln_f.", new_key
        )
        new_state_dict[legacy_key] = old_state_dict[old_key]

    def moe_rules(self):
        """Return the expert-weight rules that ``__init__`` splices into
        ``self.rules``; currently the ones from ``moe_optimized_impl_rules``.
        """
        expert_rules = self.moe_optimized_impl_rules()
        return expert_rules

    def moe_functional_impl_rules(self):
        """Build one rename rule per expert projection (w1, w3, w2).

        Each rule maps ``block_sparse_moe.experts.<i>.<proj>.weight`` on the
        HF side to ``ffn.experts.experts.<i>.<cs path>.weight`` on the CS
        side via ``replaceKey``.

        Returns:
            list of ``ConversionRule`` in the order w1, w3, w2.
        """
        # (HF projection name, CS sub-module path), in rule order.
        projections = (
            ("w1", "ffn.0.linear_layer_for_glu"),
            ("w3", "ffn.0.linear_layer"),
            ("w2", "ffn.1.linear_layer"),
        )
        index = r"\.\d+\."
        return [
            ConversionRule(
                [
                    EquivalentSubkey("layers", "transformer_decoder.layers"),
                    index,
                    EquivalentSubkey(
                        "block_sparse_moe.experts", "ffn.experts.experts"
                    ),
                    index,
                    EquivalentSubkey(hf_name, cs_path),
                    r"\.weight",
                ],
                action=self.replaceKey,
            )
            for hf_name, cs_path in projections
        ]

    def moe_optimized_impl_rules(self):
        """Build the rules mapping per-expert HF weights to fused CS weights.

        For every projection (w1, w3, w2, in that order) two rules are
        emitted:

        1. A rule matching expert 0's HF weight (``.0.<proj>.weight``) whose
           action, ``convert_expert_weights``, handles all experts at once.
        2. A rule matching any expert index whose action,
           ``assert_already_converted``, only checks that the fused key is
           already present. Its CS-side subkey carries ``MAGIC_STR``, which
           that action strips again.
           NOTE(review): presumably the marker also keeps this rule from
           matching real CS keys -- confirm against the base converter.

        Both rules of a projection are derived from the same table entry so
        they always point at the same fused CS key. Previously the w2 check
        rule pointed at ``fused_ffns.0.linear_layer`` (the w3 tensor) instead
        of ``fused_ffns.1.linear_layer``.

        Returns:
            list of ``ConversionRule``.
        """
        # (HF projection name, CS fused sub-module path), in rule order.
        projections = (
            ("w1", "0.linear_layer_for_glu"),
            ("w3", "0.linear_layer"),
            ("w2", "1.linear_layer"),
        )

        rules = []
        for hf_name, cs_module in projections:
            fused_suffix = f".fused_ffns.{cs_module}.expert_weights"
            # Expert 0 triggers the actual fuse / unfuse of all experts.
            rules.append(
                ConversionRule(
                    [
                        EquivalentSubkey(
                            "layers", "transformer_decoder.layers"
                        ),
                        r"\.\d+\.",
                        EquivalentSubkey(
                            "block_sparse_moe.experts",
                            "ffn.experts",
                        ),
                        EquivalentSubkey(
                            f".0.{hf_name}.weight",
                            fused_suffix,
                        ),
                    ],
                    action=self.convert_expert_weights,
                )
            )
            # Any expert index: only verify the fused tensor already exists.
            rules.append(
                ConversionRule(
                    [
                        EquivalentSubkey(
                            "layers", "transformer_decoder.layers"
                        ),
                        r"\.\d+\.",
                        EquivalentSubkey(
                            "block_sparse_moe.experts",
                            f"ffn.experts{MAGIC_STR}",
                        ),
                        r"\.\d+",
                        EquivalentSubkey(
                            f".{hf_name}.weight",
                            fused_suffix,
                        ),
                    ],
                    action=self.assert_already_converted,
                )
            )
        return rules

    def convert_expert_weights(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Fuse per-expert weights into one tensor, or split them back.

        The number of experts is read from
        ``action_fn_args['configs'][1]['model']['moe']['num_experts']``.

        * ``from_index == 0``: ``old_key`` names expert 0; the tensors of all
          experts are gathered and joined along a new axis at dim 1, then
          stored under ``new_key``.
        * otherwise: the tensor at ``old_key`` is sliced along dim 1 and each
          slice is stored under ``new_key`` with ``experts.0`` rewritten to
          the expert's index.

        Args:
            old_key: Matched key in ``old_state_dict``.
            new_key: Converted key (expert-0 form when unfusing).
            old_state_dict: Source state dict.
            new_state_dict: Destination state dict (mutated in place).
            from_index: Index of the source format.
            action_fn_args: Dict providing ``'configs'``.
        """
        cs_model_config = action_fn_args['configs'][1]['model']
        num_experts = cs_model_config['moe']['num_experts']

        if from_index != 0:
            # Unfuse weights: one slice of dim 1 per expert.
            for expert in range(num_experts):
                target_key = re.sub(
                    r"experts\.0", f"experts.{expert}", new_key
                )
                new_state_dict[target_key] = old_state_dict[old_key][
                    :, expert
                ]
            return

        # Fuse weights across experts: give each tensor an expert axis at
        # dim 1, then join along it.
        per_expert = [
            old_state_dict[
                re.sub(r"experts\.0", f"experts.{expert}", old_key)
            ].unsqueeze(1)
            for expert in range(num_experts)
        ]
        new_state_dict[new_key] = torch.concat(per_expert, dim=1)

    def assert_already_converted(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Check that the fused tensor for this expert key already exists.

        ``new_key`` is expected to contain ``MAGIC_STR`` followed by
        ``.<expert index>``; that part is removed to obtain the fused key,
        which must already be present in ``new_state_dict``.

        Args:
            old_key: Matched key in the source state dict (unused).
            new_key: Converted key still carrying the marker and index.
            old_state_dict: Source state dict (unused).
            new_state_dict: Destination state dict that is inspected.
            from_index: Must be ``0``; any other value is treated as a bug.
            action_fn_args: Unused here.

        Raises:
            AssertionError: If the fused key is missing, or if the action is
                invoked with ``from_index != 0``.
        """
        if from_index != 0:
            # Raised explicitly so the check also fires under ``python -O``.
            raise AssertionError("Unreachable")

        # Raw pattern: "\d" is not a valid escape in a normal string, and
        # the "." before the index must match a literal dot only.
        marker_and_index = rf"{re.escape(MAGIC_STR)}\.\d+"
        new_key = re.sub(marker_and_index, "", new_key)
        if new_key not in new_state_dict:
            raise AssertionError(
                f"Expected {new_key} to be in new_state_dict"
            )

    def post_model_convert(
        self,
        old_state_dict,
        new_state_dict,
        configs,
        converter_indices,
        drop_unmatched_keys,
        key_prefix="",
    ):
        """Add a language-model head when converting HF -> CS, then defer
        to the base implementation.

        Args:
            old_state_dict: Source state dict.
            new_state_dict: Converted state dict (mutated in place).
            configs: Pair of configs; index 0 is read as the HF config and
                index 1 as the CS config (with a ``"model"`` section).
            converter_indices: Object whose ``direction`` is ``0`` for the
                HF -> CS direction.
            drop_unmatched_keys: Forwarded unchanged to the base class.
            key_prefix: Prefix prepended to the created ``lm_head`` keys.
        """
        if converter_indices.direction == 0:
            # We are converting from HF MixtralModel (which is headless) ->
            # CS GPT2LMHeadModel configured as Mixtral (which has a head).
            # We need to create 'lm_head' and init to default values
            logging.warning(
                f"{self.formats()[1]} has a language model head (lm_head) "
                f"while {self.formats()[0]} does not. Initializing lm_head to default."
            )
            hf_config = configs[0]
            cs_config = configs[1]
            use_bias_in_output = cs_config["model"].get(
                "use_bias_in_output", False
            )
            vocab_size = cs_config["model"]["vocab_size"]
            embed_dim = cs_config["model"]["hidden_size"]
            # A config that omits the flag is treated as untied instead of
            # failing with KeyError.
            if hf_config.get("tie_word_embeddings", False):
                lm_head_weight = old_state_dict['embed_tokens.weight']
            else:
                lm_head_weight = torch.zeros((vocab_size, embed_dim))
                lm_head_weight.normal_(mean=0.0, std=0.02)
            new_state_dict[key_prefix + "lm_head.weight"] = lm_head_weight
            if use_bias_in_output:
                lm_head_bias = torch.zeros(vocab_size)
                new_state_dict[key_prefix + "lm_head.bias"] = lm_head_bias
        super().post_model_convert(
            old_state_dict,
            new_state_dict,
            configs,
            converter_indices,
            drop_unmatched_keys,
            key_prefix=key_prefix,
        )

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS) format-version pair of this converter."""
        hf_versions = FormatVersions("hf")
        cs_versions = FormatVersions("cs")
        return (hf_versions, cs_versions)

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return ``None``: this class names no config converter itself."""
        config_converter_class = None
        return config_converter_class


class Converter_MixtralForCausalLM_HF_CS(BaseCheckpointConverter_HF_CS):
    """Checkpoint conversion rules for the Mixtral model with an LM head.

    ``lm_head`` weight/bias keys are handled by ``replaceKey``. Every other
    key is delegated to ``Converter_MixtralModel_HF_CS``, with the HF-side
    ``model.`` prefix paired with an empty CS-side prefix.
    """

    def __init__(self):
        super().__init__()
        lm_head_rule = ConversionRule(
            [r"lm_head\.(?:weight|bias)"],
            action=self.replaceKey,
        )
        backbone_rule = ConversionRule(
            [
                EquivalentSubkey("model.", ""),
                Converter_MixtralModel_HF_CS(),
            ],
            action=None,
        )
        self.rules = [lm_head_rule, backbone_rule]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS) format-version pair of this converter."""
        hf_versions = FormatVersions("hf")
        cs_versions = FormatVersions("cs")
        return (hf_versions, cs_versions)

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return ``None``: this class names no config converter itself."""
        config_converter_class = None
        return config_converter_class


class Converter_MixtralModel_WithoutOptionalModel_HF_CS23(
    Converter_MixtralModel_HF_CS
):
    """``Converter_MixtralModel_HF_CS`` bound to the HF and cs-2.3 / cs-2.4
    formats, with ``ConfigConverter_Mixtral_HF_CS23`` as config converter.
    """

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS) format versions this converter handles."""
        return (FormatVersions("hf"), FormatVersions("cs-2.3", "cs-2.4"))

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter class paired with this converter."""
        return ConfigConverter_Mixtral_HF_CS23

    @classmethod
    def converter_note(cls) -> str:
        """Return the human-readable description of this conversion.

        The text is built with an f-string only. The previous extra
        ``.format(...)`` call on the already-interpolated string was a no-op
        at best and would raise if the rendered text contained a brace.
        """
        formats = cls.formats()
        return (
            f"{formats[0]} MixtralModel <-> {formats[1]} GPT2LMHeadModel (configured as "
            "Mixtral)\nThe HF model doesn't contain a language model head while the CS one does. "
            "When converting to CS, the exported checkpoint will contain a language model head "
            "initialized to default random values. When converting to HF, the language model head "
            "will be dropped."
        )


class Converter_MixtralLMHeadModel_WithoutOptionalModel_HF_CS23(
    Converter_MixtralForCausalLM_HF_CS
):
    """``Converter_MixtralForCausalLM_HF_CS`` bound to the HF and
    cs-2.3 / cs-2.4 formats, with ``ConfigConverter_Mixtral_HF_CS23`` as
    config converter.
    """

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS) format versions this converter handles."""
        hf_versions = FormatVersions("hf")
        cs_versions = FormatVersions("cs-2.3", "cs-2.4")
        return (hf_versions, cs_versions)

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter class paired with this converter."""
        return ConfigConverter_Mixtral_HF_CS23

    @classmethod
    def converter_note(cls) -> str:
        """Return the human-readable description of this conversion."""
        hf_format = cls.formats()[0]
        cs_format = cls.formats()[1]
        return (
            f"{hf_format} MixtralForCausalLM <-> {cs_format} "
            "GPT2LMHeadModel (configured as Mixtral)"
        )


class ConfigConverter_Mixtral_HF_CS23(ConfigConverter_LLaMa_HF_CS21):
    """Config converter for Mixtral between HF and cs-2.3 / cs-2.4.

    Extends the LLaMa config rules with the sliding-window key and the
    mixture-of-experts parameters.
    """

    def __init__(self):
        # Set before the parent constructor runs, as in the original order.
        self.model_type = "mixtral"
        super().__init__()

        sliding_window_rule = ConversionRule(
            [
                EquivalentSubkey(
                    "sliding_window", "attention_sliding_window_length"
                )
            ],
            action=self.replaceKey,
        )
        moe_rule = ConversionRule(
            [EquivalentSubkey("num_local_experts", "moe")],
            action=self.convert_moe_params,
        )
        # The Mixtral-specific rules go in front of the inherited ones.
        inherited_rules = self.rules
        self.rules = [sliding_window_rule, moe_rule, *inherited_rules]

        self.post_convert_defaults[0].update({"model_type": "mixtral"})

    def convert_moe_params(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Translate the MoE settings between flat HF keys and the CS
        ``moe`` sub-dict.

        With ``from_index == 0`` the three HF keys are copied into
        ``new_state_dict["moe"]`` (created if absent); otherwise the three
        entries of ``old_state_dict["moe"]`` are copied to the flat HF keys.
        ``old_key``, ``new_key`` and ``action_fn_args`` are not used.
        """
        # (HF key, key inside the CS "moe" dict), in assignment order.
        key_pairs = (
            ("num_local_experts", "num_experts"),
            ("num_experts_per_tok", "top_k"),
            ("router_aux_loss_coef", "load_balancing_loss_coef"),
        )

        if from_index == 0:
            moe_config = new_state_dict.setdefault("moe", {})
            for hf_key, cs_key in key_pairs:
                moe_config[cs_key] = old_state_dict[hf_key]
            return

        for hf_key, cs_key in key_pairs:
            new_state_dict[hf_key] = old_state_dict["moe"][cs_key]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS) format versions this converter handles."""
        hf_versions = FormatVersions("hf")
        cs_versions = FormatVersions("cs-2.3", "cs-2.4")
        return (hf_versions, cs_versions)


# Public converter for the headless Mixtral model, produced by the
# `Build_HF_CS_Converter_WithOptionalModel` helper from the
# "WithoutOptionalModel" class above (passed both positionally and as
# `derived_class`).
# NOTE(review): presumably the helper makes the HF "model." key prefix
# optional -- confirm against checkpoint_converters.helper.
Converter_MixtralModel_HF_CS23 = Build_HF_CS_Converter_WithOptionalModel(
    "Converter_MixtralModel_HF_CS23",
    Converter_MixtralModel_WithoutOptionalModel_HF_CS23,
    derived_class=Converter_MixtralModel_WithoutOptionalModel_HF_CS23,
)

# Same construction for the Mixtral model with a language-model head.
Converter_MixtralForCausalLM_HF_CS23 = Build_HF_CS_Converter_WithOptionalModel(
    "Converter_MixtralForCausalLM_HF_CS23",
    Converter_MixtralLMHeadModel_WithoutOptionalModel_HF_CS23,
    derived_class=Converter_MixtralLMHeadModel_WithoutOptionalModel_HF_CS23,
)