# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
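"""Falcon-180B checkpoint and config converters between Hugging Face and
Cerebras Model Zoo formats (CS 2.0 and CS 2.1/2.2)."""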
from typing import Tuple
from cerebras.modelzoo.tools.checkpoint_converters.base_converter import (
    BaseCheckpointConverter_HF_CS,
    BaseConfigConverter,
    BaseConfigConverter_HF_CS,
    ConfigConversionError,
    ConversionRule,
    EquivalentSubkey,
    FormatVersions,
)
from cerebras.modelzoo.tools.checkpoint_converters.falcon_40b import (
    Converter_Falcon_40B_Headless_HF_CS20,
    Converter_Falcon_40B_HF_CS20,
)
from cerebras.modelzoo.tools.checkpoint_converters.helper import (
    Build_HF_CS_Converter_WithOptionalModel,
)
class Converter_Falcon_180B_Headless_HF_CS20(
    Converter_Falcon_40B_Headless_HF_CS20
):
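    """Headless (no LM head) Falcon-180B checkpoint converter, HF <-> CS 2.0.

    Extends the Falcon-40B headless converter with the 180B-specific untied
    layernorms (input_layernorm / post_attention_layernorm <-> norm1 / norm3)
    and ignores the CS-side relative_pe_helper.slopes (alibi slopes) key.
    """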
    def __init__(self):
        super().__init__()
        self.rules = [
            # Drop alibi slopes
            ConversionRule(
                [r"relative_pe_helper\.slopes"],
                exists="right",
                action=None,
            ),
            # 180B-specific layernorms:
            ConversionRule(
                [
                    EquivalentSubkey("h", "transformer_decoder.layers"),
                    r"\.\d+\.",
                    EquivalentSubkey("input_layernorm.", "norm1."),
                    r"(?:weight|bias)",
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("h", "transformer_decoder.layers"),
                    r"\.\d+\.",
                    EquivalentSubkey("post_attention_layernorm.", "norm3."),
                    r"(?:weight|bias)",
                ],
                action=self.replaceKey,
            ),
            *self.rules,
        ] 
    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        return ConfigConverter_Falcon_180B_HF_CS20 
class Converter_Falcon_180B_HF_CS20(Converter_Falcon_40B_HF_CS20):
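    """Falcon-180B checkpoint converter (with LM head), HF <-> CS 2.0."""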
    def __init__(self):
        super().__init__()
        self.rules = [
            ConversionRule(
                [
                    "lm_head",
                    r"\.(?:weight|bias)",
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("transformer.", ""),
                    Converter_Falcon_180B_Headless_HF_CS20(),
                ],
                action=None,
            ),
        ] 
    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        return ConfigConverter_Falcon_180B_HF_CS20 
class ConfigConverter_Falcon_180B_HF_CS20(BaseConfigConverter_HF_CS):
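    """Falcon-180B config converter between HF and CS 2.0.

    Notable mappings: HF num_kv_heads <-> CS extra_attention_params
    ("num_kv_groups"), HF's single `bias` flag <-> the per-module CS bias
    flags, and Falcon's implicit filter_size = 4 * hidden_size.
    """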
    def __init__(self):
        super().__init__()
        self.rules = [
            ConversionRule(
                ["model_type"],
                action=BaseConfigConverter.assert_factory_fn(0, "falcon"),
            ),
            # Embedding
            ConversionRule(["vocab_size"], action=self.replaceKey),
            ConversionRule(
                [EquivalentSubkey("alibi", "position_embedding_type")],
                action=self.convert_position_embedding_type,
            ),
            ConversionRule(
                ["rope_theta"],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey(
                        "tie_word_embeddings", "share_embedding_weights"
                    )
                ],
                action=self.replaceKey,
            ),
            # Decoder Block
            ConversionRule(
                ["hidden_size"],
                action=self.convert_hidden_size,
            ),
            ConversionRule(
                [EquivalentSubkey("num_attention_heads", "num_heads")],
                action=self.replaceKey,
            ),
            ConversionRule(
                [EquivalentSubkey("num_kv_heads", "extra_attention_params")],
                action=self.convert_num_head_groups,
            ),
            ConversionRule(
                ["num_hidden_layers"],
                action=self.replaceKey,
            ),
            ConversionRule(
                ["max_position_embeddings"],
                action=self.replaceKey,
            ),
            ConversionRule(
                ["nonlinearity"],
                exists="right",
                action=BaseConfigConverter.assert_factory_fn(1, "gelu"),
            ),
            ConversionRule(
                [
                    EquivalentSubkey(
                        "attention_dropout", "attention_dropout_rate"
                    )
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                [EquivalentSubkey("hidden_dropout", "residual_dropout_rate")],
                action=self.replaceKey,
            ),
            ConversionRule(
                ["layer_norm_epsilon"],
                action=self.replaceKey,
            ),
            ConversionRule(
                ["use_bias_in_output"],
                action=BaseConfigConverter.assert_factory_fn(1, False),
            ),
            ConversionRule(
                ["initializer_range"],
                action=self.replaceKey,
            ),
            ConversionRule(
                ["bias"],
                exists="left",
                action=self.convert_bias,
            ),
            ConversionRule(
                ["use_projection_bias_in_attention"],
                exists="right",
                action=self.convert_bias,
            ),
            ConversionRule(
                ["use_ffn_bias_in_attention"],
                exists="right",
                action=self.convert_bias,
            ),
            ConversionRule(
                ["use_ffn_bias"],
                exists="right",
                action=self.convert_bias,
            ),
            ConversionRule(
                ["alibi"],
                exists="left",
                action=BaseConfigConverter.assert_factory_fn(0, False),
            ),
            ConversionRule(
                ["new_decoder_architecture"],
                exists="left",
                action=self.assert_new_decoder_arch_and_parallel,
            ),
            ConversionRule(
                ["multi_query"], exists="left", action=self.convert_multi_query
            ),
            ConversionRule(
                ["parallel_attn"],
                exists="left",
                action=self.assert_new_decoder_arch_and_parallel,
            ),
            ConversionRule(
                ["use_untied_layer_norm"], exists="right", action=None
            ),
            ConversionRule(
                ["attention_module"],
                exists="right",
                action=BaseConfigConverter.assert_factory_fn(
                    1, "multiquery_attention"
                ),
            ),
            ConversionRule(
                ["attention_type"],
                exists="right",
                action=BaseConfigConverter.assert_factory_fn(
                    1, "scaled_dot_product"
                ),
            ),
        ]
        self.pre_convert_defaults[0].update(
            {
                "vocab_size": 65024,
                "hidden_size": 4544,
                "num_hidden_layers": 32,
                "num_attention_heads": 71,
                "layer_norm_epsilon": 1e-5,
                "initializer_range": 0.02,
                "use_cache": True,
                "hidden_dropout": 0.0,
                "attention_dropout": 0.0,
                "num_kv_heads": None,
                "alibi": False,
                "new_decoder_architecture": False,
                "multi_query": True,
                "parallel_attn": True,
                "bias": False,
                "max_position_embeddings": 2048,
                "rope_theta": 10000.0,
                "rope_scaling": None,
                "bos_token_id": 11,
                "eos_token_id": 11,
            }
        )
        self.pre_convert_defaults[1].update(
            {
                "position_embedding_type": "rotary",
                "rope_theta": 10000.0,
                "embedding_dropout_rate": 0.1,
                "share_embedding_weights": True,
                "nonlinearity": "gelu",
                "max_position_embeddings": 1024,
                "attention_module": "aiayn_attention",
                "attention_type": "scaled_dot_product",
                "use_untied_layer_norm": False,
                "extra_attention_params": {"num_kv_groups": 1},
            }
        )
        self.post_convert_defaults[0].update(
            {
                "model_type": "falcon",
                "new_decoder_architecture": True,
                "multi_query": True,
            }
        )
        self.post_convert_defaults[1].update(
            {
                "embedding_dropout_rate": 0.0,
                "share_embedding_weights": True,
                "nonlinearity": "gelu",
                "max_position_embeddings": 2048,
                "attention_module": "multiquery_attention",
                "attention_type": "scaled_dot_product",
                "use_untied_layer_norm": True,
                "extra_attention_params": {"num_kv_groups": 1},
                "loss_scaling": "num_tokens",
                "use_projection_bias_in_attention": False,
                "use_ffn_bias_in_attention": False,
                "use_ffn_bias": False,
                "use_bias_in_output": False,
            }
        ) 
    def assert_new_decoder_arch_and_parallel(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
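        """Reject HF configs where both new_decoder_architecture and
        parallel_attn are False.
        """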
        assert from_index == 0, f"'{old_key}' should only appear in HF configs"
        if (
            not old_state_dict["new_decoder_architecture"]
            and not old_state_dict["parallel_attn"]
        ):
            raise ConfigConversionError(
                "HF config must have either new_decoder_architecture or parallel_attn as True"
            )
    def convert_multi_query(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
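        """Map HF's legacy multi_query flag onto a single CS KV group.

        Only relevant when new_decoder_architecture is False; otherwise the
        number of KV groups is taken from num_kv_heads.
        """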
        assert from_index == 0, f"'{old_key}' should only appear in HF configs"
        if (
            not old_state_dict["new_decoder_architecture"]
            and old_state_dict["multi_query"]
        ):
            if "extra_attention_params" not in new_state_dict:
                new_state_dict["extra_attention_params"] = {}
            new_state_dict["extra_attention_params"].update(
                {"num_kv_groups": 1}
            )
    def convert_num_head_groups(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
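        """Convert HF num_kv_heads <-> CS num_kv_groups.

        When new_decoder_architecture is False, multi_query=True implies one
        KV group and multi_query=False implies one KV group per head.
        """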
        if from_index == 0:
            kv_groups = old_state_dict[old_key]
            if (
                not old_state_dict["new_decoder_architecture"]
                and old_state_dict["multi_query"]
            ):
                kv_groups = 1
            elif (
                not old_state_dict["new_decoder_architecture"]
                and not old_state_dict["multi_query"]
            ):
                kv_groups = old_state_dict["num_attention_heads"]
            new_state_dict[new_key] = {"num_kv_groups": kv_groups}
        elif from_index == 1:
            new_state_dict[new_key] = old_state_dict[old_key]["num_kv_groups"]
    def convert_bias(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
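        """Fan HF's single `bias` flag out to the three CS bias flags.

        The reverse (CS -> HF) direction requires all three CS flags to agree.
        """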
        if from_index == 0:
            new_state_dict["use_projection_bias_in_attention"] = old_state_dict[
                old_key
            ]
            new_state_dict["use_ffn_bias_in_attention"] = old_state_dict[
                old_key
            ]
            new_state_dict["use_ffn_bias"] = old_state_dict[old_key]
        else:
            if (
                old_state_dict["use_projection_bias_in_attention"]
                != old_state_dict["use_ffn_bias_in_attention"]
                or old_state_dict["use_ffn_bias_in_attention"]
                != old_state_dict["use_ffn_bias"]
            ):
                raise ConfigConversionError(
                    "All of the following CS parameters must be set the same in order to convert "
                    "to HF: use_projection_bias_in_attention, use_ffn_bias_in_attention, "
                    "use_ffn_bias"
                )
            new_state_dict[new_key] = old_state_dict[old_key]
    def convert_position_embedding_type(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
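        """Convert HF's `alibi` flag <-> CS position_embedding_type.

        Only rotary embeddings are supported; alibi (HF) or any non-rotary
        CS setting raises a ConfigConversionError.
        """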
        if from_index == 0:
            if old_state_dict[old_key]:
                raise ConfigConversionError(
                    "CS model doesn't support falcon with position_embedding_type = alibi"
                )
            new_state_dict[new_key] = "rotary"
        else:
            if old_state_dict[old_key] not in ["rotary"]:
                raise ConfigConversionError(
                    f"HF model doesn't support falcon with position_embedding_type = "
                    f"{old_state_dict[old_key]}"
                )
            new_state_dict[new_key] = old_state_dict[old_key] == "alibi"
    def convert_hidden_size(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
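        """Copy hidden_size, enforcing filter_size = 4 * hidden_size."""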
        new_state_dict[new_key] = old_state_dict[old_key]
        if from_index == 0:
            # Falcon uses 4 * hidden as intermediate size
            new_state_dict["filter_size"] = old_state_dict[old_key] * 4
        else:
            assert (
                old_state_dict[old_key] * 4 == old_state_dict["filter_size"]
            ), "HF model only supports filter_size = 4 * hidden_size"
    def parallel_attn_convert(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        if not old_state_dict[old_key]:
            raise ConfigConversionError(
                "parallel attention must be enabled for Falcon-180B"
            )
        new_state_dict[new_key] = True
    def post_config_convert(
        self,
        original_config,
        old_config,
        new_config,
        converter_indices,
        drop_unmatched_keys,
    ):
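        """Derive fields that have no one-to-one rule.

        HF -> CS: sets rotary_dim (= head_dim) and use_untied_layer_norm.
        CS -> HF: validates dropout/rotary settings and reconstructs
        parallel_attn, new_decoder_architecture, and multi_query.
        """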
        if converter_indices.direction == 0:
            # falcon uses rotary_dim == head_dim
            new_config["rotary_dim"] = (
                old_config["hidden_size"] // old_config["num_attention_heads"]
            )
            new_config["use_untied_layer_norm"] = (
                old_config["new_decoder_architecture"]
                or not old_config["parallel_attn"]
            )
        else:
            # embedding dropout check
            assert (
                old_config["embedding_dropout_rate"] == 0.0
            ), "Falcon has no embedding dropout"
            # rotary check
            assert (
                old_config["rotary_dim"]
                == old_config["hidden_size"] // old_config["num_heads"]
            ), "rotary dimension of falcon is equal to head_dim"
            new_config["parallel_attn"] = True
            if not old_config["use_untied_layer_norm"]:
                new_config["new_decoder_architecture"] = False
                if new_config["num_kv_heads"] == 1:
                    new_config["multi_query"] = True
                elif (
                    new_config["num_kv_heads"]
                    == new_config["num_attention_heads"]
                ):
                    new_config["multi_query"] = False
                else:
                    raise ConfigConversionError(
                        "HF's falcon doesn't support use_untied_layer_norm=False"
                        "with grouped query attention (i.e. num_kv_groups != "
                        "num_heads and num_kv_groups != 1"
                    )
            else:
                new_config["new_decoder_architecture"] = True
                new_config["multi_query"] = new_config["num_kv_heads"] == 1
        return super().post_config_convert(
            original_config,
            old_config,
            new_config,
            converter_indices,
            drop_unmatched_keys,
        )
    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        return (FormatVersions("hf"), FormatVersions("cs-2.0")) 
###########################################################
# In CS 2.1, we refactored the embedding layer.
# CS 2.0 <> CS 2.1, and HF <> CS 2.1 converters:
###########################################################
class ConfigConverter_Falcon_180B_HF_CS21(ConfigConverter_Falcon_180B_HF_CS20):
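    """Falcon-180B config converter for CS 2.1/2.2.

    Adds the HF rope_scaling <-> CS pos_scaling_factor mapping on top of the
    CS 2.0 rules.
    """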
    def __init__(self):
        super().__init__()
        self.rules = [
            ConversionRule(
                [EquivalentSubkey("rope_scaling", "pos_scaling_factor")],
                action=self.convert_pi,
            ),
            *self.rules,
        ]
        self.pre_convert_defaults[0].update(
            {
                "rope_scaling": None,
            }
        )
        self.pre_convert_defaults[1].update(
            {
                "pos_scaling_factor": 1.0,
            },
        ) 
    def convert_pi(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
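        """Convert HF rope_scaling <-> CS pos_scaling_factor.

        Only linear position-interpolation scaling is supported.
        """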
        if from_index == 0:
            if old_state_dict[old_key] is None:
                new_state_dict[new_key] = 1.0
            else:
                scaling_type = old_state_dict[old_key]["type"].lower()
                if scaling_type != "linear":
                    raise ValueError(
                        f"Only `rope_scaling` type `linear` is currently supported, "
                        f"but got type `{scaling_type}`."
                    )
                new_state_dict[new_key] = old_state_dict[old_key]["factor"]
        else:
            if old_state_dict[old_key] == 1.0:
                new_state_dict[new_key] = None
            else:
                new_state_dict[new_key] = {
                    "type": "linear",
                    "factor": old_state_dict[old_key],
                }
    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        return (FormatVersions("hf"), FormatVersions("cs-2.1", "cs-2.2")) 
class Converter_Falcon_180B_Headless_WithoutOptionalModel_HF_CS21(
    Converter_Falcon_180B_Headless_HF_CS20
):
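    """Headless Falcon-180B converter for CS 2.1/2.2.

    Same as the CS 2.0 converter, plus a rule that ignores the CS-side
    embedding_layer.position_embed_helper.slopes key introduced by the
    CS 2.1 embedding layer refactor.
    """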
    def __init__(self):
        super().__init__()
        self.rules = [
            ConversionRule(
                ["embedding_layer\.position_embed_helper\.slopes"],
                exists="right",
                action=None,
            ),
            *self.rules,
        ]  
Converter_Falcon_180B_Headless_HF_CS21 = Build_HF_CS_Converter_WithOptionalModel(
    "Converter_Falcon_180B_Headless_HF_CS21",
    Converter_Falcon_180B_Headless_WithoutOptionalModel_HF_CS21,
    derived_class=Converter_Falcon_180B_Headless_WithoutOptionalModel_HF_CS21,
    config_converter_class=ConfigConverter_Falcon_180B_HF_CS21,
    formats=(FormatVersions("hf"), FormatVersions("cs-2.1", "cs-2.2")),
)
class Converter_Falcon_180B_WithoutOptionalModel_HF_CS21(
    BaseCheckpointConverter_HF_CS
):
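    """Falcon-180B checkpoint converter (with LM head), HF <-> CS 2.1/2.2."""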
    def __init__(self):
        super().__init__()
        self.rules = [
            ConversionRule(
                ["lm_head\.(?:weight|bias)"],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("transformer.", ""),
                    Converter_Falcon_180B_Headless_WithoutOptionalModel_HF_CS21(),
                ],
                action=None,
            ),
        ] 
    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        return (FormatVersions("hf"), FormatVersions("cs-2.1", "cs-2.2"))
    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        return ConfigConverter_Falcon_180B_HF_CS21 
Converter_Falcon_180B_HF_CS21 = Build_HF_CS_Converter_WithOptionalModel(
    "Converter_Falcon_180B_HF_CS21",
    Converter_Falcon_180B_WithoutOptionalModel_HF_CS21,
    derived_class=Converter_Falcon_180B_WithoutOptionalModel_HF_CS21,
)