# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from typing import Tuple
from cerebras.modelzoo.tools.checkpoint_converters.base_converter import (
BaseCheckpointConverter_CS_CS,
BaseCheckpointConverter_HF_CS,
BaseConfigConverter,
ConfigConversionError,
ConversionRule,
EquivalentSubkey,
FormatIndices,
FormatVersions,
)
from cerebras.modelzoo.tools.checkpoint_converters.bert import (
ConfigConverter_Bert_CS16_CS17,
ConfigConverter_Bert_CS16_CS18,
ConfigConverter_Bert_HF_CS17,
ConfigConverter_Bert_HF_CS18,
Converter_BertModel_CS16_CS17,
Converter_BertModel_WithoutOptionalModel_HF_CS21,
)
from cerebras.modelzoo.tools.checkpoint_converters.helper import (
Build_HF_CS_Converter_WithOptionalModel,
)
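# This module defines checkpoint and config converters for BERT fine-tuning
# models (sequence classification, token classification, question answering,
# and summarization) between Hugging Face ("hf") and Cerebras ("cs-x.y")
# formats. Each converter declares a list of ConversionRule objects: a rule
# either delegates a key prefix to a shared sub-converter (e.g. keys starting
# with "bert." are handled by the BertModel converter) or maps a single key
# via an action such as replaceKey. These converters are normally driven by
# the Model Zoo's checkpoint-conversion tooling rather than instantiated by
# hand; a small, hedged inspection sketch is included at the bottom of the
# file.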
class Converter_BertFinetuneModel_CS16_CS17(BaseCheckpointConverter_CS_CS):
def __init__(self):
super().__init__()
self.rules = [
ConversionRule(
[
r"bert\.",
Converter_BertModel_CS16_CS17(),
],
),
ConversionRule(
[r"classifier\.(?:weight|bias)"],
action=self.replaceKey,
),
]
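        # Illustrative mapping produced by the rules above (key names are
        # examples only): a key such as "bert.embeddings.word_embeddings.weight"
        # matches the r"bert\." rule and its remainder is converted by
        # Converter_BertModel_CS16_CS17, while "classifier.weight" and
        # "classifier.bias" match the second rule and are copied unchanged.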
def pre_checkpoint_convert(
self,
input_checkpoint,
output_checkpoint,
configs: Tuple[dict, dict],
converter_indices: FormatIndices,
):
        # Don't copy non-model keys like optimizer state:
logging.warning(
"The Bert model changed significantly between {} and {}. As a result, the"
" optimizer state won't be included in the converted checkpoint.".format(
*self.formats()
)
)
output_checkpoint["model"] = {}
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (FormatVersions("cs-1.6"), FormatVersions("cs-1.7"))
@classmethod
def converter_note(cls) -> str:
return (
"BertForSequenceClassification, BertForTokenClassification, "
"BertForQuestionAnswering, and BertForSummarization classes"
)
@staticmethod
def get_config_converter_class() -> BaseConfigConverter:
return ConfigConverter_Bert_CS16_CS17
class Converter_BertFinetuneModel_CS16_CS18(BaseCheckpointConverter_CS_CS):
def __init__(self):
super().__init__()
self.rules = [
            # Catch checkpoints from PyTorch 2.0 API
ConversionRule(
[
Converter_BertFinetuneModel_CS16_CS17(),
],
action=None,
),
# Catch checkpoints from 1.7/1.8
ConversionRule(
[
EquivalentSubkey("", "model."),
Converter_BertFinetuneModel_CS16_CS17(),
],
action=None,
),
]
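        # The two rules differ only in the EquivalentSubkey("", "model.")
        # entry: it equates keys with no prefix on the cs-1.6 side with keys
        # nested under a top-level "model." prefix on the cs-1.8+ side, so
        # both wrapped and unwrapped checkpoints are handled.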
def pre_checkpoint_convert(
self,
input_checkpoint,
output_checkpoint,
configs: Tuple[dict, dict],
converter_indices: FormatIndices,
):
        # Don't copy non-model keys like optimizer state:
logging.warning(
"The Bert model changed significantly between {} and {}. As a result, the"
" optimizer state won't be included in the converted checkpoint.".format(
*self.formats()
)
)
output_checkpoint["model"] = {}
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (
FormatVersions("cs-1.6"),
FormatVersions("cs-1.8", "cs-1.9", "cs-2.0"),
)
@classmethod
def converter_note(cls) -> str:
return (
"BertForSequenceClassification, BertForTokenClassification, "
"BertForQuestionAnswering, and BertForSummarization classes"
)
@staticmethod
def get_config_converter_class() -> BaseConfigConverter:
return ConfigConverter_Bert_CS16_CS18
class Converter_BertForSequenceClassification_HF_CS17(
Converter_BertFinetuneModel_CS16_CS17, BaseCheckpointConverter_HF_CS
):
def pre_checkpoint_convert(self, *args):
return BaseCheckpointConverter_HF_CS.pre_checkpoint_convert(
self,
*args,
)
def extract_model_dict(self, *args):
return BaseCheckpointConverter_HF_CS.extract_model_dict(self, *args)
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (FormatVersions("hf"), FormatVersions("cs-1.7"))
@classmethod
def converter_note(cls) -> str:
return "{} <-> {} for BertForSequenceClassification".format(
cls.formats()[0], cls.formats()[1]
)
@staticmethod
def get_config_converter_class() -> BaseConfigConverter:
return ConfigConverter_BertForSequenceClassification_HF_CS17
class Converter_BertForSequenceClassification_HF_CS18(
BaseCheckpointConverter_HF_CS
):
def __init__(self):
super().__init__()
self.rules = [
            # Catch checkpoints from PyTorch 2.0 API
ConversionRule(
[
Converter_BertForSequenceClassification_HF_CS17(),
],
action=None,
),
# Catch checkpoints from 1.7/1.8
ConversionRule(
[
EquivalentSubkey("", "model."),
Converter_BertForSequenceClassification_HF_CS17(),
],
action=None,
),
]
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (
FormatVersions("hf"),
FormatVersions("cs-1.8", "cs-1.9", "cs-2.0"),
)
@classmethod
def converter_note(cls) -> str:
return "{} <-> {} for BertForSequenceClassification".format(
cls.formats()[0], cls.formats()[1]
)
@staticmethod
def get_config_converter_class() -> BaseConfigConverter:
return ConfigConverter_BertForSequenceClassification_HF_CS18
class ConfigConverter_BertForSequenceClassification_HF_CS17(
ConfigConverter_Bert_HF_CS17
):
def __init__(self):
super().__init__()
self.rules = [
# Fine-tuning config params
ConversionRule(
[EquivalentSubkey("classifier_dropout", "task_dropout")],
action=self.replaceKey,
),
ConversionRule(["num_labels"], action=self.replaceKey),
ConversionRule(["problem_type"], action=self.replaceKey),
*self.rules,
]
def pre_config_convert(
self,
model,
config,
converter_indices,
):
config = super().pre_config_convert(model, config, converter_indices)
# pylint: disable=line-too-long
# From https://github.com/huggingface/transformers/blob/23c146c38b42d1193849fbd6f2943bf754b7c428/src/transformers/models/bert/modeling_bert.py#L1579
if converter_indices.direction == 0:
if "num_labels" not in config:
if "id2label" in config:
config["num_labels"] = len(config["id2label"])
else:
config["num_labels"] = 2
if (
"classifier_dropout" not in config
or config["classifier_dropout"] is None
):
config["classifier_dropout"] = config["hidden_dropout_prob"]
if "problem_type" not in config or config["problem_type"] is None:
if config["num_labels"] == 1:
config["problem_type"] = "regression"
else:
raise ConfigConversionError(
"Cannot infer the problem_type (it is either single_label_classification "
"or multi_label_classification). Please explicitly include the "
"problem_type field before re-running."
)
return config
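    # Worked example of the HF-side defaults above: an HF config with
    # id2label = {0: "A", 1: "B", 2: "C"}, no explicit num_labels, and no
    # classifier_dropout ends up with num_labels=3 and classifier_dropout
    # equal to hidden_dropout_prob before the rules map these to num_labels
    # and task_dropout on the CS side. With num_labels == 1 and no
    # problem_type, "regression" is inferred; otherwise problem_type must be
    # set explicitly or a ConfigConversionError is raised.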
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (FormatVersions("hf"), FormatVersions("cs-1.7"))
class ConfigConverter_BertForSequenceClassification_HF_CS18(
ConfigConverter_BertForSequenceClassification_HF_CS17,
ConfigConverter_Bert_HF_CS18,
):
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (
FormatVersions("hf"),
FormatVersions("cs-1.8", "cs-1.9", "cs-2.0"),
)
class Converter_BertForTokenClassification_HF_CS17(
Converter_BertFinetuneModel_CS16_CS17, BaseCheckpointConverter_HF_CS
):
def pre_checkpoint_convert(
self,
*args,
):
return BaseCheckpointConverter_HF_CS.pre_checkpoint_convert(
self,
*args,
)
def extract_model_dict(self, *args):
return BaseCheckpointConverter_HF_CS.extract_model_dict(self, *args)
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (FormatVersions("hf"), FormatVersions("cs-1.7"))
@classmethod
def converter_note(cls) -> str:
return "{} <-> {} for BertForTokenClassification".format(
cls.formats()[0], cls.formats()[1]
)
@staticmethod
def get_config_converter_class() -> BaseConfigConverter:
return ConfigConverter_BertForTokenClassification_HF_CS17
class Converter_BertForTokenClassification_HF_CS18(
BaseCheckpointConverter_HF_CS
):
def __init__(self):
super().__init__()
self.rules = [
            # Catch checkpoints from PyTorch 2.0 API
ConversionRule(
[
Converter_BertForTokenClassification_HF_CS17(),
],
action=None,
),
# Catch checkpoints from 1.7/1.8
ConversionRule(
[
EquivalentSubkey("", "model."),
Converter_BertForTokenClassification_HF_CS17(),
],
action=None,
),
]
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (
FormatVersions("hf"),
FormatVersions("cs-1.8", "cs-1.9", "cs-2.0"),
)
@classmethod
def converter_note(cls) -> str:
return "{} <-> {} for BertForTokenClassification".format(
cls.formats()[0], cls.formats()[1]
)
@staticmethod
def get_config_converter_class() -> BaseConfigConverter:
return ConfigConverter_BertForTokenClassification_HF_CS18
class ConfigConverter_BertForTokenClassification_HF_CS17(
ConfigConverter_Bert_HF_CS17
):
def __init__(self):
super().__init__()
self.rules = [
# Fine-tuning config params
ConversionRule(
[
EquivalentSubkey(
"classifier_dropout", "encoder_output_dropout_rate"
)
],
action=self.replaceKey,
),
ConversionRule(
[EquivalentSubkey("num_labels", "num_classes")],
action=self.replaceKey,
),
*self.rules,
]
def pre_config_convert(
self,
model,
config,
converter_indices,
):
config = super().pre_config_convert(model, config, converter_indices)
        # Additional fine-tuning-specific defaults:
if converter_indices.direction == 0:
if "num_labels" not in config:
if "id2label" in config:
config["num_labels"] = len(config["id2label"])
else:
config["num_labels"] = 2
if (
"classifier_dropout" not in config
or config["classifier_dropout"] is None
):
config["classifier_dropout"] = config["hidden_dropout_prob"]
return config
def post_config_convert(
self,
model,
original_config,
old_config,
new_config,
converter_indices,
drop_unmatched_keys,
):
if converter_indices.direction == 0:
if "loss_weight" not in new_config:
new_config["loss_weight"] = 1.0
if "include_padding_in_loss" not in new_config:
new_config["include_padding_in_loss"] = False
return super().post_config_convert(
model,
original_config,
old_config,
new_config,
converter_indices,
drop_unmatched_keys,
)
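    # When converting HF -> CS (direction == 0), the CS token-classification
    # model also expects loss_weight and include_padding_in_loss, which have
    # no HF counterpart, so defaults are filled in above when they are absent.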
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (FormatVersions("hf"), FormatVersions("cs-1.7"))
class ConfigConverter_BertForTokenClassification_HF_CS18(
ConfigConverter_BertForTokenClassification_HF_CS17,
ConfigConverter_Bert_HF_CS18,
):
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (
FormatVersions("hf"),
FormatVersions("cs-1.8", "cs-1.9", "cs-2.0"),
)
class Converter_BertForQuestionAnswering_HF_CS17(BaseCheckpointConverter_HF_CS):
def __init__(self):
super().__init__()
self.rules = [
ConversionRule(
[
r"bert\.",
Converter_BertModel_CS16_CS17(),
],
),
ConversionRule(
[
EquivalentSubkey("qa_outputs", "classifier"),
r"\.(?:weight|bias)",
],
action=self.replaceKey,
),
]
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (FormatVersions("hf"), FormatVersions("cs-1.7"))
@classmethod
def converter_note(cls) -> str:
return "{} <-> {} for BertForQuestionAnswering".format(
cls.formats()[0], cls.formats()[1]
)
@staticmethod
def get_config_converter_class() -> BaseConfigConverter:
return ConfigConverter_BertForQuestionAnswering_HF_CS17
class Converter_BertForQuestionAnswering_HF_CS18(BaseCheckpointConverter_HF_CS):
def __init__(self):
super().__init__()
self.rules = [
            # Catch checkpoints from PyTorch 2.0 API
ConversionRule(
[
Converter_BertForQuestionAnswering_HF_CS17(),
],
action=None,
),
# Catch checkpoints from 1.7/1.8
ConversionRule(
[
EquivalentSubkey("", "model."),
Converter_BertForQuestionAnswering_HF_CS17(),
],
action=None,
),
]
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (
FormatVersions("hf"),
FormatVersions("cs-1.8", "cs-1.9", "cs-2.0"),
)
@classmethod
def converter_note(cls) -> str:
return "{} <-> {} for BertForQuestionAnswering".format(
cls.formats()[0], cls.formats()[1]
)
@staticmethod
def get_config_converter_class() -> BaseConfigConverter:
return ConfigConverter_BertForQuestionAnswering_HF_CS18
class ConfigConverter_BertForQuestionAnswering_HF_CS17(
ConfigConverter_Bert_HF_CS17
):
def __init__(self):
super().__init__()
self.rules = [
# Fine-tuning config params
ConversionRule(
["num_labels"],
action=BaseConfigConverter.assert_factory_fn(0, 2),
),
*self.rules,
]
self.post_convert_defaults[0].update({"num_labels": 2})
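        # BertForQuestionAnswering predicts start/end span logits, so the HF
        # config is expected to have num_labels == 2; the assert above rejects
        # anything else, and num_labels=2 is restored as a default on the HF
        # side when converting back from CS.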
def pre_config_convert(
self,
model,
config,
converter_indices,
):
config = super().pre_config_convert(model, config, converter_indices)
        # Additional fine-tuning-specific defaults:
if converter_indices.direction == 0:
if "num_labels" not in config:
if "id2label" in config:
config["num_labels"] = len(config["id2label"])
else:
config["num_labels"] = 2
return config
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (FormatVersions("hf"), FormatVersions("cs-1.7"))
class ConfigConverter_BertForQuestionAnswering_HF_CS18(
ConfigConverter_BertForQuestionAnswering_HF_CS17,
ConfigConverter_Bert_HF_CS18,
):
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (
FormatVersions("hf"),
FormatVersions("cs-1.8", "cs-1.9", "cs-2.0"),
)
###########################################################
# In CS 2.1, we refactored the embedding layer.
# CS 2.0 <> CS 2.1, and HF <> CS 2.1 converters:
###########################################################
# Converter_Bert_CS17_CS18
class ConfigConverter_BertForSequenceClassification_HF_CS21(
ConfigConverter_BertForSequenceClassification_HF_CS18
):
"CS 2.1 config is the same as CS 2.0."
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (
FormatVersions("hf"),
FormatVersions("cs-2.1", "cs-2.2", "cs-2.3", "cs-2.4"),
)
class Converter_BertForSequenceClassification_WithoutOptionalModel_HF_CS21(
Converter_BertForSequenceClassification_HF_CS17
):
def __init__(self):
super().__init__()
self.rules = [
ConversionRule(
[
"bert\.",
Converter_BertModel_WithoutOptionalModel_HF_CS21(),
],
),
*self.rules,
]
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (
FormatVersions("hf"),
FormatVersions("cs-2.1", "cs-2.2", "cs-2.3", "cs-2.4"),
)
@staticmethod
def get_config_converter_class() -> BaseConfigConverter:
return ConfigConverter_BertForSequenceClassification_HF_CS21
Converter_BertForSequenceClassification_HF_CS21 = Build_HF_CS_Converter_WithOptionalModel(
"Converter_BertForSequenceClassification_HF_CS21",
Converter_BertForSequenceClassification_WithoutOptionalModel_HF_CS21,
derived_class=Converter_BertForSequenceClassification_WithoutOptionalModel_HF_CS21,
)
class ConfigConverter_BertForTokenClassification_HF_CS21(
ConfigConverter_BertForTokenClassification_HF_CS18
):
"CS 2.1 config is the same as CS 2.0."
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (
FormatVersions("hf"),
FormatVersions("cs-2.1", "cs-2.2", "cs-2.3", "cs-2.4"),
)
class Converter_BertForTokenClassification_WithoutOptionalModel_HF_CS21(
Converter_BertForTokenClassification_HF_CS17
):
def __init__(self):
super().__init__()
self.rules = [
ConversionRule(
[
"bert\.",
Converter_BertModel_WithoutOptionalModel_HF_CS21(),
],
),
*self.rules,
]
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (
FormatVersions("hf"),
FormatVersions("cs-2.1", "cs-2.2", "cs-2.3", "cs-2.4"),
)
@staticmethod
def get_config_converter_class() -> BaseConfigConverter:
return ConfigConverter_BertForTokenClassification_HF_CS21
Converter_BertForTokenClassification_HF_CS21 = Build_HF_CS_Converter_WithOptionalModel(
"Converter_BertForTokenClassification_HF_CS21",
Converter_BertForTokenClassification_WithoutOptionalModel_HF_CS21,
derived_class=Converter_BertForTokenClassification_WithoutOptionalModel_HF_CS21,
)
class ConfigConverter_BertForQuestionAnswering_HF_CS21(
ConfigConverter_BertForQuestionAnswering_HF_CS18
):
"CS 2.1 config is the same as CS 2.0."
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (
FormatVersions("hf"),
FormatVersions("cs-2.1", "cs-2.2", "cs-2.3", "cs-2.4"),
)
class Converter_BertForQuestionAnswering_WithoutOptionalModel_HF_CS21(
Converter_BertForQuestionAnswering_HF_CS17
):
def __init__(self):
super().__init__()
self.rules = [
ConversionRule(
[
"bert\.",
Converter_BertModel_WithoutOptionalModel_HF_CS21(),
],
),
*self.rules,
]
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (
FormatVersions("hf"),
FormatVersions("cs-2.1", "cs-2.2", "cs-2.3", "cs-2.4"),
)
@staticmethod
def get_config_converter_class() -> BaseConfigConverter:
return ConfigConverter_BertForQuestionAnswering_HF_CS21
Converter_BertForQuestionAnswering_HF_CS21 = Build_HF_CS_Converter_WithOptionalModel(
"Converter_BertForQuestionAnswering_HF_CS21",
Converter_BertForQuestionAnswering_WithoutOptionalModel_HF_CS21,
derived_class=Converter_BertForQuestionAnswering_WithoutOptionalModel_HF_CS21,
)
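

if __name__ == "__main__":
    # Hedged usage sketch (not part of the library's conversion flow): print a
    # summary of the HF <-> CS 2.1+ sequence-classification converter defined
    # above. In practice these converters are invoked through the Model Zoo's
    # checkpoint-conversion tooling; this block only touches attributes that
    # are defined in this file.
    converter_cls = (
        Converter_BertForSequenceClassification_WithoutOptionalModel_HF_CS21
    )
    print(converter_cls.converter_note())
    print(
        "Config converter:",
        converter_cls.get_config_converter_class().__name__,
    )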