# Source code for cerebras.modelzoo.tools.checkpoint_converters.roberta

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from typing import Tuple

import torch

from cerebras.modelzoo.tools.checkpoint_converters.base_converter import (
    BaseCheckpointConverter_HF_CS,
    BaseConfigConverter,
    ConversionRule,
    EquivalentSubkey,
    FormatVersions,
)
from cerebras.modelzoo.tools.checkpoint_converters.bert import (
    ConfigConverter_Bert_HF_CS18,
    Converter_BertLayerNorm_HF_CS,
    Converter_BertModel_CS16_CS17,
    Converter_BertModel_WithoutOptionalModel_HF_CS21,
    Converter_BertPretrainModel_HF_CS18,
)
from cerebras.modelzoo.tools.checkpoint_converters.helper import (
    Build_HF_CS_Converter_WithOptionalModel,
)


class Converter_RobertaPretrainModel_HF_CS(BaseCheckpointConverter_HF_CS):
    """Map RoBERTa pretraining checkpoint keys between HF and CS naming.

    The first entry of every ``EquivalentSubkey`` below is the HF-side name
    (``roberta.`` / ``lm_head.``) and the second is the CS-side name
    (``bert_encoder.`` / ``bert_mlm_head.``). Encoder keys are delegated to
    ``Converter_BertModel_CS16_CS17``; the MLM head keys are handled by the
    rules registered in ``__init__``.
    """

    def __init__(self):
        """Register the conversion rules for the encoder and the MLM head."""
        super().__init__()

        # Encoder: swap the prefix and delegate the remainder of the key.
        encoder_rule = ConversionRule(
            [
                EquivalentSubkey("roberta.", "bert_encoder."),
                Converter_BertModel_CS16_CS17(),  # original note: CS16 = HF
            ],
        )

        # MLM head, dense transform (weight and bias).
        mlm_dense_rule = ConversionRule(
            [
                EquivalentSubkey(
                    "lm_head.dense",
                    "bert_mlm_head.mlm_transform.ffn.ffn.0.linear_layer",
                ),
                r"\.(?:weight|bias)",
            ],
            action=self.replaceKey,
        )

        # MLM head, layer norm: delegated to the BERT layer-norm converter.
        mlm_layer_norm_rule = ConversionRule(
            [
                EquivalentSubkey(
                    "lm_head.",
                    "bert_mlm_head.mlm_transform.",
                ),
                Converter_BertLayerNorm_HF_CS("layer_norm", "ln"),
            ],
            action=None,
        )

        # MLM head, decoder weight: plain rename.
        decoder_weight_rule = ConversionRule(
            [
                EquivalentSubkey(
                    "lm_head.decoder",
                    "bert_mlm_head.classifier.ffn.0.linear_layer",
                ),
                r"\.weight",
            ],
            action=self.replaceKey,
        )

        # MLM head, decoder bias: rename plus the extra HF copy handled in
        # convert_cls_predictions_bias.
        decoder_bias_rule = ConversionRule(
            [
                EquivalentSubkey(
                    "lm_head.decoder",
                    "bert_mlm_head.classifier.ffn.0.linear_layer",
                ),
                r"\.bias",
            ],
            action=self.convert_cls_predictions_bias,
        )

        # ``lm_head.bias`` is declared with exists="left" -- presumably
        # meaning it only exists in the first (HF) format; confirm against
        # ConversionRule.
        hf_only_bias_rule = ConversionRule([r"lm_head\.bias"], exists="left")

        self.rules = [
            encoder_rule,
            # CLS:
            mlm_dense_rule,
            mlm_layer_norm_rule,
            decoder_weight_rule,
            decoder_bias_rule,
            hf_only_bias_rule,
        ]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS) format pair handled by this converter."""
        hf_versions = FormatVersions("hf")
        cs_versions = FormatVersions("cs")
        return (hf_versions, cs_versions)

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return ``None``: this converter has no config converter attached."""
        return None

    def convert_cls_predictions_bias(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Copy the decoder bias, duplicating it when the source index is 1.

        The bias is always written under ``new_key`` via ``replaceKey``. When
        ``from_index == 1`` it is written a second time under ``new_key``
        with the ``.decoder.`` segment collapsed to ``.``.
        """

        def write_bias_as(target_key):
            self.replaceKey(
                old_key,
                target_key,
                old_state_dict,
                new_state_dict,
                from_index,
                action_fn_args,
            )

        write_bias_as(new_key)

        if from_index != 1:
            return

        # HF stores an extra copy of the decoder bias in the predictions
        # object itself
        write_bias_as(re.sub(r"\.decoder\.", ".", new_key))
class Converter_RobertaPretrainModel_HF_CS18(
    Converter_BertPretrainModel_HF_CS18
):
    """RoBERTa pretraining converter built on the BERT HF <-> CS 1.8 one.

    Replaces the inherited rules with two that delegate to
    ``Converter_RobertaPretrainModel_HF_CS``: one for keys as-is and one for
    keys whose second-format name carries a ``model.`` prefix.
    """

    def __init__(self):
        """Install the two delegating rules (with and without ``model.``)."""
        super().__init__()

        # Catch checkpoints from Pytorch 2.0 API
        unprefixed_rule = ConversionRule(
            [
                Converter_RobertaPretrainModel_HF_CS(),
            ],
            action=None,
        )

        # Catch checkpoints from 1.7/1.8
        model_prefixed_rule = ConversionRule(
            [
                EquivalentSubkey("", "model."),
                Converter_RobertaPretrainModel_HF_CS(),
            ],
            action=None,
        )

        self.rules = [unprefixed_rule, model_prefixed_rule]

    @classmethod
    def converter_note(cls) -> str:
        """Return a one-line description naming both supported formats."""
        first_format = cls.formats()[0]
        second_format = cls.formats()[1]
        return "{} <-> {} for RobertaForPreTraining".format(
            first_format, second_format
        )

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter class paired with this converter."""
        return ConfigConverter_Roberta_HF_CS18

    def post_model_convert(
        self,
        old_state_dict,
        new_state_dict,
        configs,
        converter_indices,
        drop_unmatched_keys,
        key_prefix="",
    ):
        """Add zeroed token-type embeddings when needed, then defer to parent.

        When ``converter_indices.direction == 1`` and the ``num_segments``
        entry of ``configs[1]["model"]`` is falsy, a zero tensor of shape
        ``(type_vocab_size, hidden_size)`` (both read from ``configs[0]``)
        is stored under ``roberta.embeddings.token_type_embeddings.weight``.
        The parent implementation is always invoked afterwards.
        """
        producing_first_format = converter_indices.direction == 1
        if (
            producing_first_format
            and not configs[1]["model"]["num_segments"]
        ):
            target_config = configs[0]
            embedding_key = (
                key_prefix + "roberta.embeddings.token_type_embeddings.weight"
            )
            new_state_dict[embedding_key] = torch.zeros(
                target_config["type_vocab_size"],
                target_config["hidden_size"],
            )

        super().post_model_convert(
            old_state_dict,
            new_state_dict,
            configs,
            converter_indices,
            drop_unmatched_keys,
            key_prefix=key_prefix,
        )
class ConfigConverter_Roberta_HF_CS18(ConfigConverter_Bert_HF_CS18):
    """Config converter for RoBERTa, layered on the BERT HF <-> CS 1.8 one.

    RoBERTa-specific rules are placed ahead of the inherited BERT rules, and
    the pre/post conversion defaults for both formats are extended.
    """

    def __init__(self):
        """Prepend the RoBERTa rules and extend the conversion defaults."""
        super().__init__()

        # Override Bert's config converter with the following:
        roberta_rules = [
            ConversionRule(
                ["model_type"],
                action=BaseConfigConverter.assert_factory_fn(0, "roberta"),
            ),
            ConversionRule(
                ["max_position_embeddings"],
                action=self.convert_max_pos_embed,
            ),
            ConversionRule(
                [EquivalentSubkey("type_vocab_size", "num_segments")],
                action=self.convert_num_segments,
            ),
            ConversionRule(
                ["pad_token_id"],
                action=self.replaceKey,
            ),
            ConversionRule(
                ["mask_padding_in_positional_embed"],
                action=BaseConfigConverter.assert_factory_fn(1, True),
            ),
            ConversionRule(
                ["disable_nsp"],
                action=BaseConfigConverter.assert_factory_fn(1, True),
            ),
            ConversionRule(
                ["mlm_nonlinearity"],
                action=BaseConfigConverter.assert_factory_fn(1, "gelu"),
            ),
        ]
        self.rules = [*roberta_rules, *self.rules]

        # Defaults applied before conversion, per format index.
        first_pre_defaults = {
            "vocab_size": 50265,
            "position_embedding_type": "absolute",
            "type_vocab_size": 2,
            "pad_token_id": 1,
        }
        self.pre_convert_defaults[0].update(first_pre_defaults)

        second_pre_defaults = {
            "disable_nsp": False,
            "pad_token_id": 0,
            "mask_padding_in_positional_embed": False,
        }
        self.pre_convert_defaults[1].update(second_pre_defaults)

        # Defaults applied after conversion, per format index.
        self.post_convert_defaults[0].update({"model_type": "roberta"})

        second_post_defaults = {
            "disable_nsp": True,
            "mask_padding_in_positional_embed": True,
        }
        self.post_convert_defaults[1].update(second_post_defaults)

    def convert_num_segments(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Copy the segment count, mapping 0 to 1 when the source index is 1."""
        # CS allows segment embeddings to be disabled while HF doesn't
        # When it is disabled in CS, we need to enable it in HF and set the
        # embedding weight to zero
        segment_count = old_state_dict[old_key]
        disabled_in_source = from_index == 1 and segment_count == 0
        new_state_dict[new_key] = 1 if disabled_in_source else segment_count

    def convert_max_pos_embed(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Shift ``max_position_embeddings`` by ``pad_token_id + 1``.

        The offset is subtracted when ``from_index == 0`` and added
        otherwise. ``pad_token_id`` is read from ``old_state_dict``.
        """
        # The number of positional embeddings = MSL + pad token offset + 1
        # HF refers to number of positional embeddings (the total) as
        # max_position_embeddings while we refer to MSL as
        # max_position_embeddings
        source_value = old_state_dict[old_key]
        pad_token_id = old_state_dict["pad_token_id"]

        if from_index == 0:
            new_state_dict[new_key] = source_value - pad_token_id - 1
            return

        new_state_dict[new_key] = source_value + pad_token_id + 1

    def pre_config_convert(
        self,
        config,
        converter_indices,
    ):
        """Run the parent hook, then fill in a missing ``num_segments``.

        Only when ``converter_indices.direction == 1`` and the key is absent:
        it becomes 0 if ``disable_nsp`` is truthy, otherwise 2.
        """
        config = super().pre_config_convert(config, converter_indices)

        needs_default = (
            converter_indices.direction == 1 and "num_segments" not in config
        )
        if needs_default:
            config["num_segments"] = 0 if config["disable_nsp"] else 2

        return config

    def post_config_convert(
        self,
        original_config,
        old_config,
        new_config,
        converter_indices,
        drop_unmatched_keys,
    ):
        """Force ``mlm_nonlinearity`` to ``"gelu"`` for direction 0, then defer.

        Returns whatever the parent ``post_config_convert`` returns.
        """
        producing_second_format = converter_indices.direction == 0
        if producing_second_format:
            new_config["mlm_nonlinearity"] = "gelu"

        return super().post_config_convert(
            original_config,
            old_config,
            new_config,
            converter_indices,
            drop_unmatched_keys,
        )
###########################################################
# In CS 2.1, we refactored the embedding layer.
# CS 2.0 <> CS 2.1, and HF <> CS 2.1 converters:
###########################################################
class ConfigConverter_Roberta_HF_CS21(ConfigConverter_Roberta_HF_CS18):
    """CS 2.1 config is the same as CS 2.0.

    Only the advertised format versions differ from the parent class.
    """

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 2.1/2.2) format pair for this converter."""
        hf_versions = FormatVersions("hf")
        cs_versions = FormatVersions("cs-2.1", "cs-2.2")
        return (hf_versions, cs_versions)
class Converter_RobertaPretrainModel_WithoutOptionalModel_HF_CS21(
    Converter_RobertaPretrainModel_HF_CS
):
    """HF <-> CS 2.1/2.2 variant of the RoBERTa pretraining key converter.

    Reuses every rule of the parent class and places one extra encoder rule
    in front of them that delegates to
    ``Converter_BertModel_WithoutOptionalModel_HF_CS21``.
    """

    def __init__(self):
        """Prepend the CS 2.1 encoder rule to the inherited rule list."""
        super().__init__()

        # Listed ahead of the inherited ``roberta.`` rule -- presumably so it
        # is matched first; confirm against the rule-matching order of the
        # base converter.
        cs21_encoder_rule = ConversionRule(
            [
                EquivalentSubkey("roberta.", "bert_encoder."),
                Converter_BertModel_WithoutOptionalModel_HF_CS21(),
            ],
        )
        self.rules = [cs21_encoder_rule, *self.rules]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 2.1/2.2) format pair for this converter."""
        hf_versions = FormatVersions("hf")
        cs_versions = FormatVersions("cs-2.1", "cs-2.2")
        return (hf_versions, cs_versions)

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter class paired with this converter."""
        return ConfigConverter_Roberta_HF_CS21
# Public HF <-> CS 2.1/2.2 RoBERTa pretraining converter, produced by
# Build_HF_CS_Converter_WithOptionalModel from the "without optional model"
# key converter, with Converter_RobertaPretrainModel_HF_CS18 passed as the
# derived class and ConfigConverter_Roberta_HF_CS21 as the config converter.
# NOTE(review): the builder is defined in helper.py and not visible here;
# presumably it wraps the rules to also accept a "model."-prefixed layout --
# confirm against its implementation.
Converter_RobertaPretrainModel_HF_CS21 = (
    Build_HF_CS_Converter_WithOptionalModel(
        "Converter_RobertaPretrainModel_HF_CS21",
        Converter_RobertaPretrainModel_WithoutOptionalModel_HF_CS21,
        derived_class=Converter_RobertaPretrainModel_HF_CS18,
        config_converter_class=ConfigConverter_Roberta_HF_CS21,
        formats=(FormatVersions("hf"), FormatVersions("cs-2.1", "cs-2.2")),
    )
)