cerebras.modelzoo.data.nlp.t5.T5DynamicDataProcessor.T5DynamicDataProcessorConfig#

class cerebras.modelzoo.data.nlp.t5.T5DynamicDataProcessor.T5DynamicDataProcessorConfig(*args, **kwargs)[source]#

Bases: cerebras.modelzoo.config.data_config.DataConfig

Methods

`check_for_deprecated_fields`
`check_literal_discriminator_field`
`copy`
`get_orig_class`
`get_orig_class_args`
`model_copy`
`model_post_init`
`post_init`
`validate_max_sequence_length`

Attributes

`batch_size`	Number of sequences per batch.
`buckets`	A list of boundaries for sequence lengths to bucket together in order to speed up VTS/VSL.
`discriminator`
`discriminator_value`
`do_lower`	If True, will lowercase all tokens in vocabulary.
`drop_last`	If the last batch is smaller than the batch size, i.e. the dataset size is not evenly divisible by the batch size, do not use the last batch.
`dynamic_loss_weight`	If set, will divide the loss for a token by the length of the sequence that the token comes from.
`eos_token`	Token for end-of-sequence
`extra_ids`	Number of sentinel tokens for T5 objective
`fp16_type`
`input_pad_id`	Can set specific padding for inputs
`labels_pad_id`	Can set specific padding for labels
`mixed_precision`
`model_config`
`num_documents_to_concatenate`	Specifies how many documents to pack together
`num_workers`	Number of processes that move data to the accelerator system, so that the system doesn't process data faster than it receives it.
`oov_token`	Token for out-of-vocabulary words/sub-words
`pack_sequences`	If set, will concatenate sequences so that computation is performed on real data rather than padding
`pad_token`	Token for padding
`persistent_workers`	If set, workers will not be shut down after going through the dataset once.
`prefetch_factor`	Number of batches loaded in advance by each worker.
`shuffle`	If True, the data will be shuffled before passing into the model.
`shuffle_buffer`	Size of buffer used to store data before shuffling
`shuffle_seed`	Sets random seed for the order of data shuffling.
`sos_token`	Token for start-of-sequence
`src_data_dir`	Path to directory containing the output of preprocess.sh, with all the files of tokenized data.
`src_max_sequence_length`	Largest possible sequence length for the input.
`src_vocab_file`	Path to file containing tokens of vocabulary, one token per line.
`tgt_data_dir`
`tgt_max_sequence_length`	Largest possible sequence length for the labels.
`tgt_vocab_file`
`vocab_size`
`data_processor`

src_vocab_file = Ellipsis#: Path to file containing tokens of vocabulary, one token per line.

src_data_dir = Ellipsis#: Path to directory containing the output of preprocess.sh, with all the files of tokenized data.

batch_size = Ellipsis#: Number of sequences per batch. Note that it is different between systems.

shuffle = True#: If True, the data will be shuffled before passing into the model. Recommended for training. Can be set to False for debugging.

shuffle_seed = None#: Sets random seed for the order of data shuffling. Allows for reproducibility while still shuffling data.

shuffle_buffer = None#: Size of buffer used to store data before shuffling

extra_ids = 0#: Number of sentinel tokens for T5 objective

src_max_sequence_length = Ellipsis#: Largest possible sequence length for the input. Longer sequences are truncated; all other sequences are padded to this length.

tgt_max_sequence_length = Ellipsis#: Largest possible sequence length for the labels. Longer sequences are truncated; all other sequences are padded to this length.

num_workers = 0#: Number of processes that move data to the accelerator system, so that the system doesn’t process data faster than it receives it.

drop_last = True#: If the last batch is smaller than the batch size, i.e. the dataset size is not evenly divisible by the batch size, do not use the last batch.

prefetch_factor = 10#: Number of batches loaded in advance by each worker.

persistent_workers = True#: If set, workers will not be shut down after going through the dataset once.

do_lower = False#: If True, will lowercase all tokens in vocabulary. T5’s vocabulary is cased so this is not recommended.

buckets = None#: A list of boundaries for sequence lengths to bucket together in order to speed up VTS/VSL.

dynamic_loss_weight = False#: If set, will divide the loss for a token by the length of the sequence that the token comes from.

pack_sequences = False#: If set, will concatenate sequences so that computation is performed on real data rather than padding

num_documents_to_concatenate = 128#: Specifies how many documents to pack together

oov_token = '<unk>'#: Token for out-of-vocabulary words/sub-words

sos_token = '<s>'#: Token for start-of-sequence

eos_token = '</s>'#: Token for end-of-sequence

pad_token = '<pad>'#: Token for padding

labels_pad_id = None#: Can set specific padding for labels

input_pad_id = None#: Can set specific padding for inputs

cerebras.modelzoo.data.nlp.t5.T5DynamicDataProcessor.T5DynamicDataProcessor

cerebras.modelzoo.data.nlp.t5.T5HDF5DataProcessor