# Source code for cerebras.modelzoo.data_preparation.nlp.transformer.create_meta

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Create meta file for transformer in pytorch.
Stores meta file in source directory (`src_dir`).
"""

import argparse
import os
from subprocess import run


# Name of the meta file written into `src_dir`; it must never be treated as a
# data file when the script is run again on the same directory.
_META_FILE_NAME = "meta.dat"


def _count_newlines(path, chunk_size=1 << 20):
    """Return the number of newline bytes in the file at ``path``.

    This is the same number `wc -l` reports, so a final line that is not
    terminated by a newline is not counted.
    """
    total = 0
    with open(path, "rb") as fin:
        # Read fixed-size chunks so large dataset files are never held in
        # memory at once.
        for chunk in iter(lambda: fin.read(chunk_size), b""):
            total += chunk.count(b"\n")
    return total


def _target_file_name(src_file_name):
    """Map a source file name to its target name by swapping "en" for "de".

    Only the first occurrence of "en" is replaced and the rest of the name is
    kept intact.

    Raises:
        ValueError: if ``src_file_name`` does not contain "en".
    """
    if "en" not in src_file_name:
        raise ValueError(
            f"Cannot derive target file name: 'en' not found in "
            f"source file name '{src_file_name}'."
        )
    return src_file_name.replace("en", "de", 1)


def main():
    """Write `meta.dat` into `--src_dir`, describing every data file in it.

    Command-line arguments:
        --src_dir: directory with the source language files.
        --tgt_dir: directory with the target language files.

    Each line of `meta.dat` has the form
    `<src_dir>/<src_file> <tgt_dir>/<tgt_file> <num_examples>`, where
    `<tgt_file>` is `<src_file>` with its first "en" replaced by "de" and
    `<num_examples>` is the newline count of the source file. Lines are
    separated by a newline, with no trailing newline after the last one.
    The total number of examples is printed to stdout.

    Entries of `src_dir` that are not regular files, and `meta.dat` itself,
    are skipped.

    Raises:
        ValueError: if a source file name does not contain "en". This is
            detected before `meta.dat` is opened, so an existing meta file is
            not truncated in that case.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--src_dir",
        type=str,
        required=True,
        help="Path to the original source language dataset.",
    )
    parser.add_argument(
        "--tgt_dir",
        type=str,
        required=True,
        help="Path to the translated target language dataset.",
    )
    args = parser.parse_args()

    result = []
    for file_name in sorted(os.listdir(args.src_dir)):
        src_path = f"{args.src_dir}/{file_name}"
        # A previous run leaves `meta.dat` in `src_dir`; it is output, not
        # data. Directories are not data files either.
        if file_name == _META_FILE_NAME or not os.path.isfile(src_path):
            continue
        tgt_path = f"{args.tgt_dir}/{_target_file_name(file_name)}"
        result.append((src_path, tgt_path, _count_newlines(src_path)))

    lines = [
        f"{src_path} {tgt_path} {num_examples}"
        for src_path, tgt_path, num_examples in result
    ]
    with open(f"{args.src_dir}/{_META_FILE_NAME}", "w") as fout:
        fout.write("\n".join(lines))

    total_num_examples = sum(num_examples for _, _, num_examples in result)
    print(f"Number of examples: {total_num_examples}.")
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()