# Source code for cerebras.modelzoo.cli.data_preprocess_cli

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Cerebras ModelZoo Config Management CLI Tool"""

import argparse
import os
import shutil
from pathlib import Path

from cerebras.modelzoo.cli.utils import MZ_CLI_NAME


class DataPreprocessCLI:
    """Command-line interface for ModelZoo data preprocessing.

    Exposes three subcommands:

    * ``list`` - print the names of the available preprocessing config
      variants.
    * ``pull`` - copy one config variant into a local directory.
    * ``run``  - run data preprocessing using the arguments registered by
      ``add_preprocess_args``.

    Instantiating the class builds the parser, parses the command-line
    arguments and immediately dispatches to the selected subcommand.
    """

    def __init__(self):
        # RawDescriptionHelpFormatter preserves the line breaks and the
        # indentation of the example commands contained in the epilog.
        parser = argparse.ArgumentParser(
            epilog=self.epilog(),
            formatter_class=argparse.RawDescriptionHelpFormatter,
        )
        self.configure_parser(parser)
        args = parser.parse_args()
        # Every subparser registers its handler under ``func``.
        args.func(args)

    @staticmethod
    def epilog():
        """Return the help epilog: a short usage guide with example commands."""
        return (
            f"Use `{MZ_CLI_NAME} data_preprocess -h` to learn how to configure and run data preprocessing. "
            f"See below for some basic examples.\n\n"
            f"List all data preprocessing config variants:\n"
            f"  $ {MZ_CLI_NAME} data_preprocess list\n\n"
            f"Copy a data configuration file to a specified directory:\n"
            f"  $ {MZ_CLI_NAME} data_preprocess pull summarization_preprocessing -o workdir\n\n"
            f"Run data preprocessing using given configuration:\n"
            f"  $ {MZ_CLI_NAME} data_preprocess run --config workdir/summarization_preprocessing.yaml\n\n"
            f"For more information on data preprocessing, see: "
            f"https://docs.cerebras.net/en/latest/wsc/Model-zoo/Components/Data-preprocessing/data_preprocessing.html"
        )

    @staticmethod
    def configure_parser(parser):
        """Register the ``list``, ``pull`` and ``run`` subcommands on ``parser``.

        Each subparser stores its handler in the ``func`` default so that the
        caller can dispatch with ``args.func(args)`` after parsing.

        Args:
            parser: The ``argparse.ArgumentParser`` to add the subcommands to.
        """
        subparsers = parser.add_subparsers(dest="cmd", required=True)

        list_parser = subparsers.add_parser(
            "list", help="List all data config variants."
        )
        list_parser.set_defaults(func=DataPreprocessCLI._config_list)

        pull_parser = subparsers.add_parser(
            "pull",
            help="Saves a data config file with a given variant name to the local workspace.",
        )
        pull_parser.add_argument(
            "variant",
            help="Config variant name to load.",
        )
        pull_parser.add_argument(
            "-o",
            "--outdir",
            help="Directory to save config to. If not specified, saves to cwd.",
        )
        pull_parser.set_defaults(func=DataPreprocessCLI._config_pull)

        run_parser = subparsers.add_parser(
            "run", help="Runs data preprocessing."
        )
        run_parser.set_defaults(func=DataPreprocessCLI._preprocess)
        # Imported here rather than at module level; presumably to keep
        # importing this CLI module cheap - confirm before hoisting.
        from cerebras.modelzoo.data_preparation.data_preprocessing.utils import (
            add_preprocess_args,
        )

        add_preprocess_args(run_parser)

    @staticmethod
    def _config_list(args):
        """Handler for ``list``: print the table of available config variants.

        Args:
            args: Parsed arguments (unused).
        """
        print(DataPreprocessCLI._list_configs())

    @staticmethod
    def _config_pull(args):
        """Handler for ``pull``: copy a config variant into a local directory.

        Args:
            args: Parsed arguments providing ``variant`` (the config name
                without the ``.yaml`` suffix) and ``outdir`` (the destination
                directory; the current working directory is used when it is
                not set).

        Raises:
            ValueError: If no ``<variant>.yaml`` file exists in the config
                directory.
        """
        config_path = DataPreprocessCLI._get_config_path()
        variant_path = config_path / (args.variant + ".yaml")

        if not variant_path.exists():
            raise ValueError(
                f"Variant {args.variant}  not found. Please specify a valid variant from:\n"
                f"{DataPreprocessCLI._list_configs()}"
            )

        outdir = Path(args.outdir or os.getcwd())

        print(f"Saving config {args.variant} to {outdir}/{args.variant}.yaml")

        # Create the destination (and any missing parents) before copying.
        outdir.mkdir(parents=True, exist_ok=True)

        shutil.copy(variant_path, outdir)

    @staticmethod
    def _variant_names(config_dir):
        """Return the sorted variant names found directly in ``config_dir``.

        A variant name is the stem of a ``*.yaml`` file. The result is sorted
        because the order in which ``glob`` yields entries depends on the
        filesystem, which would make the listing unstable between runs.

        Args:
            config_dir: Directory to scan (not recursive).

        Returns:
            A sorted list of variant name strings.
        """
        return sorted(path.stem for path in Path(config_dir).glob("*.yaml"))

    @staticmethod
    def _list_configs():
        """Return a formatted one-column table of the available config variants.

        Returns:
            The table rendered by ``tabulate`` as a string, with the variant
            names in sorted order.
        """
        from tabulate import tabulate

        names = DataPreprocessCLI._variant_names(
            DataPreprocessCLI._get_config_path()
        )
        table = [[name] for name in names]
        headers = ["Available data preprocessing configurations"]
        return tabulate(table, headers=headers, tablefmt="fancy_grid")

    @staticmethod
    def _get_config_path():
        """Return the ``configs`` directory located inside the
        ``data_preprocessing`` package."""
        import cerebras.modelzoo.data_preparation.data_preprocessing as data_preprocessing

        return Path(data_preprocessing.__file__).parent / "configs"

    @staticmethod
    def _preprocess(args):
        """Handler for ``run``: convert the parsed arguments to params and run
        data preprocessing.

        Args:
            args: Parsed arguments of the ``run`` subcommand.
        """
        from cerebras.modelzoo.data_preparation.data_preprocessing.preprocess_data import (
            preprocess_data,
        )
        from cerebras.modelzoo.data_preparation.data_preprocessing.utils import (
            args_to_params,
        )

        params = args_to_params(args)
        preprocess_data(params)


# Script entry point: constructing DataPreprocessCLI builds the parser, parses
# the command-line arguments, and dispatches to the chosen subcommand.
if __name__ == '__main__':
    DataPreprocessCLI()