# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Eval Harness run script"""
import argparse
import inspect
import logging
import os
import sys
# isort: off
sys.path.append(os.path.join(os.path.dirname(__file__), "../../.."))
# isort: on
from cerebras.modelzoo.common.utils.run.cli_parser import get_params_from_args
from cerebras.modelzoo.common.utils.run.utils import DeviceType
from cerebras.modelzoo.config_manager.config_loader import (
validate_config_params,
)


def setup_hf_env_vars(hf_cache_dir=None):
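    """Set Hugging Face cache environment variables on the appliance environment.

    Args:
        hf_cache_dir: Optional directory to use for the Transformers, HF home,
            and HF datasets caches. If None, the default HF cache locations are kept.
    """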
from cerebras.appliance.environment import appliance_environ
    # Disable tokenizers parallelism to suppress fork-related warning logs
appliance_environ["TOKENIZERS_PARALLELISM"] = "false"
if hf_cache_dir is not None:
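        # Note: recent `transformers` releases prefer HF_HOME over the deprecated
        # TRANSFORMERS_CACHE variable; both are set here for compatibility.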
appliance_environ["TRANSFORMERS_CACHE"] = hf_cache_dir
appliance_environ["HF_HOME"] = hf_cache_dir
appliance_environ["HF_DATASETS_CACHE"] = hf_cache_dir


def eeh_parser():
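    """Construct the argparse parser exposing Eleuther Eval Harness and Cerebras-specific arguments."""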
parser = argparse.ArgumentParser(
"Script for running Eleuther Eval Harness for GPT style models",
add_help=False,
)
optional_arguments = parser.add_argument_group(
"Eleuther Eval Harness Arguments"
)
# EEH-SPECIFIC ARGS
# Ref: https://github.com/EleutherAI/lm-evaluation-harness/blob/c9bbec6e7de418b9082379da82797522eb173054/lm_eval/__main__.py#L26
optional_arguments.add_argument(
"--tasks",
default=None,
help="To get full list of tasks, use the command lm-eval --tasks list",
)
optional_arguments.add_argument(
"--num_fewshot",
type=int,
default=None,
help="Number of examples in few-shot context",
)
optional_arguments.add_argument(
"--output_path",
default=None,
type=str,
metavar="= [dir/file.jsonl] [DIR]",
help="The path to the output file where the result metrics will be saved. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
)
optional_arguments.add_argument(
"--limit",
type=float,
default=None,
help="Limit the number of examples per task. "
"If <1, limit is a percentage of the total number of examples.",
)
optional_arguments.add_argument(
"--use_cache",
type=str,
default=None,
help="A path to a sqlite db file for caching model responses. `None` if not caching.",
)
optional_arguments.add_argument(
"--check_integrity",
action="store_true",
help="Whether to run the relevant part of the test suite for the tasks",
)
optional_arguments.add_argument(
"--write_out",
action="store_true",
default=False,
help="Prints the prompt for the first few documents",
)
optional_arguments.add_argument(
"--log_samples",
action="store_true",
default=False,
help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis",
)
optional_arguments.add_argument(
"--show_config",
action="store_true",
default=False,
help="If True, shows the the full config of all tasks at the end of the evaluation.",
)
optional_arguments.add_argument(
"--include_path",
type=str,
default=None,
help="Additional path to include if there are external tasks to include.",
)
# CEREBRAS-SPECIFIC ARGS
optional_arguments.add_argument(
"--hf_cache_dir",
default=None,
help=("Path to directory for caching Hugging Face downloaded data."),
)
optional_arguments.add_argument(
"--keep_data_dir",
action="store_true",
default=False,
help=(
"Specifies whether dumped data samples should be kept for reuse. "
"Defaults to False, i.e. data samples are deleted after the run."
),
)
return parser


def run_eval_harness():
"""Main run script."""
parent = inspect.getouterframes(inspect.currentframe())[1]
run_dir = os.path.dirname(os.path.abspath(parent.filename))
parser_fn = lambda: [eeh_parser()]
parser_args = {
"parser_epilog": (
"Please run 'python run_cstorch_eval_harness.py CSX -h'. \n \n"
"Here is an example command for running on CSX: \n \n"
" python run_cstorch_eval_harness.py CSX --params /path/to/params --checkpoint_path "
"/path/to/checkpoint --tasks 'hellaswag,winogrande' --num_fewshot 0 \n \n"
"Note that Eval Harness is currently only supported for device CSX"
),
"csx_parser_epilog": (
"To see a complete list of all available arguments, \n"
"please run 'python run_cstorch_eval_harness.py CSX -h'. \n\n"
"Here is an example command for running with CSX: \n \n"
" python run_cstorch_eval_harness.py CSX --params /path/to/params "
"--checkpoint_path /path/to/checkpoint --tasks 'hellaswag,winogrande' --num_fewshot 0 "
"\n \nEval Harness resides in the Cerebras Model Zoo. Please specify --python_paths and "
"\n --mount_dirs here or in your params.yaml under the 'runconfig' section with \n"
"the path to the directory in which the Cerebras Model Zoo resides. \n"
),
"modes": ["eval"],
}
# Parse args
params = get_params_from_args(
run_dir,
argv=sys.argv[1:],
extra_args_parser_fn=parser_fn,
device_type=DeviceType.CSX,
**parser_args,
)
runconfig_params = params["runconfig"]
from lm_eval.api.registry import get_model
from cerebras.modelzoo.common.eval_harness_impl import CS_LLM
from cerebras.modelzoo.common.pytorch_utils import (
RunConfigParamsValidator,
setup_artifact_dir,
setup_logging,
)
from cerebras.modelzoo.data.nlp.gpt.InferenceDataProcessor import (
RequestType,
)
# Set default model parameters
from cerebras.modelzoo.models.nlp.gpt2.utils import set_defaults
set_defaults(params)
# Validate runconfig
RunConfigParamsValidator(parser_fn).validate(runconfig_params)
# Validate input params
if params.get("eval_input") is not None:
num_pt_workers = params["eval_input"].get("num_workers")
if num_pt_workers is not None and num_pt_workers > 1:
raise ValueError(
"Eval harness does not support multiple process data "
"loading for `eval_input.num_workers` > 1, but specified "
f"{num_pt_workers} worker processes.\nPlease ensure that "
"`eval_input.num_workers` is either 0 (default) or 1."
)
else:
raise RuntimeError(
"No `eval_input` section specified in the .yaml config."
)
# Set up logging level and env vars
artifact_dir = setup_artifact_dir(runconfig_params["model_dir"], "eval")
setup_logging(
runconfig_params.get("logging"),
runconfig_params.get("streamer_logging"),
logging_dir=artifact_dir,
model_dir=runconfig_params.get("model_dir"),
)
setup_hf_env_vars(hf_cache_dir=runconfig_params.get("hf_cache_dir"))
# Debug logs
logging.debug(f"CMD: {sys.argv}")
logging.debug(f"Runconfig: {runconfig_params}")
# Construct args namespace object for EEH's main script
parser = parser_fn()[0]
args = {}
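    # NOTE: argparse argument groups share the parser's `_actions` list, so
    # iterating the first group's `_actions` visits every argument defined in
    # eeh_parser().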
for arg in parser._action_groups[0]._actions:
arg_name = arg.dest
# Exclude Cerebras-specific args
        if arg_name in {"hf_cache_dir", "keep_data_dir"}:
            continue
        args[arg_name] = runconfig_params.get(arg_name)
from cerebras.modelzoo.data.nlp.gpt.InferenceDataProcessor import (
InferenceDataProcessor,
)
from cerebras.modelzoo.models.nlp.gpt2.model import (
Gpt2Model,
GptInferenceModel,
)
def config_validation(params, model_key):
        # EEH-specific params added to the runconfig are not recognized by our
        # config class validation, so we remove them from the runconfig, validate
        # the config, and then re-add them.
        extra_parser_param_keys = []
        if parser and isinstance(parser, argparse.ArgumentParser):
            extra_parser_param_keys.extend(
                action.dest
                for action in parser._actions
                if not isinstance(action, argparse._HelpAction)
            )
run_params = params["runconfig"]
extra_parser_params = {}
for eeh_arg in extra_parser_param_keys:
if eeh_arg in run_params:
extra_parser_params[eeh_arg] = run_params.pop(eeh_arg, None)
# validate the params with config class
validate_config_params(params, model_key)
# Re-add extra EEH args to the runconfig after config validation
run_params.update(extra_parser_params)
def model_fn(request_type, params):
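        """Return the model instance appropriate for the given EEH request type."""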
if request_type == RequestType.loglikelihood:
            # TODO: params here contain start_token etc., which belong only to the
            # inference model; validating against the gpt2 model config would fail.
            # Clean up params so that start_token etc. are present only when
            # inference is used.
config_validation(params, "gpt_inference")
return Gpt2Model(params)
elif request_type == RequestType.generate_until:
config_validation(params, "gpt_inference")
return GptInferenceModel(params)
else:
raise TypeError(
f"Invalid request type: {request_type}. At present, only "
"`RequestType.loglikelihood` and `RequestType.generate_until` "
"request types are supported."
)
def eval_input_fn(params, samples_file_list, dataset_size, request_type):
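        """Create a dataloader over the dumped data samples for the given request type."""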
return InferenceDataProcessor.from_request_type(
request_type,
params["eval_input"],
samples_file_list,
dataset_size,
).create_dataloader()
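    # Create the Cerebras LM wrapper that EEH drives; it bundles the model,
    # dataloader, and data-generation callables defined above.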
lm = get_model(CS_LLM).create_from_arg_string(
arg_string="",
additional_config={
"params": params,
"model_fn": model_fn,
"input_fn": eval_input_fn,
"data_fn": InferenceDataProcessor.gen_data_samples,
"artifact_dir": artifact_dir,
},
)
# These are additional EEH args that we don't expose in our parser
additional_args = {
"model": lm,
"verbosity": "INFO", # EEH logging level
"model_args": None,
"batch_size": None,
"max_batch_size": None,
"device": None,
"decontamination_ngrams_path": None,
"gen_kwargs": None,
}
final_args = {**args, **additional_args}
args_namespace = argparse.Namespace(**final_args)
# Invoke EEH script
from lm_eval.__main__ import cli_evaluate
cli_evaluate(args=args_namespace)


if __name__ == "__main__":
run_eval_harness()