# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Eval Harness run script."""
import argparse
import logging
import sys
from copy import deepcopy
from warnings import warn
# isort: off
import os
sys.path.append(os.path.join(os.path.dirname(__file__), "../../.."))
# isort: on
from cerebras.modelzoo.common.utils.run.cli_parser import get_params_from_args
from cerebras.modelzoo.common.utils.run.utils import DeviceType
from cerebras.modelzoo.trainer.extensions.eleuther.eval_harness_utils import (
SUPPORTED_MODELS,
)
from cerebras.modelzoo.trainer.utils import (
configure_trainer_from_config,
convert_legacy_params_to_trainer_params,
inject_cli_args_to_trainer_params,
is_legacy_params,
)
from cerebras.modelzoo.trainer.validate import validate_trainer_params
def eeh_parser():
parser = argparse.ArgumentParser(
"Script for running Eleuther Eval Harness for GPT style models",
add_help=False,
)
optional_arguments = parser.add_argument_group(
"Eleuther Eval Harness Arguments"
)
# EEH-SPECIFIC ARGS
# Ref: https://github.com/EleutherAI/lm-evaluation-harness/blob/c9bbec6e7de418b9082379da82797522eb173054/lm_eval/__main__.py#L26
optional_arguments.add_argument(
"--tasks",
"-t",
default=None,
type=str,
metavar="task1,task2",
help="To get full list of tasks, use the command lm-eval --tasks list",
)
optional_arguments.add_argument(
"--num_fewshot",
"-f",
type=int,
default=None,
metavar="N",
help="Number of examples in few-shot context",
)
optional_arguments.add_argument(
"--output_path",
default=None,
type=str,
metavar="DIR|DIR/file.json",
help="The path to the output file where the result metrics will be saved. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
)
optional_arguments.add_argument(
"--limit",
"-L",
type=float,
default=None,
metavar="N|0<N<1",
help="Limit the number of examples per task. "
"If <1, limit is a percentage of the total number of examples.",
)
optional_arguments.add_argument(
"--use_cache",
type=str,
default=None,
metavar="DIR",
help="A path to a sqlite db file for caching model responses. `None` if not caching.",
)
optional_arguments.add_argument(
"--cache_requests",
type=str,
default=None,
choices=["true", "refresh", "delete"],
help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.",
)
optional_arguments.add_argument(
"--check_integrity",
action="store_true",
default=None,
help="Whether to run the relevant part of the test suite for the tasks.",
)
optional_arguments.add_argument(
"--write_out",
"-w",
action="store_true",
default=None,
help="Prints the prompt for the first few documents.",
)
optional_arguments.add_argument(
"--log_samples",
"-s",
action="store_true",
default=None,
help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
)
    optional_arguments.add_argument(
"--system_instruction",
type=str,
default=None,
help="System instruction to be used in the prompt",
)
    optional_arguments.add_argument(
"--apply_chat_template",
action="store_true",
default=False,
help="If True, applies the chat template to the prompt",
)
    optional_arguments.add_argument(
"--fewshot_as_multiturn",
action="store_true",
default=False,
help="If True, uses the fewshot as a multi-turn conversation",
)
optional_arguments.add_argument(
"--show_config",
action="store_true",
default=None,
help="If True, shows the the full config of all tasks at the end of the evaluation.",
)
optional_arguments.add_argument(
"--include_path",
type=str,
default=None,
metavar="DIR",
help="Additional path to include if there are external tasks to include.",
)
optional_arguments.add_argument(
"--predict_only",
"-x",
action="store_true",
default=None,
help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
)
optional_arguments.add_argument(
"--seed",
default=None,
help=(
"Set seed for python's random, numpy, torch, and fewshot sampling.\n"
"Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, "
"respectively, or a single integer to set the same seed for all four.\n"
f"The values are either an integer or 'None' to not set the seed. Default is `0,1234,1234,1234` "
"(for backward compatibility).\n"
"E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and fewshot sampling seed to 52. "
"Here numpy's seed is not set since the second value is `None`.\n"
"E.g, `--seed 42` sets all four seeds to 42."
),
)
optional_arguments.add_argument(
"--trust_remote_code",
default=None,
action="store_true",
help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
)
# NONGREEDY SAMPLING ARGS FOR GENERATIVE TASKS
optional_arguments.add_argument(
"--temperature",
type=float,
default=None,
help="Sampling temperature used for generation.",
)
optional_arguments.add_argument(
"--top_p",
type=float,
default=None,
help="Top-p parameter used for nucleus sampling.",
)
optional_arguments.add_argument(
"--top_k",
type=int,
default=None,
help="Top-k parameter used for generation.",
)
# CEREBRAS-SPECIFIC ARGS
optional_arguments.add_argument(
"--keep_data_dir",
action="store_true",
default=False,
help=(
"Specifies whether dumped data samples should be kept for reuse. "
"Defaults to False, i.e. data samples are deleted after the run."
),
)
return parser
def run_eval_harness():
"""Main run script."""
parser_fn = lambda: [eeh_parser()]
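    # parser_fn supplies the EEH-specific arguments defined above as an extra
    # parser to get_params_from_args below, alongside the standard run CLI
    # (e.g. --params, --checkpoint_path).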
parser_args = {
"parser_epilog": (
"Please run 'python run_eleuther_eval_harness.py CSX -h'. \n \n"
"Here is an example command for running on CSX: \n \n"
" python run_eleuther_eval_harness.py CSX --params /path/to/params --checkpoint_path "
"/path/to/checkpoint --tasks 'hellaswag,winogrande' --num_fewshot 0 \n \n"
"Note that Eval Harness is currently only supported for device CSX"
),
"csx_parser_epilog": (
"To see a complete list of all available arguments, \n"
"please run 'python run_eleuther_eval_harness.py CSX -h'. \n\n"
"Here is an example command for running with CSX: \n \n"
" python run_eleuther_eval_harness.py CSX --params /path/to/params "
"--checkpoint_path /path/to/checkpoint --tasks 'hellaswag,winogrande' --num_fewshot 0 "
"\n \nEval Harness resides in the Cerebras Model Zoo. Please specify --python_paths and "
"\n --mount_dirs here or in your params.yaml under the 'runconfig' section with \n"
"the path to the directory in which the Cerebras Model Zoo resides. \n"
),
"modes": ["eval"],
}
# Parse args
params = get_params_from_args(
argv=sys.argv[1:],
extra_args_parser_fn=parser_fn,
device_type=DeviceType.CSX,
**parser_args,
)
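    # The EEH CLI flags are surfaced under the "runconfig" section of the
    # parsed params; they are split out below.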
runconfig_params = params["runconfig"]
parser = parser_fn()[0]
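    # Partition the EEH arguments: Cerebras-specific flags (keep_data_dir) go
    # to other_eeh_args, while harness flags explicitly set on the command
    # line go to eeh_args.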
eeh_args = {}
other_eeh_args = {}
for arg in parser._action_groups[0]._actions:
arg_name = arg.dest
# Exclude Cerebras-specific args
if arg_name in {"keep_data_dir"}:
other_eeh_args[arg_name] = runconfig_params.pop(
arg_name, arg.default
)
elif arg_name in runconfig_params:
arg_val = runconfig_params.pop(arg_name, None)
if arg_val is not None: # Only consider specified CLI args
eeh_args[arg_name] = arg_val
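    # Legacy runconfig-style params are converted to the trainer schema and an
    # EleutherEvalHarness callback is built from the converted dataloader args.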
if is_legacy_params(params):
        warn(
            "Detected that legacy params are being used. "
            "Automatically converting params to the new format."
        )
params = convert_legacy_params_to_trainer_params(
params,
# Allow None objects inside the params
obj_filter=lambda obj: obj is None,
)
# Convert ScopedValidateFlags to ScopedEleutherEvalHarnessFlags
for callback in params["trainer"]["init"].get("callbacks", []):
if "ScopedValidateFlags" in callback:
callback["ScopedEleutherEvalHarnessFlags"] = callback.pop(
"ScopedValidateFlags"
)
# Add EleutherEvalHarness callback to the list of callbacks
dataloader_args = (
params["trainer"].get("validate_all", {}).pop("val_dataloaders", {})
)
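        # Force the InferenceDataProcessor for the harness dataloader.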
dataloader_args["data_processor"] = "InferenceDataProcessor"
params["trainer"]["init"]["callbacks"].append(
{
"EleutherEvalHarness": {
"eeh_args": eeh_args,
**deepcopy(other_eeh_args),
**deepcopy(dataloader_args),
}
}
)
# Remove fit/validate keys that are not used in the standalone flow
for key in ("fit", "validate"):
params["trainer"].pop(key, None)
elif "runconfig" in params:
params = inject_cli_args_to_trainer_params(
params.pop("runconfig"), params
)
if "trainer" not in params:
raise KeyError(
"Trainer configuration not found in params. "
"Please ensure that the params contain a 'trainer' key."
)
if isinstance(params["trainer"], (list, tuple)):
raise ValueError(
"Standalone Eleuther evaluation harness script only supports "
"a single trainer instance, but found a list of trainers."
)
# Extract EleutherEvalHarness callbacks from the list of callbacks
eeh_callbacks = [
callback["EleutherEvalHarness"]
for callback in params["trainer"]["init"].get("callbacks", [])
if "EleutherEvalHarness" in callback
]
if not eeh_callbacks:
raise RuntimeError(f"Found no EleutherEvalHarness callback")
for eeh_callback in eeh_callbacks:
eeh_callback.setdefault("eeh_args", {}).update(
(key, value) for key, value in deepcopy(eeh_args).items()
)
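    # Validate the assembled params and select the single trainer config.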
config = validate_trainer_params(params)[0]
if config.init.model.config.name not in SUPPORTED_MODELS:
raise ValueError(
f"Eleuther evaluation is not supported for model {config.init.model.config.name}. "
f"Please choose a valid model name from: {SUPPORTED_MODELS}"
)
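    # Build the trainer in eval_all mode; the harness itself runs via
    # trainer.validate_all() below.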
trainer = configure_trainer_from_config(config, mode="eval_all")
kwargs = {}
if config.validate_all:
if config.validate_all.val_dataloaders:
            logging.warning(
                "Found `validate_all.val_dataloaders` specified in the yaml, "
                "but no upstream validation will be run for the standalone "
                "Eleuther Eval Harness script."
            )
kwargs = {"ckpt_paths": config.validate_all.ckpt_paths}
trainer.validate_all(**kwargs)
if __name__ == "__main__":
run_eval_harness()