Source code for cerebras.modelzoo.trainer.extensions.eleuther.eval_harness_utils

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Defines utils for running Eval Harness on CSX."""

import glob
import json
import os
import re
import sys
from collections import defaultdict
from copy import deepcopy
from dataclasses import dataclass
from functools import cached_property
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Union
from warnings import warn

import numpy as np
from lm_eval import evaluator, utils
from lm_eval.__main__ import _int_or_none_list_arg_type
from lm_eval.api.model import LM
from lm_eval.api.task import Task
from lm_eval.tasks import TaskManager, get_task_dict
from lm_eval.utils import handle_non_serializable, make_table

from cerebras.appliance.log import ClassLogger, named_class_logger

DEFAULT_RESULTS_FILE = "results.json"

SUPPORTED_MODELS = {
    "btlm",
    "bloom",
    "gpt2",
    "gptj",
    "falcon",
    "gpt3",
    "gpt-neox",
    "llama",
    "mistral",
    "mixtral",
    "mpt",
    "jais",
    "santacoder",
    "starcoder",
}


@dataclass
class EleutherCLIArgs:
    """Captures EEH's CLI arguments with defaults.

    Fields:
        tasks: List of tasks to evaluate. To get the full list of tasks,
            use the command ``lm-eval --tasks list``.
        num_fewshot: Number of examples in the few-shot context.
        output_path: The path to the output file where the result metrics
            will be saved. If the path is a directory and log_samples is
            true, the results will be saved in the directory. Else the
            parent directory will be used.
        limit: Limit the number of examples per task. If <1, limit is a
            percentage of the total number of examples.
        use_cache: A path to a sqlite db file for caching model responses.
            ``None`` if not caching.
        cache_requests: Speed up evaluation by caching the building of
            dataset requests. ``None`` if not caching.
        check_integrity: Whether to run the relevant part of the test suite
            for the tasks.
        write_out: Prints the prompt for the first few documents.
        log_samples: If True, write out all model outputs and documents for
            per-sample measurement and post-hoc analysis. Use with
            --output_path.
        system_instruction: System instruction to be used in the prompt.
        apply_chat_template: If True, applies the chat template to the prompt.
        fewshot_as_multiturn: If True, uses the fewshot examples as a
            multi-turn conversation.
        show_config: If True, shows the full config of all tasks at the end
            of the evaluation.
        include_path: Additional path to include if there are external tasks
            to include.
        predict_only: Use with --log_samples. Only model outputs will be
            saved and metrics will not be evaluated.
        seed: Set seed for python's random, numpy, torch, and fewshot
            sampling. Accepts a comma-separated list of 3 to 4 values for
            python's random, numpy, torch, and fewshot sampling seeds,
            respectively, or a single integer to set the same seed for all
            of them. The values are either an integer or ``None`` to not set
            the seed. Default is ``0,1234,1234,1234``.
            E.g. ``--seed 0,None,8`` sets ``random.seed(0)`` and
            ``torch.manual_seed(8)``. Here numpy's seed is not set since the
            second value is ``None``.
            E.g. ``--seed 42`` sets all seeds to 42.
        trust_remote_code: Sets trust_remote_code to True to execute code to
            create HF Datasets from the Hub.
        verbosity: EEH logging level.
        max_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature used for generation.
        top_k: Top-k parameter used for generation.
        top_p: Top-p parameter used for nucleus sampling.
    """

    tasks: Union[str, List[str]]
    num_fewshot: Optional[int] = None
    output_path: Optional[str] = None
    limit: Optional[float] = None
    use_cache: Optional[str] = None
    cache_requests: Optional[Literal["true", "refresh", "delete"]] = None
    check_integrity: bool = False
    write_out: bool = False
    log_samples: bool = False
    system_instruction: Optional[str] = None
    apply_chat_template: bool = False
    fewshot_as_multiturn: bool = False
    show_config: bool = False
    include_path: Optional[str] = None
    predict_only: bool = False
    seed: Union[int, str] = "0,1234,1234,1234"
    trust_remote_code: bool = False
    verbosity: str = "INFO"
    max_tokens: Optional[int] = None
    temperature: Optional[float] = None
    top_k: Optional[int] = None
    top_p: Optional[float] = None

    def __post_init__(self):
        """Specially handle the seed."""
        # Parse `seed` into a list of per-library seed values
        # (python random, numpy, torch, fewshot sampling).
        self.seed = _int_or_none_list_arg_type(
            min_len=3,
            max_len=4,
            defaults="0,1234,1234,1234",
            value=str(self.seed),
        )

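# A minimal usage sketch (an illustration, not part of the shipped module):
# constructing `EleutherCLIArgs` mirrors passing flags to `lm-eval`. The task
# name, few-shot count, and output path below are arbitrary examples.
#
#     args = EleutherCLIArgs(
#         tasks="winogrande",        # comma-separated string or list of task names
#         num_fewshot=5,             # number of in-context examples
#         output_path="eeh_results", # relative paths resolve under the trainer's summary dir
#         seed="0,1234,1234,1234",   # python random, numpy, torch, fewshot seeds
#     )
#     # __post_init__ parses `seed` into a 4-element list, e.g.
#     # args.seed == [0, 1234, 1234, 1234], which `evaluate()` forwards to
#     # `evaluator.simple_evaluate` as the individual *_random_seed arguments.
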
@named_class_logger("EvalHarnessRunner")
class EvalHarnessRunner(ClassLogger):
    """Util class for invoking EEH's run script with CSX-specific components."""

    def __init__(self, eeh_args: EleutherCLIArgs):
        """
        Args:
            eeh_args: Eval Harness CLI args.
        """
        super().__init__()

        self.args = deepcopy(eeh_args)
        self.task_manager: TaskManager = None
        self.task_names: Union[str, List[Union[str, Dict, Task]]] = []
        self.init_tasks()

    def init_tasks(self):  # pylint: disable=line-too-long
        """Captures the task initialization logic from `Eleuther's run script <lm_eval_main>`_.

        .. _lm_eval_main: https://github.com/EleutherAI/lm-evaluation-harness/blob/4600d6bf73ba2cf7037ae7feada03315839ef185/lm_eval/__main__.py#L271-L307

        Includes CSX-specific validation for the user-specified eval harness tasks.
        """
        if self.args.include_path is not None:
            self.logger.info(
                f"Including path: {self.args.include_path} for externally created tasks."
            )
        task_manager = TaskManager(
            self.args.verbosity, include_path=self.args.include_path
        )

        if self.args.limit:
            self.logger.warning(
                " --limit should only be used for testing. "
                "Real metrics should not be computed using limit."
            )

        if self.args.tasks is None:
            raise ValueError("Need to specify task to evaluate.")
        elif self.args.tasks == "list":
            self.logger.info(
                "Available Tasks:\n - {}".format(
                    "\n - ".join(task_manager.all_tasks)
                )
            )
            sys.exit()
        else:
            if os.path.isdir(self.args.tasks):
                task_names = []
                yaml_path = os.path.join(self.args.tasks, "*.yaml")
                for yaml_file in glob.glob(yaml_path):
                    self.logger.info(f"Loading task from file: {yaml_file}")
                    config = utils.load_yaml_config(yaml_file)
                    task_names.append(config)
            else:
                task_list = self.args.tasks.split(",")
                task_names = task_manager.match_tasks(task_list)
                for task in [
                    task for task in task_list if task not in task_names
                ]:
                    if os.path.isfile(task):
                        config = utils.load_yaml_config(task)
                        task_names.append(config)
                task_missing = [
                    task
                    for task in task_list
                    if task not in task_names and "*" not in task
                ]  # we don't want errors if a wildcard ("*") task name was used

                if task_missing:
                    missing = ", ".join(task_missing)
                    raise ValueError(
                        f"Tasks not found: {missing}.\n"
                        f"{utils.SPACING}Try `lm-eval --tasks list` for list "
                        "of available tasks, or '--verbosity DEBUG' to "
                        "troubleshoot task registration issues."
                    )

        # Validate tasks and cache task related properties.
        self.task_names = EvalHarnessRunner.validate_and_sort_tasks(
            task_names, task_manager
        )
        self.task_manager = task_manager

    @cached_property
    def task_dict(self) -> Dict[str, Any]:
        """Returns the task dictionary for the specified tasks."""
        return get_task_dict(self.task_names, self.task_manager)

    @staticmethod
    def validate_and_sort_tasks(
        task_names: Union[str, List[Union[str, Dict, Task]]],
        task_manager: Optional[TaskManager] = None,
    ) -> List[Union[str, Dict, Task]]:
        """Validates user specification of eval harness tasks on CSX.

        In particular, for a single run we do not support:

        1) Tasks with `loglikelihood_rolling` output types
        2) Combining non-generative and generative tasks
        3) Running multiple generative tasks

        Args:
            task_names: List of task names or config dicts
            task_manager: TaskManager object that stores indexed tasks

        Returns:
            The task names, sorted with generative tasks first.
        """
        task_dict = get_task_dict(task_names, task_manager)

        gen_tasks, non_gen_tasks = [], []
        for task_name in task_dict.keys():
            task_obj = task_dict[task_name]
            if isinstance(task_obj, tuple):
                _, task_obj = task_obj
                if task_obj is None:
                    continue

            if task_obj.get_config("output_type") == "loglikelihood_rolling":
                raise RuntimeError(
                    "Tasks with `loglikelihood_rolling` output types are not yet supported. "
                    f"Please unspecify task {task_name} from the specified tasks list."
                )
            elif task_obj.get_config("output_type") == "generate_until":
                gen_tasks.append(task_name)
            else:
                non_gen_tasks.append(task_name)

        # Put non-generative task names after generative ones so EH will execute
        # them first. This is needed to minimize the number of appliance restarts,
        # so that train -> non_generative will use the same appliance.
        return gen_tasks + non_gen_tasks

    def evaluate(self, trainer, model: LM) -> dict:  # pylint: disable=line-too-long
        """Invokes the evaluation logic from `EEH's run script <lm_eval_main>`_ on the given model.

        .. _lm_eval_main: https://github.com/EleutherAI/lm-evaluation-harness/blob/4600d6bf73ba2cf7037ae7feada03315839ef185/lm_eval/__main__.py#L240

        Args:
            trainer: The Trainer object to log to.
            model: The language model object (subclass of EEH's LM abstract base class)
        """
        if self.args.predict_only:
            self.args.log_samples = True
        if (
            self.args.log_samples or self.args.predict_only
        ) and not self.args.output_path:
            self.args.output_path = (
                trainer.summary_dir / trainer.name_scope_path
            )

        if self.args.output_path:
            path = Path(self.args.output_path)
            if not path.is_absolute():
                path = trainer.summary_dir / trainer.name_scope_path / path

            if path.is_dir():
                path.mkdir(parents=True, exist_ok=True)
            else:
                path.parent.mkdir(parents=True, exist_ok=True)

            # check if file or 'dir/results.json' exists
            if path.is_file():
                raise FileExistsError(f"File already exists at {path}")

            output_path_file = path.joinpath(DEFAULT_RESULTS_FILE)
            output_path_file = (
                output_path_file.parent
                / f"{output_path_file.stem}_{trainer.global_step}{output_path_file.suffix}"
            )

            if output_path_file.is_file():
                self.logger.warning(
                    f"File {output_path_file} already exists. Results will be overwritten."
                )
            # if path is a json file, then get the parent dir
            elif path.suffix in (".json", ".jsonl"):
                output_path_file = path
                path.parent.mkdir(parents=True, exist_ok=True)
                path = path.parent
            else:
                output_path_file = output_path_file.resolve()
                path = path.resolve()
                path.mkdir(parents=True, exist_ok=True)

        self.logger.info(
            f"Starting Eleuther evaluation harness on selected tasks: {self.task_names}"
        )

        request_caching_args = evaluator.request_caching_arg_to_dict(
            cache_requests=self.args.cache_requests
        )

        # Set generative inference settings
        gen_kwargs = {
            "temperature": self.args.temperature,
            "top_k": self.args.top_k,
            "top_p": self.args.top_p,
            "max_tokens": self.args.max_tokens,
        }
        model.gen_kwargs = gen_kwargs

        results = evaluator.simple_evaluate(
            model=model,
            tasks=self.task_names,
            num_fewshot=self.args.num_fewshot,
            use_cache=self.args.use_cache,
            limit=self.args.limit,
            check_integrity=self.args.check_integrity,
            write_out=self.args.write_out,
            log_samples=self.args.log_samples,
            system_instruction=self.args.system_instruction,
            apply_chat_template=self.args.apply_chat_template,
            fewshot_as_multiturn=self.args.fewshot_as_multiturn,
            task_manager=self.task_manager,
            verbosity=self.args.verbosity,
            predict_only=self.args.predict_only,
            random_seed=self.args.seed[0],
            numpy_random_seed=self.args.seed[1],
            torch_random_seed=self.args.seed[2],
            fewshot_random_seed=self.args.seed[3],
            **request_caching_args,
        )

        if results is not None:
            if self.args.log_samples:
                samples = results.pop("samples")
            dumped = json.dumps(
                results,
                indent=2,
                default=handle_non_serializable,
                ensure_ascii=False,
            )
            if self.args.show_config:
                self.logger.info(dumped)

            batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
            batch_size = None
            model_args = None

            try:
                self.log_eval_results(trainer, results)
                if self.args.log_samples:
                    self.log_eval_samples(trainer, samples, results)
            except Exception as e:  # pylint: disable=broad-except
                self.logger.error(
                    f"Logging eval results/samples failed due to: {e}"
                )

            if self.args.output_path is not None:
                self.logger.info(
                    f"Saving Eleuther Eval Harness results to {output_path_file}"
                )
                with output_path_file.open("w", encoding="utf-8") as f:
                    f.write(dumped)

                if self.args.log_samples:
                    for task_name, _ in results["configs"].items():
                        filename = path.joinpath(
                            f"{task_name}_{trainer.global_step}.json"
                        )
                        samples_dumped = json.dumps(
                            samples[task_name],
                            indent=2,
                            default=handle_non_serializable,
                            ensure_ascii=False,
                        )
                        filename.write_text(samples_dumped, encoding="utf-8")

            self.logger.info(
                f"{model} ({model_args}), gen_kwargs: ({gen_kwargs}), "
                f"limit: {self.args.limit}, num_fewshot: {self.args.num_fewshot}, "
                f"batch_size: {batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
            )
            self.logger.info("\n" + make_table(results))
            if "groups" in results:
                self.logger.info("\n" + make_table(results, "groups"))

    def log_eval_results(self, trainer, results: Dict[str, Any]) -> None:
        """Logs the evaluation results to the trainer."""
        results = deepcopy(results)

        # TODO: Do we need to update the wandb config?
        # configs = {
        #     "task_configs": results.get("configs", {}),
        #     "cli_configs": results.get("config", {}),
        # }
        # wandb.run.config.update(configs)

        pattern = re.compile(r",none$")

        # Log the evaluation metrics
        trainer.log_metrics(
            **{
                # Remove None from the metric string name
                pattern.sub("", f"{task_name}/{metric_name}"): metric_value
                for task_name, task_value in results.get("results", {}).items()
                for metric_name, metric_value in task_value.items()
            }
        )

        self.log_eval_results_as_table(trainer, results)

        # Log the results dict as json
        self.log_as_json(trainer, "eval_results", results)

    def log_eval_results_as_table(  # pylint: disable=line-too-long
        self, trainer, results: Dict[str, Any]
    ) -> None:
        """Logs the eval results as a table to the trainer's loggers.

        Note, this method is adapted to construct a pandas DataFrame off the
        `original WandB specific implementation <log_table>`_ in EEH.

        .. _log_table: https://github.com/EleutherAI/lm-evaluation-harness/blob/3fa4fd725c8a428710109f1d6c14eda37e95baea/lm_eval/loggers/wandb_logger.py#L112-L160
        """
        try:
            import pandas as pd
        except ImportError:
            warn("Pandas not installed. Skipping logging of results as table.")
            return

        group_names = list(results.get("groups", {}))

        def make_dataframe(column1: str, key: str = "results"):
            data = []
            for k, dic in results.get(key).items():
                if k in group_names and key != "groups":
                    continue

                version = results.get("versions").get(k)
                if version == "N/A":
                    version = None

                num_fewshot = results.get("n-shot").get(k)

                for metric_filter, value in dic.items():
                    # pylint: disable=redefined-builtin
                    metric, _, filter = metric_filter.partition(",")
                    if metric.endswith("_stderr") or metric == "alias":
                        continue

                    if f"{metric}_stderr,{filter}" in dic:
                        stderr = dic[f"{metric}_stderr,{filter}"]
                        if stderr != "N/A":
                            stderr = f"{stderr:.4f}"
                    else:
                        stderr = ""

                    data.append(
                        {
                            column1: k,
                            "Version": version,
                            "Filter": filter,
                            "num_fewshot": num_fewshot,
                            "Metric": metric,
                            "Value": str(value),
                            "Stderr": str(stderr),
                        }
                    )

            return pd.DataFrame(data=data)

        if "results" in results:
            trainer.log_metrics(
                **{
                    "evaluation/eval_results": make_dataframe(
                        "Tasks", "results"
                    )
                }
            )

        if "groups" in results:
            trainer.log_metrics(
                **{
                    "evaluation/group_eval_results": make_dataframe(
                        "Groups", "groups"
                    )
                }
            )

    def log_as_json(self, trainer, key, results: Dict[str, Any]):
        """Serializes the results dict as json and logs it to the trainer."""

        def _handle_non_serializable(o: Any) -> Union[int, str, list]:
            if isinstance(o, (np.int32, np.int64)):
                return int(o)
            elif isinstance(o, set):
                return list(o)
            else:
                return str(o)

        trainer.log_metrics(
            **{
                key: json.dumps(
                    results,
                    indent=4,
                    default=_handle_non_serializable,
                    ensure_ascii=False,
                )
            }
        )

    def log_eval_samples(
        self, trainer, samples: Dict[str, Any], results: Dict[str, Any]
    ) -> None:
        """Logs the evaluation samples to the trainer."""
        try:
            import pandas as pd
        except ImportError:
            warn("Pandas not installed. Skipping logging of eval samples")
            return

        samples = deepcopy(samples)

        def generate_dataset(*args, **kwargs) -> pd.DataFrame:
            from lm_eval.loggers import WandbLogger

            # It's okay to pass in `None` as self as this method
            # has no self uses
            # pylint: disable=protected-access
            return WandbLogger._generate_dataset(None, *args, **kwargs)

        group_names = list(results.get("groups", {}))
        task_names = [
            x for x in results.get("results", {}) if x not in group_names
        ]

        ungrouped_tasks = []
        tasks_by_groups = defaultdict(list)
        task_configs = results.get("configs", {})

        for task_name in task_names:
            group_names = task_configs[task_name].get("group", None)
            if group_names:
                if isinstance(group_names, str):
                    group_names = [group_names]

                for group_name in group_names:
                    tasks_by_groups[group_name].append(task_name)
            else:
                ungrouped_tasks.append(task_name)

        for task_name in ungrouped_tasks:
            eval_preds = samples[task_name]

            trainer.log_metrics(
                **{
                    # log the samples as a table
                    f"{task_name}_eval_results": generate_dataset(
                        eval_preds,
                        task_configs.get(task_name),
                    ),
                }
            )

            # Log the samples dict as json
            self.log_as_json(trainer, f"{task_name}_eval_samples", eval_preds)

        for group, grouped_tasks in tasks_by_groups.items():
            grouped_df = pd.DataFrame()
            for task_name in grouped_tasks:
                eval_preds = samples[task_name]
                df = generate_dataset(eval_preds, task_configs.get(task_name))
                df["group"] = group
                df["task"] = task_name
                grouped_df = pd.concat([grouped_df, df], ignore_index=True)

                # Log the samples dict as json
                self.log_as_json(
                    trainer, f"{task_name}_eval_samples", eval_preds
                )

            trainer.log_metrics(**{f"{group}_eval_results": grouped_df})
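
# A minimal usage sketch (an illustration, not part of the shipped module),
# assuming a Trainer instance `trainer` and a CSX-backed language model `lm`
# (a subclass of EEH's `LM`) have been constructed elsewhere:
#
#     runner = EvalHarnessRunner(EleutherCLIArgs(tasks="winogrande,piqa"))
#     # __init__ resolves the task names via TaskManager and sorts them so
#     # generative and non-generative tasks are grouped, minimizing appliance
#     # restarts; evaluate() then wraps `evaluator.simple_evaluate` and logs
#     # the results (and optionally per-sample outputs) to the trainer's loggers.
#     runner.evaluate(trainer, lm)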