Source code for cerebras.modelzoo.data.common.input_utils

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import random
from typing import Iterator, Sized

import numpy as np
import torch

import cerebras.pytorch as cstorch
import cerebras.pytorch.distributed as dist
from cerebras.pytorch.distributed.cluster_resolver import ClusterSpec, TaskSpec


def get_data_for_task(
    task_id,
    meta_data_values_cum_sum,
    num_examples_per_task,
    meta_data_values,
    meta_data_filenames,
):
    """
    Select the file slices that hold the examples assigned to one task.

    All files are treated as one long run of examples laid out file after
    file. Task `task_id` owns the `num_examples_per_task` examples that start
    at global offset `task_id * num_examples_per_task`, so every task gets
    exactly the same number of examples.

    Args:
        task_id (int): Integer id for a task.
        meta_data_values_cum_sum (np.ndarray): Cumulative sum of the file
            sizes (in lines) from the meta data file. It is compared
            elementwise against an integer, so it must be array-like in the
            NumPy sense. The indexing below relies on it having one more
            entry than `meta_data_values` (presumably a leading 0 -- confirm
            against the caller).
        num_examples_per_task (int): Number of examples each task must
            receive. Equal to `batch_size` * `num_batch_per_task`.
        meta_data_values (list[int]): Size in lines of each file listed in
            the meta data file.
        meta_data_filenames (list[str]): File names listed in the meta data
            file, in the same order as `meta_data_values`.

    Returns:
        list of 3-tuples `(file_path, num_examples, start_index)` describing,
        in order, which files this task reads, how many examples it takes
        from each, and the index inside the file where it starts.

    Raises:
        AssertionError: If the selected slices do not add up to exactly
            `num_examples_per_task` examples.
    """
    # Global [begin, end) range of example offsets owned by this task.
    task_begin = task_id * num_examples_per_task
    task_end = (task_id + 1) * num_examples_per_task

    # First cumulative-sum entry strictly past `task_begin`. Entry `k` of the
    # cumulative sum corresponds to file `k - 1`, hence the `- 1` below.
    first_cum_idx = np.min(np.where(meta_data_values_cum_sum > task_begin)[0])
    offset_in_first = task_begin - meta_data_values_cum_sum[first_cum_idx - 1]

    # The first file may hold more than the task needs, so cap the count.
    taken_from_first = min(
        meta_data_values[first_cum_idx - 1] - offset_in_first,
        num_examples_per_task,
    )
    files_in_task = [
        (
            meta_data_filenames[first_cum_idx - 1],
            taken_from_first,
            offset_in_first,
        )
    ]

    if taken_from_first != num_examples_per_task:
        # The first file did not cover the whole task: walk forward through
        # the following files until the quota is met.
        past_task_end = np.where(meta_data_values_cum_sum > task_end)[0]
        if past_task_end.size != 0:
            last_cum_idx = np.min(past_task_end)
        else:
            # No entry lies beyond the task's end: consume every remaining
            # file.
            last_cum_idx = len(meta_data_values_cum_sum)

        # Files strictly between the first and the last one are used whole.
        files_in_task.extend(
            (meta_data_filenames[i - 1], meta_data_values[i - 1], 0)
            for i in range(first_cum_idx + 1, last_cum_idx)
        )

        # Whatever is still missing comes from the head of the last file.
        remaining = task_end - meta_data_values_cum_sum[last_cum_idx - 1]
        if remaining > 0:
            files_in_task.append(
                (meta_data_filenames[last_cum_idx - 1], remaining, 0)
            )

    assert (
        sum(count for _, count, _ in files_in_task) == num_examples_per_task
    ), f"Incorrect number of examples in the split with task_id {task_id}"

    return files_in_task


def is_distributed():
    """
    Returns True if DDP is enabled, i.e. `torch.distributed` is both
    available in this build and has been initialized.
    """
    torch_dist = torch.distributed
    if not torch_dist.is_available():
        return False
    return torch_dist.is_initialized()


def task_id():
    """
    Returns the rank of the current process among the input tasks.

    The value is `dist.get_streaming_rank()` when `dist.is_streamer()` is
    true, `dist.get_rank()` when `is_distributed()` is true, and 0 otherwise.
    """
    if dist.is_streamer():
        return dist.get_streaming_rank()
    if is_distributed():
        return dist.get_rank()
    return 0


def num_tasks():
    """
    Returns the total number of input tasks.

    The value is `dist.num_streamers()` when `dist.is_streamer()` is true,
    `dist.get_world_size()` when `is_distributed()` is true, and 1 otherwise.
    """
    if dist.is_streamer():
        return dist.num_streamers()
    if is_distributed():
        return dist.get_world_size()
    return 1


def cluster_config():
    """
    Returns (ClusterSpec, TaskSpec). The TaskSpec contains the following fields:
        - rank: the global rank of the current worker
        - local_rank: the rank of the current worker among workers who feed
            the same system as the current worker
        - wse_id: the index of the system that the current worker is
            associated with
    The ClusterSpec contains the following fields:
        - tasks: a list of TaskSpecs for each task running on the cluster
        - rank: the rank of the current process's task in the cluster
        - num_csx: the number of CSX systems in the cluster
        - num_workers_per_csx: the number of worker tasks per CSX
    If the current job is running on GPU instead of CS system, then
    the ranks and world sizes in the returned TaskSpec will be set to the GPU
    rank and world size. When neither a CS streamer nor an initialized
    `torch.distributed` group is present, a single-task cluster with rank 0
    is returned.

    """
    # On a CS streamer the real cluster layout comes from the service
    # resolver.
    if cstorch.use_cs() and dist.is_streamer():
        resolved_cluster = dist.service_resolver().cluster_spec
        return resolved_cluster, resolved_cluster.task()

    # Otherwise synthesize a one-system cluster that contains only the
    # current task.
    if is_distributed():
        rank = dist.get_rank()
        world_size = dist.get_world_size()
    else:
        rank = 0
        world_size = 1

    task_spec = TaskSpec(
        rank=rank,
        local_rank=rank,
        wse_id=0,
        node_name="unknown",
    )
    cluster_spec = ClusterSpec([task_spec], rank, 1, world_size)
    return cluster_spec, task_spec


class ShardedSampler(torch.utils.data.Sampler):
    """
    Modified from:
    https://pytorch.org/docs/stable/_modules/torch/utils/data/distributed.html#DistributedSampler
    Sampler that restricts data loading to a subset of the dataset.

    Dataset is assumed to be of constant size.

    Args:
        dataset (torch.utils.data.Dataset): Dataset used for sampling.
        shuffle (bool, optional): If `True` (default), sampler will shuffle
            the indices.
        seed (int, optional): Random seed used to shuffle the sampler if
            :attr:`shuffle=True`, and to pick padding indices if
            :attr:`drop_last=False`. This number should be identical across
            all processes in the distributed group. Default: `None`, which
            makes `random.seed` fall back to a non-deterministic seed.
        drop_last (bool, optional): If `True`, then the sampler will drop the
            tail of the data to make it evenly divisible across the number of
            replicas. If `False`, the sampler will add extra indices to make
            the data evenly divisible across the replicas. Default: `False`.

    Raises:
        ValueError: If running on CS (`cstorch.use_cs()`) with
            `drop_last=False`.
    """

    def __init__(self, dataset, shuffle=True, seed=None, drop_last=False):
        self.num_tasks = num_tasks()
        self.task_id = task_id()
        self.dataset = dataset
        self.dataset_len = len(self.dataset)
        self.drop_last = drop_last

        if cstorch.use_cs() and not self.drop_last:
            raise ValueError(
                "On CS2 we do not support unequal batch sizes so `drop_last` "
                "must be set to `True`."
            )
        # If the dataset length is evenly divisible by # of replicas, then there
        # is no need to drop any data, since the dataset will be split equally.
        if self.drop_last and self.dataset_len % self.num_tasks:
            # Split to nearest available length that is evenly divisible.
            # This is to ensure each task receives the same amount of data when
            # using this sampler.
            self.num_samples = self.dataset_len // self.num_tasks
        else:
            self.num_samples = math.ceil(self.dataset_len / self.num_tasks)
        self.total_size = self.num_samples * self.num_tasks
        self.shuffle = shuffle
        self.seed = seed
        self.indices = list(range(self.dataset_len))
        if not self.drop_last:
            # add extra samples to make it evenly divisible across tasks
            self.indices += self._padding_indices(
                self.indices, self.total_size - self.dataset_len, self.seed
            )
        else:
            # remove tail of data to make it evenly divisible.
            self.indices = self.indices[: self.total_size]
        assert len(self.indices) == self.total_size, (
            f"Total `indices` after dropping/padding indices must be equal "
            f"to `total_size` of the dataset. Received total indices: "
            f"`{len(self.indices)}` and total size is: `{self.total_size}`."
        )

    @staticmethod
    def _padding_indices(indices, padding_size, seed):
        """Pick `padding_size` extra indices out of `indices`.

        Indices are chosen at random to reduce the chance of reusing the
        same samples. The global `random` generator is re-seeded with `seed`
        first, so processes that share a seed pick the same padding.

        When the padding fits inside the dataset, indices are drawn without
        replacement. When more padding is needed than there are samples
        (tiny dataset spread over many tasks), `random.sample` would raise
        `ValueError`, so indices are drawn with replacement instead.
        """
        random.seed(seed)
        if padding_size <= len(indices):
            return random.sample(indices, padding_size)
        return random.choices(indices, k=padding_size)

    def __iter__(self):
        if self.shuffle:
            # Re-seeding before every shuffle keeps all tasks that share a
            # seed on the same permutation. The shuffle is in place, so
            # successive iterations compound on the previous order.
            random.seed(self.seed)
            random.shuffle(self.indices)

        # subsample: task `i` takes every `num_tasks`-th index starting at `i`
        indices = self.indices[self.task_id : self.total_size : self.num_tasks]
        assert len(indices) == self.num_samples, (
            f"Total `indices` for tasks must be equal to `num_samples` in a "
            f"task. Received total indices: `{len(indices)}` and samples in "
            f"task are: `{self.num_samples}`."
        )

        yield from indices

    def __len__(self):
        # Number of indices handed to this task per iteration.
        return self.num_samples


def check_sharding_sanity(
    examples_per_file,
    batch_size,
    num_workers,
    drop_last,
):
    """Verifies that the given sharding yields at least one full batch.

    The check mirrors how `shard_and_shuffle_data` spreads files across
    dataloader workers: file `i` goes to worker `i % num_workers`. Nothing is
    checked when `drop_last` is `False`.

    :param list examples_per_file: Total examples per file for this task.
    :param int batch_size: Batch size of the model.
    :param int num_workers: Number of workers to use in the dataloader.
    :param bool drop_last: Boolean indicating whether the last incomplete batch
        of the dataloader is dropped.
    :raises ValueError: If no batches are generated with the given sharding.
    """
    # Partial batches are kept, so any amount of data produces output.
    if drop_last is False:
        return

    # No worker processes: the task reads all of its files itself.
    if num_workers == 0:
        task_total = sum(examples_per_file)
        if task_total < batch_size:
            raise ValueError(
                f"Task {task_id()} only generates {task_total}, which "
                f"is fewer than a full batch of size {batch_size}. "
            )
        return

    # Round-robin the files over the workers and tally each worker's share.
    worker_totals = [0] * num_workers
    for position, file_examples in enumerate(examples_per_file):
        worker_totals[position % num_workers] += file_examples

    # One worker with a full batch is enough for the task to emit something.
    busiest_worker_total = max(worker_totals)
    if busiest_worker_total < batch_size:
        raise ValueError(
            f"Maximum number of samples generated in dataloader workers of "
            f"task {task_id()} is {busiest_worker_total}. Since {busiest_worker_total} is less "
            f"than batch size {batch_size} and `drop_last` is True, this task "
            f"will end up not producing any samples. Please specify a fewer "
            f"number of workers or tasks."
        )


def shard_list_contiguous(input_list, worker_id, num_workers):
    """
    Splits a list into `num_workers` contiguous segments and returns the
    segment that belongs to `worker_id`. Every segment holds
    `len(input_list) // num_workers` elements, except the last one, which
    also takes whatever remainder is left over.

    Args:
        input_list (list): list to shard into contiguous segments
        worker_id (int): index of shard to return
        num_workers (int): number of shards to create

    Returns:
        A sublist of contiguous elements (`worker_id`'s shard)
    """

    assert num_workers <= len(input_list), (
        f"Number of processes should be less than number of files, "
        f"Got `num_workers` equal to {num_workers} and `num_files` equal to {len(input_list)}."
    )

    shard_size = len(input_list) // num_workers
    shard_begin = worker_id * shard_size

    # The last worker's slice is open-ended so that it absorbs the remainder.
    if worker_id >= num_workers - 1:
        return input_list[shard_begin:]
    return input_list[shard_begin : shard_begin + shard_size]


def shard_list_interleaved(input_list, worker_id, num_workers):
    """
    Shards a list round-robin: element `i` is assigned to worker
    `i % num_workers`. If the length of the list is not divisible by the
    number of workers, the first few workers each receive one extra element.
    With `num_workers == 0` no sharding is done and `input_list` itself is
    returned.

    Args:
        input_list (list): list to shard in an interleaved fashion
        worker_id (int): index of shard to return
        num_workers (int): number of shards to create

    Returns:
        `worker_id`'s shard (a subset of `input_list`).
    """

    # Zero workers means the caller consumes everything itself.
    if num_workers == 0:
        return input_list

    assert num_workers <= len(input_list), (
        f"Number of processes should be less than number of files, "
        f"Got `num_workers` equal to {num_workers} and `num_files` equal to {len(input_list)}."
    )

    # Keep only the elements whose position maps to this worker.
    return [
        item
        for position, item in enumerate(input_list)
        if position % num_workers == worker_id
    ]


def shard_list_of_chunks_contiguous(
    input_list_of_chunks, worker_id, num_workers
):
    """
    Shards every chunk in a list by cutting it into `num_workers` contiguous
    pieces and keeping the piece that belongs to `worker_id`. When a chunk's
    length is not divisible by the number of workers, the first
    `chunk_length % num_workers` workers each receive one extra element.

    Args:
        input_list_of_chunks (list of tuples): list of chunks to shard. List
            should be of format `[... (chunk_i, length_of_chunk_i), ...]`
        worker_id (int): index of shard to return
        num_workers (int): number of shards to create

    Returns:
        `worker_id`'s shard: a list of the same length as
        `input_list_of_chunks` of the format:
        `[... (chunk_i, shard_start_index_i, shard_length_i), ...]`
    """
    shard = []
    for chunk, chunk_length in input_list_of_chunks:
        # Every worker gets the base share; the first `leftover` workers get
        # one more so that the shares add up to the chunk length.
        base_share, leftover = divmod(chunk_length, num_workers)
        share_per_worker = [base_share] * num_workers
        for slot in range(leftover):
            share_per_worker[slot] += 1

        assert sum(share_per_worker) == chunk_length

        # This worker's piece starts right after the pieces of all
        # lower-numbered workers.
        if worker_id > 0:
            shard_start = sum(share_per_worker[:worker_id])
        else:
            shard_start = 0

        shard.append((chunk, shard_start, share_per_worker[worker_id]))
    return shard


class SubsetSequentialSampler(torch.utils.data.Sampler[int]):
    r"""Samples elements sequentially, starting from given `start_index`,
        always in the same order.
    Args:
        data_source (Dataset): dataset to sample from
        start_index (int): index where sampling starts from
    """
    data_source: Sized
    start_index: int

    def __init__(self, data_source: Sized, start_index: int) -> None:
        self.data_source = data_source
        self.start_index = start_index

    def _index_range(self) -> range:
        # Single source of truth for both iteration and length, so the two
        # can never disagree.
        return range(self.start_index, len(self.data_source))

    def __iter__(self) -> Iterator[int]:
        """Yield indices `start_index, ..., len(data_source) - 1` in order."""
        return iter(self._index_range())

    def __len__(self) -> int:
        """Return how many indices `__iter__` yields.

        This is `len(data_source) - start_index` (0 when `start_index` is at
        or past the end), not the full dataset length: the elements before
        `start_index` are never sampled.
        """
        return len(self._index_range())