Source code for cerebras.modelzoo.data.vision.classification.data.smallnorb

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import Any, Literal, Optional

import numpy as np
from PIL import Image
from pydantic import Field
from torchvision.datasets.utils import verify_str_arg
from torchvision.datasets.vision import VisionDataset

from cerebras.modelzoo.data.vision.classification.dataset_factory import (
    VisionClassificationProcessor,
    VisionClassificationProcessorConfig,
)


class SmallNORB(VisionDataset):
    """
    This database is intended for experiments in 3D object recognition from
    shape. It contains images of 50 toys belonging to 5 generic categories:
    four-legged animals, human figures, airplanes, trucks, and cars. The
    objects were imaged by two cameras under 6 lighting conditions, 9
    elevations (30 to 70 degrees every 5 degrees), and 18 azimuths (0 to 340
    every 20 degrees).

    The training set is composed of 5 instances of each category (instances
    4, 6, 7, 8, and 9), and the test set of the remaining 5 instances
    (instances 0, 1, 2, 3, and 5).
    """

    _file_dict = {
        "train": {
            "dat": "smallnorb-5x46789x9x18x6x2x96x96-training-dat.mat",
            "cat": "smallnorb-5x46789x9x18x6x2x96x96-training-cat.mat",
            "info": "smallnorb-5x46789x9x18x6x2x96x96-training-info.mat",
        },
        "test": {
            "dat": "smallnorb-5x01235x9x18x6x2x96x96-testing-dat.mat",
            "cat": "smallnorb-5x01235x9x18x6x2x96x96-testing-cat.mat",
            "info": "smallnorb-5x01235x9x18x6x2x96x96-testing-info.mat",
        },
    }

    def __init__(
        self,
        root,
        split="train",
        task=None,
        transform=None,
        target_transform=None,
    ):
        super().__init__(
            os.path.join(root, "smallnorb"),
            transform=transform,
            target_transform=target_transform,
        )
        self.split = verify_str_arg(split, "split", ("train", "test"))

        if not os.path.exists(self.root):
            raise RuntimeError(
                "Dataset not found. Download from "
                "https://cs.nyu.edu/~ylclab/data/norb-v1.0-small/"
            )

        path_dat = os.path.join(self.root, self._file_dict[split]["dat"])
        path_cat = os.path.join(self.root, self._file_dict[split]["cat"])
        path_info = os.path.join(self.root, self._file_dict[split]["info"])
        dat_arr, cat_arr, info_arr = load_chunk(path_dat, path_cat, path_info)

        self.images = []
        self.targets = []
        for image, category, info_vec in zip(dat_arr, cat_arr, info_arr):
            self.images.append(
                {
                    "image": image[0],
                    "image2": image[1],
                }
            )
            record = {
                "label_category": category,
                "instance": info_vec[0],
                "label_elevation": info_vec[1],
                "label_azimuth": info_vec[2],
                "label_lighting": info_vec[3],
            }
            if task is None:
                self.targets.append(record)
            else:
                self.targets.append(record[task])

    def __getitem__(self, index):
        img = np.tile(self.images[index]["image"], (1, 1, 3))
        img = Image.fromarray(img.astype('uint8'), 'RGB')
        target = self.targets[index]

        if self.transform is not None:
            img = self.transform(img)
        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

    def __len__(self):
        return len(self.images)
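
A minimal usage sketch (the root path is hypothetical; it assumes the six
smallnorb-*.mat files have been downloaded from
https://cs.nyu.edu/~ylclab/data/norb-v1.0-small/ into ./data/smallnorb).
The task argument selects one of the record keys above as the target:

from torchvision import transforms

dataset = SmallNORB(
    root="./data",  # hypothetical path; files must sit in ./data/smallnorb
    split="train",
    task="label_category",  # return only the category label as the target
    transform=transforms.ToTensor(),
)
img, target = dataset[0]  # img: 3x96x96 tensor; target: category in [0, 5)
print(len(dataset), img.shape, target)
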
def load_chunk(dat_path, cat_path, info_path):
    dat_array = read_binary_matrix(dat_path)
    # Even if the image is grayscale, we still need an extra channel
    # dimension to be compatible with PIL.Image.
    dat_array = np.expand_dims(dat_array, -1)

    cat_array = read_binary_matrix(cat_path)

    info_array = read_binary_matrix(info_path)
    info_array = np.copy(info_array)  # Make read-only buffer array writable.
    # Azimuth values are 0, 2, 4, ..., 34. We divide by 2 to get proper labels.
    info_array[:, 2] = info_array[:, 2] / 2

    return dat_array, cat_array, info_array
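
For orientation, a hedged sketch of the shapes load_chunk returns for the
training split (inferred from the 5x46789x9x18x6x2x96x96 naming scheme;
verify against a local copy of the files). The paths are hypothetical:

dat, cat, info = load_chunk(
    "smallnorb-5x46789x9x18x6x2x96x96-training-dat.mat",
    "smallnorb-5x46789x9x18x6x2x96x96-training-cat.mat",
    "smallnorb-5x46789x9x18x6x2x96x96-training-info.mat",
)
# dat:  (24300, 2, 96, 96, 1) uint8 -- one stereo image pair per sample
# cat:  (24300,)              int32 -- category label in [0, 5)
# info: (24300, 4)            int32 -- instance, elevation, azimuth, lighting
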
def read_binary_matrix(filename):
    """
    Reads and returns binary formatted matrix stored in filename.

    The file format is described on the dataset page:
    https://cs.nyu.edu/~ylclab/data/norb-v1.0-small/

        struct header {
            int magic;   // 4 bytes
            int ndim;    // 4 bytes, little endian
            int dim[3];
        };
    """
    with open(filename, "rb") as f:
        s = f.read()

        # Data is stored in little-endian byte order.
        int32_dtype = np.dtype("int32").newbyteorder("<")

        # The first 4 bytes contain a magic code that specifies the data type.
        magic = int(np.frombuffer(s, dtype=int32_dtype, count=1))
        if magic == 507333717:
            data_dtype = np.dtype("uint8")  # uint8 does not have a byte order.
        elif magic == 507333716:
            data_dtype = np.dtype("int32").newbyteorder("<")
        else:
            raise ValueError("Invalid magic value for data type!")

        # The second 4 bytes contain an int32 with the number of dimensions
        # of the stored array.
        ndim = int(np.frombuffer(s, dtype=int32_dtype, count=1, offset=4))

        # The next ndim x 4 bytes contain the shape of the array in int32.
        dims = np.frombuffer(s, dtype=int32_dtype, count=ndim, offset=8)

        # If the array has less than three dimensions, three int32 are still
        # used to save the shape info (the remaining int32 are simply set to
        # 1). The shape info hence uses max(3, ndim) * 4 bytes.
        bytes_used_for_shape_info = max(3, ndim) * 4

        # The remaining bytes are the array.
        data = np.frombuffer(
            s, dtype=data_dtype, offset=8 + bytes_used_for_shape_info
        )
    return data.reshape(tuple(dims))
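
As a sanity check of the header layout described in the docstring, this
sketch writes a synthetic uint8 array in the same format to a temporary
file and reads it back with read_binary_matrix (nothing here is part of
the module itself):

import os
import tempfile

arr = np.arange(24, dtype=np.uint8).reshape(2, 3, 4)
# Header: magic code for uint8, ndim, then the little-endian int32 dims.
# (For ndim < 3 the format would pad the dims with 1s up to three entries.)
header = np.array([507333717, arr.ndim, *arr.shape], dtype="<i4")
fd, path = tempfile.mkstemp(suffix=".mat")
with os.fdopen(fd, "wb") as f:
    f.write(header.tobytes())
    f.write(arr.tobytes())
restored = read_binary_matrix(path)
assert np.array_equal(restored, arr)
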
class SmallNORBProcessorConfig(VisionClassificationProcessorConfig):
    data_processor: Literal["SmallNORBProcessor"]

    use_worker_cache: bool = ...

    split: Literal["train", "test"] = "train"
    "Dataset split."

    num_classes: Optional[Any] = Field(None, deprecated=True)

class SmallNORBProcessor(VisionClassificationProcessor):
    def __init__(self, config: SmallNORBProcessorConfig):
        super().__init__(config)
        self.split = config.split
        self.shuffle = self.shuffle and (self.split == "train")
        self.num_classes = 5

    def create_dataset(self):
        use_training_transforms = self.split == "train"
        transform, target_transform = self.process_transform(
            use_training_transforms
        )
        dataset = SmallNORB(
            root=self.data_dir,
            split=self.split,
            transform=transform,
            target_transform=target_transform,
        )
        return dataset
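
A hedged wiring sketch. data_dir is assumed to be a field inherited from
VisionClassificationProcessorConfig (the processor reads self.data_dir
above), and other inherited fields may be required in practice:

config = SmallNORBProcessorConfig(
    data_processor="SmallNORBProcessor",
    data_dir="./data",  # assumed inherited field; hypothetical path
    use_worker_cache=False,
    split="test",
)
processor = SmallNORBProcessor(config)
dataset = processor.create_dataset()  # the SmallNORB dataset defined above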