# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import glob
import io
import json
import logging
import os
import random
import warnings
import pandas as pd
from PIL import Image
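
# Example invocations (script name and paths below are placeholders):
#   python preprocess_dataset.py ai2d \
#       --question_dir /path/to/ai2d/questions \
#       --output_jsonl_dir /path/to/ai2d_llava_jsonl
#   python preprocess_dataset.py convert_json2jsonl \
#       --input_dir /path/to/json_folder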

def parse_arguments():
    parser = argparse.ArgumentParser(description="Pre-process LLaVa datasets")
    subparsers = parser.add_subparsers(dest="dataset")
    # subparser for ai2d dataset
    preprocess_ai2d = subparsers.add_parser(
        "ai2d", help="Pre-process AI2D dataset"
    )
    preprocess_ai2d.add_argument(
        "--question_dir",
        type=str,
        required=True,
        help="Path to the AI2D question directory, which contains json files describing the question and answer corresponding to an image.",
    )
    preprocess_ai2d.add_argument(
        "--output_jsonl_dir",
        type=str,
        required=True,
        help="Folder to write the AI2D output jsonl files, which is in LLaVa format describing the image and associated question and answer.",
    )
    # subparser for arxivcap dataset
    preprocess_arxivcap = subparsers.add_parser(
        "arxivcap", help="Pre-process ArxivCAP dataset"
    )
    preprocess_arxivcap.add_argument(
        "--input_dir",
        type=str,
        required=True,
        help="Input directory of ArxivCAP dataset parquet files.",
    )
    preprocess_arxivcap.add_argument(
        "--output_jsonl_dir",
        type=str,
        required=True,
        help="Output directory of ArxivCAP processed jsonl files with LLaVa jsonl format.",
    )
    preprocess_arxivcap.add_argument(
        "--output_parquet_dir",
        type=str,
        required=True,
        help="Output directory of ArxivCAP processed parquet files.",
    )
    preprocess_arxivcap.add_argument(
        "--parquet_range",
        type=int,
        required=True,
        nargs="+",
        help="Range of ArxivCAP parquet files to be selected.",
    )
    preprocess_arxivcap.add_argument(
        "--output_image_dir",
        type=str,
        required=True,
        help="Directory of ArxivCAP image files.",
    )
    preprocess_arxivcap.add_argument(
        "--image_prefix",
        type=str,
        required=True,
        help="Relative path prefix for ArxivCAP image files.",
    )
    # subparser for arxivqa
    preprocess_arxivqa = subparsers.add_parser(
        "arxivqa", help="Pre-process ArxivQA dataset"
    )
    preprocess_arxivqa.add_argument(
        "--input_file",
        type=str,
        required=True,
        help="Path to the ArxivQA question file, which contains the question and answer corresponding to an image.",
    )
    preprocess_arxivqa.add_argument(
        "--output_jsonl_dir",
        type=str,
        required=True,
        help="Folder to write the ArxivQA output jsonl files, which is in LLaVa format describing the image and associated question and answer.",
    )
    # subparser for chartqa
    preprocess_chartqa = subparsers.add_parser(
        "chartqa", help="Pre-process ChartQA dataset"
    )
    preprocess_chartqa.add_argument(
        "--dataset_folder",
        type=str,
        required=True,
        help="Path to the ChartQA dataset folder with the data split folders.",
    )
    # subparser for sp_docvqa
    preprocess_sp_docvqa = subparsers.add_parser(
        "sp_docvqa", help="Pre-process SP-DocVQA dataset"
    )
    preprocess_sp_docvqa.add_argument(
        "--dataset_folder",
        type=str,
        required=True,
        help="Path to the SP-DocVQA dataset folder with the data files.",
    )
    # subparser for infographics_docvqa
    preprocess_infographics_docvqa = subparsers.add_parser(
        "infographics_docvqa", help="Pre-process Infographics-DocVQA dataset"
    )
    preprocess_infographics_docvqa.add_argument(
        "--dataset_folder",
        type=str,
        required=True,
        help="Path to the Inforgraphics-DocVQA dataset folder with the data files.",
    )
    # subparser for dvqa
    preprocess_dvqa = subparsers.add_parser(
        "dvqa", help="Pre-process DVQA dataset"
    )
    preprocess_dvqa.add_argument(
        "--dataset_folder",
        type=str,
        required=True,
        help="Path to the DVQA dataset folder with the data files.",
    )
    # subparser for synthdog_en
    preprocess_synthdog_en = subparsers.add_parser(
        "synthdog_en", help="Pre-process Synthdog_EN dataset"
    )
    preprocess_synthdog_en.add_argument(
        "--input_dir",
        type=str,
        required=True,
        help="Input directory of Synthdog-EN dataset parquet files.",
    )
    preprocess_synthdog_en.add_argument(
        "--output_jsonl_dir",
        type=str,
        required=True,
        help="Output directory of Synthdog-EN processed json files with LLaVa jsonl format.",
    )
    preprocess_synthdog_en.add_argument(
        "--output_parquet_dir",
        type=str,
        required=True,
        help="Output directory of Synthdog-EN processed parquet files.",
    )
    preprocess_synthdog_en.add_argument(
        "--parquet_range",
        type=int,
        required=True,
        nargs="+",
        help="Range of Synthdog-EN parquet files to be selected.",
    )
    preprocess_synthdog_en.add_argument(
        "--output_image_dir",
        type=str,
        required=True,
        help="Directory of Synthdog-EN image files.",
    )
    preprocess_synthdog_en.add_argument(
        "--image_prefix",
        type=str,
        required=True,
        help="Relative path prefix for Synthdog-EN image files.",
    )
    # subparser for simply converting from json to jsonl
    preprocess_json_to_jsonl = subparsers.add_parser(
        "convert_json2jsonl", help="Pre-process json files to jsonl files"
    )
    preprocess_json_to_jsonl.add_argument(
        "--input_dir",
        type=str,
        required=True,
        help="Path to a folder of json files that need to be converted to jsonl format.",
    )
    # return the parsed arguments
    args = parser.parse_args()
    return args 

def convert_json_to_jsonl(input_folder):
    output_folder = f"{input_folder}_to_jsonl"
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    json_files = list(glob.glob(os.path.join(input_folder, "*.json")))
    for jfile in json_files:
        out_file = os.path.join(
            output_folder, os.path.basename(jfile).replace("json", "jsonl")
        )
        logging.info(f"Processing {jfile} -> {out_file}")
        with open(jfile, "r") as fh:
            data = json.load(fh)
        # Convert and save to JSONL
        with open(out_file, "w") as jsonl_file:
            for entry in data:
                jsonl_file.write(json.dumps(entry) + "\n")
    logging.info(f"--- jsonl files saved at {output_folder} ---") 

def process_ai2d(args):
    question_dir = args.question_dir
    output_jsonl_dir = args.output_jsonl_dir
    if not os.path.exists(output_jsonl_dir):
        os.makedirs(output_jsonl_dir, exist_ok=False)
    input_file_list = os.listdir(question_dir)
    label_list = ["a", "b", "c", "d"]
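    # Options are rendered as lettered choices, e.g. ["red", "blue"] ->
    # ["a) red", "b) blue"]; the answer is emitted as the full lettered
    # option string rather than the bare letter.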
    def process_options(options):
        ret_options = []
        for idx in range(len(options)):
            ret_options.append(f"{label_list[idx]}) {options[idx]}")
        return ret_options
    def get_user_string(question, options):
        option_str = " ".join(options)
        return "<image>\n" + question + " " + option_str
    new_data = []
    for input_file in input_file_list:
        filename = os.path.join(question_dir, input_file)
        with open(filename, "r") as json_file:
            data = json.load(json_file)
        image_name = f"ai2d/images/{data['imageName']}"
        for quest in data["questions"].keys():
            options = process_options(data["questions"][quest]["answerTexts"])
            new_d = {
                "id": data["questions"][quest]["questionId"],
                "image": image_name,
                "conversations": [
                    {
                        "from": "human",
                        "value": get_user_string(quest, options),
                    },
                    {
                        "from": "gpt",
                        "value": options[
                            data["questions"][quest]["correctAnswer"]
                        ],
                    },
                ],
            }
            new_data.append(new_d)
    out_file = os.path.join(output_jsonl_dir, "ai2d_llava.jsonl")
    with open(out_file, "w") as jsonl_file:
        for entry in new_data:
            jsonl_file.write(json.dumps(entry) + "\n")
    logging.info(f"--- jsonl files saved at {output_jsonl_dir} ---") 

def process_arxivcap(args):
    # Only handling single-figure captioning for now
    question_dict = {
        "Single-Figure Captioning": "Create a caption for the provided figure.",
        "Multiple-Figure Captioning": "Create a caption for the provided figures.",
        "Title Generation": "According to the figures and captions, generate a title for this paper. Title:",
        "Contextualized Captioning": None,  # depends on the figure type
    }
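    # Each emitted sample follows the LLaVA conversation schema, e.g.:
    #   {"id": "...", "image": "<prefix>/<figure>.jpg",
    #    "conversations": [{"from": "human", "value": "<image>\n<question>"},
    #                      {"from": "gpt", "value": "<caption>"}]}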
    def preprocess_parquet_to_llava(
        in_filename,
        out_jsonl_filename,
        out_parquet_fname,
        image_foldername,
        img_relpath_prefix,
    ):
        logging.info(f"preprocessing: {in_filename}")
        data = pd.read_parquet(in_filename)
        # Assign each row a zero-padded 7-digit id from its index
        # (e.g. 42 -> "0000042").
        data["id"] = data.index
        data["id"] = data["id"].apply(lambda x: "{:07d}".format(x))
        def convert_to_llava(caption_images):
            img_with_subcaption, img_single = 0, 0
            llava_samples = []
            for caption_img in caption_images:
                image_path = caption_img["cil_pairs"][0]["image_file"]
                if len(caption_img["cil_pairs"]) == 1:
                    image_filename = os.path.join(
                        img_relpath_prefix, image_path
                    )
                    caption = caption_img["caption"]
                    out = {
                        "id": image_path.split("/")[1][: -len(".jpg")],
                        "image": image_filename,
                        "conversations": [
                            {"from": "human", "value": None},
                            {"from": "gpt", "value": None},
                        ],
                    }
                    conversations = out["conversations"]
                    question = question_dict["Single-Figure Captioning"]
                    conversations[0]["value"] = f"<image>\n{question}"
                    conversations[1]["value"] = caption
                    llava_samples.append(out)
                    img_single += 1
                else:
                    # Multi-figure samples are only counted, not emitted;
                    # this script handles single-figure captioning only.
                    img_with_subcaption += len(caption_img["cil_pairs"])
                if not os.path.exists(
                    os.path.join(image_foldername, image_path.split("/")[0])
                ):
                    os.makedirs(
                        os.path.join(image_foldername, image_path.split("/")[0])
                    )
                for img in caption_img["cil_pairs"]:
                    image_name = os.path.join(
                        image_foldername, img["image_file"]
                    )
                    image = Image.open(io.BytesIO(img["image"]["bytes"]))
                    image.save(image_name)
            return llava_samples
        data["llava"] = data.apply(
            lambda x: convert_to_llava(x.caption_images), axis=1
        )
        logging.info(f"Writing preprocessed parquet")
        data.to_parquet(out_parquet_fname, compression=None)
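        # Note: each jsonl line holds the full list of LLaVA samples
        # generated from one source row (a JSON array per line), not a
        # single sample per line.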
        with open(out_jsonl_filename, "w") as jsonl_file:
            for entry in data["llava"].tolist():
                jsonl_file.write(json.dumps(entry) + "\n")
    input_dir = args.input_dir
    parquet_range = args.parquet_range
    all_files = glob.glob(os.path.join(input_dir, "*.parquet"))
    all_files = sorted(
        all_files, key=lambda x: int(os.path.basename(x).split("_")[2])
    )
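    # Filenames are assumed to carry a numeric field in the third
    # underscore-separated position (e.g. *_*_<NNNN>_*.parquet); it is
    # used for sorting here and for range selection below.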
    def file_filter(x, rng):
        # Keep files whose numeric field lies in the half-open
        # interval [rng[0], rng[1]).
        bname = os.path.basename(x)
        f_month = int(bname.split("_")[2])
        return rng[0] <= f_month < rng[1]
    select_files = list(
        filter(lambda x: file_filter(x, parquet_range), all_files)
    )
    logging.info("selected_files:", len(select_files))
    if not os.path.exists(args.output_jsonl_dir):
        os.makedirs(args.output_jsonl_dir)
    if not os.path.exists(args.output_parquet_dir):
        os.makedirs(args.output_parquet_dir)
    for file in select_files:
        logging.info(f"---------- Parsing file: {file} ----------")
        output_jsonl_fname = os.path.basename(file).replace(
            ".parquet", ".jsonl"
        )
        out_jsonl_filename = os.path.join(
            args.output_jsonl_dir, output_jsonl_fname
        )
        out_parquet_fname = os.path.join(
            args.output_parquet_dir,
            output_jsonl_fname.replace("jsonl", "parquet"),
        )
        logging.info(f"in_filename: {file}")
        logging.info(f"out_jsonl_filename: {out_jsonl_filename}")
        logging.info(f"out_parquet_filename: {out_parquet_fname}")
        logging.info(f"image_foldername: {args.output_image_dir}")
        logging.info(f"img_relpath_prefix: {args.image_prefix}")
        preprocess_parquet_to_llava(
            in_filename=file,
            out_jsonl_filename=out_jsonl_filename,
            out_parquet_fname=out_parquet_fname,
            image_foldername=args.output_image_dir,
            img_relpath_prefix=args.image_prefix,
        )
    logging.info(f"--- jsonl files saved at {args.output_jsonl_dir} ---") 

def process_arxivqa(args):
    input_file = args.input_file
    output_jsonl_dir = args.output_jsonl_dir
    if not os.path.exists(output_jsonl_dir):
        os.makedirs(output_jsonl_dir, exist_ok=False)
    # Load the input JSONL file
    with open(input_file, "r") as jsonl_file:
        json_list = list(jsonl_file)
    def get_user_string(question, options):
        option_str = " ".join(options)
        return "<image>\n" + question + " " + option_str
    label_dict = {
        "A": 0,
        "B": 1,
        "C": 2,
        "D": 3,
        "E": 4,
        "F": 5,
        "G": 6,
        "H": 7,
        "I": 8,
    }
    def get_gpt_string(options, label, rationale):
        # No label: drop the sample (covers the empty-response case).
        if label == "":
            return None
        # Free-form label (e.g. "[xxxxxx]") rather than a lettered option:
        # keep it verbatim.
        if label[0] not in label_dict:
            return label + " " + rationale
        # Lettered label: some labels point past the provided options;
        # drop those samples.
        if label_dict[label[0]] >= len(options):
            return None
        # Replace the letter with the full option text.
        return options[label_dict[label[0]]] + " " + rationale
    """
    # Note: some options don"t have A/B/C/D and some options has format A) or A.
    # Labels may or may not contain full option string and are not consistent.
    # As a cleanup step, we will replace the label with the full text of the option,
    # regardless of the format for the options.
    """
    new_data = []
    for idx, d in enumerate(json_list):
        d = json.loads(d)
        new_d = {
            "id": d["id"],
            "image": f"ArxivQA/{d['image']}",
            "conversations": [
                {
                    "from": "human",
                    "value": get_user_string(d["question"], d["options"]),
                },
                {
                    "from": "gpt",
                    "value": get_gpt_string(
                        d["options"], d["label"], d["rationale"]
                    ),
                },
            ],
        }
        if new_d["conversations"][1]["value"] is not None:
            new_data.append(new_d)
    out_file = os.path.join(output_jsonl_dir, "arxivqa_llava.jsonl")
    with open(out_file, "w") as jsonl_file:
        for entry in new_data:
            jsonl_file.write(json.dumps(entry) + "\n")
    logging.info(f"--- jsonl files saved at {output_jsonl_dir} ---") 

def process_chartqa(args):
    def generate(split, subset):
        input_file = f"{args.dataset_folder}/{split}/{split}_{subset}.json"
        output_file = f"{args.dataset_folder}/{split}/{split}_{subset}_llava_jsonl/{split}_{subset}_llava.jsonl"
        output_jsonl_dir = os.path.dirname(output_file)
        if not os.path.exists(output_jsonl_dir):
            os.makedirs(output_jsonl_dir, exist_ok=False)
        # Load the input JSON file
        with open(input_file, "r") as json_file:
            data = json.load(json_file)
        new_data = []
        for idx, d in enumerate(data):
            new_d = {
                "id": idx,
                "image": f"ChartQA_Dataset/{split}/png/{d['imgname']}",
                "conversations": [
                    {
                        "from": "human",
                        "value": f"<image>\n{d['query']}",
                    },
                    {
                        "from": "gpt",
                        "value": d["label"],
                    },
                ],
            }
            new_data.append(new_d)
        with open(output_file, "w") as jsonl_file:
            for entry in new_data:
                jsonl_file.write(json.dumps(entry) + "\n")
        logging.info(f"--- jsonl files saved at {output_jsonl_dir} ---")
    for split in ["train", "val", "test"]:
        for subset in ["human", "augmented"]:
            generate(split, subset) 

def process_sp_docvqa(args):
    def generate(split):
        input_file = f"{args.dataset_folder}/{split}.json"
        output_file = (
            f"{args.dataset_folder}/{split}_llava_jsonl/{split}_llava.jsonl"
        )
        output_jsonl_dir = os.path.dirname(output_file)
        if not os.path.exists(output_jsonl_dir):
            os.makedirs(output_jsonl_dir, exist_ok=False)
        new_data = []
        with open(input_file, "r") as json_file:
            data = json.load(json_file)["data"]
        for quest in data:
            image_name = quest["image"].split("/")[-1]
            new_d = {
                "id": quest["questionId"],
                "image": f"DocVQA/sp_docvqa/images/{image_name}",
                "conversations": [
                    {
                        "from": "human",
                        "value": f"<image>\n{quest['question']}",
                    },
                    {
                        "from": "gpt",
                        # only use the first answer
                        "value": quest["answers"][0],
                    },
                ],
            }
            new_data.append(new_d)
        with open(output_file, "w") as jsonl_file:
            for entry in new_data:
                jsonl_file.write(json.dumps(entry) + "\n")
        logging.info(f"--- jsonl files saved at {output_jsonl_dir} ---")
    for split in ["train_v1.0_withQT", "val_v1.0_withQT"]:
        generate(split) 

def process_infographics_docvqa(args):
    def generate(split):
        input_file = f"{args.dataset_folder}/infographicsVQA_{split}.json"
        output_file = f"{args.dataset_folder}/infographicsVQA_{split}_llava_jsonl/infographicsVQA_{split}_llava.jsonl"
        output_jsonl_dir = os.path.dirname(output_file)
        if not os.path.exists(output_jsonl_dir):
            os.makedirs(output_jsonl_dir, exist_ok=False)
        new_data = []
        with open(input_file, "r") as json_file:
            data = json.load(json_file)["data"]
        for quest in data:
            new_d = {
                "id": quest["questionId"],
                "image": f"DocVQA/Infographicsvqa/images/{quest['image_local_name']}",
                "conversations": [
                    {
                        "from": "human",
                        "value": f"<image>\n{quest['question']}",
                    },
                    {
                        "from": "gpt",
                        # only use the first answer
                        "value": quest["answers"][0],
                    },
                ],
            }
            new_data.append(new_d)
        with open(output_file, "w") as jsonl_file:
            for entry in new_data:
                jsonl_file.write(json.dumps(entry) + "\n")
        logging.info(f"--- jsonl files saved at {output_jsonl_dir} ---")
    for split in ["train_v1.0", "val_v1.0_withQT"]:
        generate(split) 

def process_dvqa(args):
    # Select the DVQA split to process: one of "train_qa", "val_easy_qa",
    # or "val_hard_qa".
    subset = "train_qa"
    input_file = f"{args.dataset_folder}/{subset}.json"
    output_file = (
        f"{args.dataset_folder}/{subset}_llava_jsonl/{subset}_llava.jsonl"
    )
    output_jsonl_dir = os.path.dirname(output_file)
    if not os.path.exists(output_jsonl_dir):
        os.makedirs(output_jsonl_dir, exist_ok=False)
    # Load the input JSON file
    with open(input_file, "r") as json_file:
        data = json.load(json_file)
    new_data = []
    for idx, d in enumerate(data):
        new_d = {
            "id": d["question_id"],
            "image": f"DVQA/images/{d['image']}",
            "conversations": [
                {
                    "from": "human",
                    "value": f"<image>\n{d['question']}",
                },
                {
                    "from": "gpt",
                    "value": d["answer"],
                },
            ],
        }
        new_data.append(new_d)
    with open(output_file, "w") as jsonl_file:
        for entry in new_data:
            jsonl_file.write(json.dumps(entry) + "\n")
    logging.info(f"--- jsonl files saved at {output_jsonl_dir} ---") 

def process_synthdog_en(args):
    question_list = [
        "Describe the image concisely.",
        "Provide a brief description of the given image.",
        "Offer a succinct explanation of the picture presented.",
        "Summarize the visual content of the image.",
        "Give a short and clear explanation of the subsequent image.",
        "Share a concise interpretation of the image provided.",
        "Present a compact description of the photo's key features.",
        "Relay a brief, clear account of the picture shown.",
        "Render a clear and concise summary of the photo.",
        "Write a terse but informative summary of the picture.",
        "Create a compact narrative representing the image presented.",
    ]
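    # A prompt is sampled uniformly at random from this list for each
    # image (see convert_to_llava below).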
    def preprocess_parquet_to_llava(
        in_filename,
        out_jsonl_filename,
        out_parquet_fname,
        image_foldername,
        img_relpath_prefix,
    ):
        logging.info(f"preprocessing: {in_filename}")
        data = pd.read_parquet(in_filename)
        # Assign each row a zero-padded 7-digit id from its index
        # (e.g. 42 -> "0000042").
        data["id"] = data.index
        data["id"] = data["id"].apply(lambda x: "{:07d}".format(x))
        def convert_to_llava(id, ground_truth):
            out = {
                "id": id,
                "image": os.path.join(img_relpath_prefix, f"{id}.png"),
                "conversations": [
                    {"from": "human", "value": None},
                    {"from": "gpt", "value": None},
                ],
            }
            # ground_truth is stored as a string-encoded dict; parse it
            # with ast.literal_eval (safer than eval for untrusted data).
            ground_truth = ast.literal_eval(ground_truth)
            conversations = out["conversations"]
            question_idx = random.randint(0, len(question_list) - 1)
            question = question_list[question_idx]
            conversations[0]["value"] = f"<image>\n{question}"
            conversations[1]["value"] = ground_truth["gt_parse"][
                "text_sequence"
            ]
            return out
        def save_image(id, image):
            image = Image.open(io.BytesIO(image["bytes"]))
            p = os.path.join(image_foldername, f"{id}.png")
            image.save(p)
        data["llava"] = data.apply(
            lambda x: convert_to_llava(x.id, x.ground_truth), axis=1
        )
        logging.info(f"Writing preprocessed parquet")
        data.to_parquet(out_parquet_fname, compression=None)
        with open(out_jsonl_filename, "w") as jsonl_file:
            for entry in data["llava"].tolist():
                jsonl_file.write(json.dumps(entry) + "\n")
        logging.info(f"Saving images now")
        data.apply(lambda x: save_image(x.id, x.image), axis=1)
        logging.info(f"DONE: saving images")
    input_dir = args.input_dir
    parquet_range = args.parquet_range
    all_files = glob.glob(os.path.join(input_dir, "*.parquet"))
    all_files = sorted(
        all_files, key=lambda x: int(os.path.basename(x).split("-")[1])
    )
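    # Shard filenames are assumed to follow the Hugging Face pattern
    # "<split>-<shard>-of-<total>-....parquet"; the shard index drives
    # sorting, range selection, and the image sub-directory layout below.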
    def file_filter(x, rng):
        # Keep files whose shard index lies in the half-open
        # interval [rng[0], rng[1]).
        bname = os.path.basename(x)
        fnum = int(bname.split("-")[1])
        return rng[0] <= fnum < rng[1]
    select_files = list(
        filter(lambda x: file_filter(x, parquet_range), all_files)
    )
    logging.info(f"selected_files: {select_files}")
    if not os.path.exists(args.output_jsonl_dir):
        os.makedirs(args.output_jsonl_dir)
    if not os.path.exists(args.output_parquet_dir):
        os.makedirs(args.output_parquet_dir)
    for file in select_files:
        logging.info(f"---------- Parsing file: {file} ----------")
        output_jsonl_fname = os.path.basename(file).replace(".parquet", ".json")
        out_jsonl_filename = os.path.join(
            args.output_jsonl_dir, output_jsonl_fname
        )
        out_parquet_fname = os.path.join(
            args.output_parquet_dir,
            output_jsonl_fname.replace("json", "parquet"),
        )
        splits = os.path.basename(file).split("-")
        image_subdir = os.path.join(args.output_image_dir, splits[0], splits[1])
        if not os.path.exists(image_subdir):
            os.makedirs(image_subdir)
        assert args.image_prefix in image_subdir
        image_prefix = os.path.join(args.image_prefix, splits[0], splits[1])
        logging.info(f"in_filename: {file}")
        logging.info(f"out_jsonl_filename: {out_jsonl_filename}")
        logging.info(f"out_parquet_filename: {out_parquet_fname}")
        logging.info(f"image_foldername: {image_subdir}")
        logging.info(f"img_relpath_prefix: {image_prefix}")
        preprocess_parquet_to_llava(
            in_filename=file,
            out_jsonl_filename=out_jsonl_filename,
            out_parquet_fname=out_parquet_fname,
            image_foldername=image_subdir,
            img_relpath_prefix=image_prefix,
        )
    logging.info(f"--- jsonl files saved at {args.output_jsonl_dir} ---") 
if __name__ == "__main__":
    args = parse_arguments()
    if args.dataset == "ai2d":
        process_ai2d(args)
    elif args.dataset == "arxivcap":
        process_arxivcap(args)
    elif args.dataset == "arxivqa":
        process_arxivqa(args)
    elif args.dataset == "chartqa":
        process_chartqa(args)
    elif args.dataset == "sp_docvqa":
        process_sp_docvqa(args)
    elif args.dataset == "infographics_docvqa":
        process_infographics_docvqa(args)
    elif args.dataset == "dvqa":
        process_dvqa(args)
    elif args.dataset == "synthdog_en":
        process_synthdog_en(args)
    elif args.dataset == "convert_json2jsonl":
        convert_json_to_jsonl(args.input_dir)
    else:
        raise ValueError(
            "Dataset currently not supported. Feel free to adapt codebase to include your dataset."
        )