Source code for datadings.sets.Places365_write

"""Create Places365 data set files.
You can choose any combination of

- high or low resolution images and
- standard or challenge (extended) training data set

This tool will look for the following files in the input directory
depending on the chosen version and download them if necessary:

- standard (large)

    - train_large_places365standard.tar (105 GB)
    - val_large.tar (2.1 GB)
    - test_large.tar (19 GB)

- standard (small)

    - train_256_places365standard.tar (24 GB)
    - val_256.tar (501 MB)
    - test_256.tar (4.4 GB)

- challenge (large)

    - train_large_places365challenge.tar (476 GB)
    - val_large.tar (2.1 GB)
    - test_large.tar (19 GB)

- challenge (small)

    - train_256_places365challenge.tar (108 GB)
    - val_256.tar (501 MB)
    - test_256.tar (4.4 GB)

See also:
    http://places2.csail.mit.edu/index.html

Important:
    For performance reasons, shuffling is not available. It is recommended
    to use the datadings-shuffle command to create a shuffled copy.
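
Example:
    A minimal sketch that converts the small (256x256) standard version
    by calling ``write_sets`` directly. Paths are placeholders, and the
    ``Namespace`` mirrors the flags defined in ``main``::

        from argparse import Namespace

        from datadings.sets.Places365_write import write_sets

        args = Namespace(
            low_res=True,             # use the 256x256 images
            challenge=False,          # standard training set
            skip_verification=False,  # verify md5 sums of downloads
            no_confirm=False,         # ask before overwriting outputs
        )
        write_sets("/data/places365", "/data/places365-msgpack", args)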
"""
import os
import os.path as pt
import tarfile
import io

from ..tools import download_files_if_not_found
from ..tools import verify_files
from ..tools import yield_threaded
from ..writer import FileWriter
from . import ImageClassificationData
from . import ImageData
from .Places365 import CLASS_TO_ID
from ..tools import document_keys


# append a description of the sample keys to the module docstring,
# which is also used as the command line help text
__doc__ += document_keys(ImageClassificationData)


READ_SIZE = 4 * 1024 * 1024


BASE_URL = "http://data.csail.mit.edu/places/places365/"


def file_spec(path, md5, total=None):
    spec = {"url": BASE_URL + path, "path": path, "md5": md5}
    if total is not None:
        spec["total"] = total
    return spec
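
# For example, file_spec("val_large.tar", "9b71c4993ad89d2d8bcbdc4aef38042f", 36500)
# returns {"url": BASE_URL + "val_large.tar", "path": "val_large.tar",
#          "md5": "9b71c4993ad89d2d8bcbdc4aef38042f", "total": 36500};
# these spec dicts are what download_files_if_not_found and verify_files consume.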

FILES_META = {
    "meta": file_spec(
        "filelist_places365-standard.tar",
        "35a0585fee1fa656440f3ab298f8479c",
    ),
}
FILES_LARGE_VAL_TEST = {
    "validation": file_spec(
        "val_large.tar", "9b71c4993ad89d2d8bcbdc4aef38042f", 36500
    ),
    "testing": file_spec(
        "test_large.tar", "41a4b6b724b1d2cd862fb3871ed59913", 328500
    ),
    **FILES_META,
}
FILES_256_VAL_TEST = {
    "validation": file_spec(
        "val_256.tar", "e27b17d8d44f4af9a78502beb927f808", 36500
    ),
    "testing": file_spec(
        "test_256.tar", "f532f6ad7b582262a2ec8009075e186b", 328500
    ),
    **FILES_META,
}
FILES_LARGE_STANDARD = {
    "training": file_spec(
        "train_large_places365standard.tar",
        "67e186b496a84c929568076ed01a8aa1", 1803460
    ),
    **FILES_LARGE_VAL_TEST,
}
FILES_LARGE_CHALLENGE = {
    "training": file_spec(
        "train_large_places365challenge.tar",
        "605f18e68e510c82b958664ea134545f", 8000000
    ),
    **FILES_LARGE_VAL_TEST,
}
FILES_256_STANDARD = {
    "training": file_spec(
        "train_256_places365standard.tar",
        "53ca1c756c3d1e7809517cc47c5561c5", 1803460
    ),
    **FILES_256_VAL_TEST,
}
FILES_256_CHALLENGE = {
    "training": file_spec(
        "train_256_places365challenge.tar",
        "741915038a5e3471ec7332404dfb64ef", 8000000
    ),
    **FILES_256_VAL_TEST,
}

# keys are (low_res, challenge)
FILES = {
    (False, False): FILES_LARGE_STANDARD,
    (False, True): FILES_LARGE_CHALLENGE,
    (True, False): FILES_256_STANDARD,
    (True, True): FILES_256_CHALLENGE,
}

def get_files(challenge, low_res):
    return FILES[(low_res, challenge)]
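
# For example, get_files(challenge=False, low_res=True) returns
# FILES_256_STANDARD, the small-image standard version.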

def _write_set(generator, data_cls, out_path, total, overwrite):
    # consume (member, data, extra_args) triples and write them as
    # data_cls samples; skip the set if the output file already exists
    # and overwriting is not confirmed
    try:
        with FileWriter(out_path, total=total, overwrite=overwrite) as writer:
            for member, data, data_args in generator:
                writer.write(data_cls(
                    member.name, data, *data_args,
                ))
    except FileExistsError:
        pass


def _yield_from_training_folder_structure(data_tar):
    # the training tar encodes the class in its directory structure;
    # strip the leading archive directories and the file name to
    # recover the label used by CLASS_TO_ID
    for member in data_tar:
        if not member.isfile() or not member.name.endswith(".jpg"):
            continue
        label = "/".join(member.name.split(os.sep)[2:-1])
        data = data_tar.extractfile(member).read()
        yield member, data, (CLASS_TO_ID[label],)


def _yield_from_meta_tar_files_list(data_tar, meta_tar, files_list_fn):
    fd = meta_tar.extractfile(files_list_fn)
    classes = {}
    for line in io.TextIOWrapper(fd, encoding="utf-8").readlines():
        try:
            image_fn, class_id = line.split()
            # store 1-tuples so labels can be unpacked as *data_args
            classes[image_fn] = int(class_id),
        except ValueError:
            break  # unlabeled data
    for member in data_tar:
        if not member.isfile() or not member.name.endswith(".jpg"):
            continue
        data = data_tar.extractfile(member).read()
        # either the label in a 1-tuple, or an empty tuple for unlabeled images
        label = classes.get(pt.basename(member.name), ())
        yield member, data, label


def _write_training_set(indir, outdir, args):
    files = get_files(args.challenge, args.low_res)["training"]
    tar_path = pt.join(indir, files["path"])
    out_path = pt.join(outdir, "training.msgpack")
    with tarfile.open(tar_path, "r", bufsize=READ_SIZE) as tar:
        gen = yield_threaded(_yield_from_training_folder_structure(tar))
        _write_set(gen, ImageClassificationData, out_path,
                   files["total"], args.no_confirm)


def _write_validation_set(indir, outdir, meta_tar, args):
    files = get_files(args.challenge, args.low_res)["validation"]
    tar_path = pt.join(indir, files["path"])
    out_path = pt.join(outdir, "validation.msgpack")
    with tarfile.open(tar_path, "r", bufsize=READ_SIZE) as data_tar:
        gen = yield_threaded(_yield_from_meta_tar_files_list(
            data_tar, meta_tar, "places365_val.txt"
        ))
        _write_set(gen, ImageClassificationData, out_path,
                   files["total"], args.no_confirm)


def _write_testing_set(indir, outdir, meta_tar, args):
    files = get_files(args.challenge, args.low_res)["testing"]
    tar_path = pt.join(indir, files["path"])
    out_path = pt.join(outdir, "testing.msgpack")
    with tarfile.open(tar_path, "r", bufsize=READ_SIZE) as data_tar:
        gen = yield_threaded(_yield_from_meta_tar_files_list(
            data_tar, meta_tar, "places365_test.txt"
        ))
        _write_set(gen, ImageData, out_path,
                   files["total"], args.no_confirm)

def write_sets(indir, outdir, args):
    files = get_files(args.challenge, args.low_res)
    download_files_if_not_found(files, indir)
    if not args.skip_verification:
        verify_files(files, indir)
    meta_tar_path = pt.join(indir, files["meta"]["path"])
    _write_training_set(indir, outdir, args)
    with tarfile.open(meta_tar_path, "r", bufsize=READ_SIZE) as meta_tar:
        _write_validation_set(indir, outdir, meta_tar, args)
        _write_testing_set(indir, outdir, meta_tar, args)

def main():
    from ..tools.argparse import make_parser

    parser = make_parser(__doc__, shuffle=False)
    parser.add_argument(
        "-l", "--low-res",
        action="store_true",
        help="Download the resized and cropped 256x256 images. "
             "By default, images have a minimum dimension of 512 pixels "
             "and preserved aspect ratio."
    )
    parser.add_argument(
        "-c", "--challenge",
        action="store_true",
        help="Download the extended challenge training dataset. "
             "Validation and testing sets are the same as for the "
             "standard version."
    )
    args = parser.parse_args()
    outdir = args.outdir or args.indir
    write_sets(args.indir, outdir, args)

if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        pass
    finally:
        print()