# Source code for datadings.sets.ImageNet21k_write

"""Create ImageNet21k winter release data set files.

This tool will look for the following files in the input directory:

- winter21_whole.tar.gz

See also:
    http://image-net.org/download
    https://github.com/Alibaba-MIIL/ImageNet21K

Note:
    Registration is required to download this dataset.
    Please visit the website to download it.
    If you experience issues downloading, you may consider using BitTorrent:
    https://academictorrents.com/details/8ec0d8df0fbb507594557bce993920442f4f6477

Important:
    For performance reasons, samples are read in the same order as they are
    stored in the source tar files. It is recommended to use the
    datadings-shuffle command to create a shuffled copy.
"""
import tarfile
import itertools as it
from pathlib import Path
from multiprocessing.dummy import Pool as ThreadPool

from ..tools import document_keys
from ..tools import yield_process
from ..writer import FileWriter
from . import ImageNet21kData
from .ILSVRC2012_write import verify_image
from .ImageNet21k_synsets import SYNSETS
from .ImageNet21k_synsets import SYNSET_TREE_LIST
from .ImageNet21k_synsets import NUM_TRAIN_SAMPLES
from .ImageNet21k_synsets import NUM_VAL_SAMPLES
from .ImageNet21k_synsets import VAL_SAMPLES_PER_SYNSET


# Extend the module docstring with the output of document_keys for
# ImageNet21kData (presumably a description of the sample keys - confirm
# against datadings.tools). main() passes the extended __doc__ to make_parser.
__doc__ += document_keys(ImageNet21kData)


# Files this tool works with, keyed by role. main() hands this table to
# prepare_indir. Each entry holds:
#   url:  download location, or None when there is no direct download
#         (the data archive requires registration, see the module docstring)
#   path: file name of the file
#   md5:  MD5 hex digest for the file (presumably checked by prepare_indir)
FILES = {
    'tree': {
        'url': (
            "https://miil-public-eu.oss-eu-central-1.aliyuncs.com/model-zoo/"
            "ImageNet_21K_P/resources/winter21/imagenet21k_miil_tree.pth"
        ),
        'path': 'imagenet21k_miil_tree.pth',
        'md5': 'ed3a7de5b90ace4999a99fca2a129d74',
    },
    'data': {
        'url': None,
        'path': 'winter21_whole.tar.gz',
        'md5': 'ab313ce03179fd803a401b02c651c0a2',
    },
}


def yield_samples(infile):
    """Yield every sample of the ImageNet21k archive, tagged with its split.

    The archive is a gzip-compressed tar file whose members are themselves
    tar files, one per synset. Members that are not regular files, or whose
    stem is not a key of ``SYNSETS``, are ignored. Inside each synset
    archive, entries that are not regular files (e.g. directories) are
    skipped.

    Args:
        infile: Path of the ``.tar.gz`` archive to read.

    Yields:
        Tuples ``(split, name, data, label, label_tree)`` where ``split`` is
        ``'val'`` for the first ``VAL_SAMPLES_PER_SYNSET`` images of a synset
        (in sorted name order) and ``'train'`` for the remaining ones,
        ``name`` is the member name inside the synset archive, ``data`` the
        raw file contents, ``label`` is ``SYNSETS[synset]`` and
        ``label_tree`` is ``SYNSET_TREE_LIST[label]``.
    """
    # open the dataset file in streaming mode (r|gz)
    with tarfile.open(infile, mode='r|gz') as tar:
        for synset in tar:
            synset_name = Path(synset.name).stem
            if not synset.isfile() or synset_name not in SYNSETS:
                continue
            label = SYNSETS[synset_name]
            label_tree = SYNSET_TREE_LIST[label]
            # Read the whole synset into memory first, so the nested archive
            # can be closed before any sample is handed to the consumer.
            with tar.extractfile(synset) as synset_file:
                # use streaming mode (r|), since the parent file is not seekable
                with tarfile.open(fileobj=synset_file, mode='r|') as synset_tar:
                    # sort images by name, as would be done by ls/glob
                    # this ensures the first 50 images used for the validation
                    # set are the same as in the Alibaba preprocessing script:
                    # https://github.com/Alibaba-MIIL/ImageNet21K/blob/653ad536fde814e4cc7d0e19a48c8389e4ac2107/dataset_preprocessing/processing_script.sh#L51
                    images = sorted(
                        (info.name, synset_tar.extractfile(info).read())
                        for info in synset_tar
                        # extractfile returns None for non-regular members
                        if info.isfile()
                    )
            remaining = iter(images)
            # the islice consumes the leading images from the shared iterator,
            # the second loop then continues with whatever is left
            for name, data in it.islice(remaining, VAL_SAMPLES_PER_SYNSET):
                yield 'val', name, data, label, label_tree
            for name, data in remaining:
                yield 'train', name, data, label, label_tree


def write_sets(files, outdir, args):
    """Verify all samples and write them to the train and val data set files.

    Samples produced by ``yield_samples`` are passed through ``verify_image``
    on a thread pool and written to ``train.msgpack`` or ``val.msgpack`` in
    ``outdir``, depending on their split. Results are consumed in completion
    order (``imap_unordered``), so the output order may differ from the
    input order when more than one thread is used. Samples whose image is
    ``None`` after verification are reported on stdout and skipped.

    Args:
        files: Mapping as returned by ``prepare_indir``; only
            ``files['data']['path']`` is read here.
        outdir: Output directory; must support the ``/`` operator
            (e.g. ``pathlib.Path``).
        args: Parsed command line arguments providing ``compress``,
            ``subsampling``, ``threads`` and ``no_confirm``.

    Raises:
        ValueError: If a sample carries a split other than 'train' or 'val'.
    """
    gen = yield_process(yield_samples(files['data']['path']))

    def __verify_inner(item):
        split, key, data, label, label_tree = item
        data = verify_image(data, args.compress, colorsubsampling=args.subsampling)
        return split, ImageNet21kData(key, data, label, label_tree)

    trainfile = outdir / 'train.msgpack'
    valfile = outdir / 'val.msgpack'

    train_writer = FileWriter(
        trainfile,
        total=NUM_TRAIN_SAMPLES,
        overwrite=args.no_confirm,
    )
    val_writer = FileWriter(
        valfile,
        total=NUM_VAL_SAMPLES,
        overwrite=args.no_confirm,
    )
    # The pool is created only after both writers exist and is managed by the
    # with statement, so its worker threads are always shut down, including
    # when writing fails or the user interrupts the run.
    with train_writer, val_writer, ThreadPool(args.threads) as pool:
        for split, sample in pool.imap_unordered(__verify_inner, gen):
            if sample['image'] is None:
                print(f"{split} sample {sample['key']} failed verification")
                continue
            if split == 'train':
                train_writer.write(sample)
            elif split == 'val':
                val_writer.write(sample)
            else:
                raise ValueError(f'unknown split {split!r}')


def main():
    """Command line entry point of the ImageNet21k writer.

    Builds the argument parser from the module docstring (without the
    shuffle option), adds the threads, compress and subsampling arguments,
    then hands the result of ``prepare_indir`` to ``write_sets``. Output goes
    to ``args.outdir`` if it is set, otherwise to ``args.indir``.
    """
    from ..tools.argparse import make_parser
    from ..tools.argparse import argument_threads
    from ..tools import prepare_indir

    from .ILSVRC2012_write import argument_compress
    from .ILSVRC2012_write import argument_subsampling

    cli = make_parser(__doc__, shuffle=False)
    argument_threads(cli, default=1)
    for add_option in (argument_compress, argument_subsampling):
        add_option(cli)
    options = cli.parse_args()

    # fall back to the input directory when no output directory is given
    target = Path(options.outdir or options.indir)
    located = prepare_indir(FILES, options)
    write_sets(located, target, options)


# Script entry point: run the writer when the module is executed directly.
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C is deliberately swallowed: the tool stops without a traceback.
        pass
    finally:
        # Always print an empty line on exit, whether main() finished, failed
        # or was interrupted (presumably to terminate a progress line - confirm).
        print()