# Source code for datadings.sets.YFCC100m

"""
The Yahoo Flickr Creative Commons 100 Million (YFCC100m) dataset.

Important:
    Only images are included.
    No videos or metadata.

See also:
    https://multimediacommons.wordpress.com/yfcc100m-core-dataset/

Warning:
    This code is intended to load a pre-release version of the
    YFCC100m dataset.
    Please complain if you want to use the release version available
    from amazon:
    https://multimediacommons.wordpress.com/yfcc100m-core-dataset/
"""

import os
import os.path as pt
import zipfile
import re
import io
from collections import defaultdict

import numpy as np
from simplejpeg import decode_jpeg
from PIL import Image

from ..reader import Reader
from ..tools.msgpack import unpack
from ..tools.msgpack import make_packer
from . import ImageData
from .YFCC100m_counts import FILE_COUNTS
from .YFCC100m_counts import FILES_TOTAL
from ..tools import document_keys
from ..tools.compression import open_comp


# Extend the module docstring with whatever document_keys generates for
# ImageData (presumably a description of the sample keys - see
# ..tools.document_keys).
__doc__ += document_keys(ImageData)


# Absolute path of the directory that contains this module.
ROOT = pt.abspath(pt.dirname(__file__))
# Reject list located next to this module; it is the default value of
# ``reject_file_paths`` of YFCC100mReader.
REJECTED_PATH = pt.join(ROOT, 'YFCC100m_rejected_images.msgpack.xz')


def noop(data):
    """
    Return ``data`` unchanged.

    Serves as the default validator (accept everything) and as the
    identity "packer" when samples are not returned as raw msgpack.
    """
    return data


def decode_fast(data):
    """
    Decode encoded image bytes into a grayscale numpy array.

    JPEG decoding with ``simplejpeg`` is tried first; if it raises
    ``ValueError`` the data is decoded with Pillow instead.

    Parameters:
        data: Encoded image as bytes.

    Returns:
        Grayscale image as a numpy array.

    Raises:
        Whatever Pillow raises if the fallback cannot decode the data
        either.
    """
    try:
        # decode JPEGs at reduced scale for speedup
        # NOTE(review): min_height/min_width of 1 presumably lets
        # simplejpeg pick its smallest scale - confirm in its docs
        return decode_jpeg(
            data,
            'gray',
            fastdct=True,
            fastupsample=True,
            min_height=1,
            min_width=1
        )
    except ValueError:
        # use pillow in case anything goes wrong
        bio = io.BytesIO(data)
        # the context manager releases the image once it is converted
        with Image.open(bio) as im:
            return np.array(im.convert('L'))


def validate_image(data):
    """
    Check whether encoded image bytes hold a usable image.

    An image is rejected if

    - it has less than 2600 bytes,
    - it has exactly 9218 bytes,
    - it cannot be decoded (``ValueError``, ``IOError`` or ``OSError``
      while decoding), or
    - it has less than 20000 bytes and the 95th percentile of the
      variances computed along axis 0 of the decoded grayscale image
      is below 50.

    Parameters:
        data: Encoded image as bytes.

    Returns:
        ``data`` unchanged if the image is accepted, else ``None``.
    """
    if len(data) < 2600 or len(data) == 9218:
        return None
    try:
        im = decode_fast(data)
        # if only small amounts of data, check for meaningful content,
        # i.e., more than 5% of the variances along axis 0 must reach 50
        if len(data) < 20000 and np.percentile(im.var(0), 95) < 50:
            return None
        return data
    except (ValueError, IOError, OSError):
        return None


def _find_zip_key(rejected, zips, key):
    """
    Locate the ZIP file a sample key belongs to.

    Parameters:
        rejected: Mapping of ZIP name to the collection of rejected
                  member indexes of that ZIP.
        zips: List of ZIP names in dataset order.
        key: Sample key of the form ``"<zip name>/<image name>"``.

    Returns:
        Tuple ``(zip_index, image_name, partial_index)`` where
        ``partial_index`` is the number of non-rejected images in all
        ZIP files before ``zip_index``.

    Raises:
        IndexError: If the key is malformed or its ZIP file is unknown.
    """
    # Keys are ZIP member names, which always use forward slashes,
    # so os.sep must not be used to split them.
    parts = key.split('/')
    if len(parts) != 2:
        raise IndexError('malformed key {!r}'.format(key))
    z, f = parts
    try:
        zip_index = zips.index(z)
    except ValueError:
        raise IndexError('ZIP file {!r} not found'.format(z)) from None

    # count the images that are left in all preceding ZIP files
    partial_index = 0
    for name, count in FILE_COUNTS[:zip_index]:
        partial_index += count - len(rejected[name])

    return zip_index, f, partial_index


def _find_zip_index(rejects, index):
    """
    Locate the ZIP file that holds the sample with the given index.

    Parameters:
        rejects: Mapping of ZIP name to the collection of rejected
                 member indexes of that ZIP.
        index: Index into the sequence of non-rejected images.
               Negative values count from the end.

    Returns:
        Tuple ``(zip_index, index_in_zip, partial_index)`` where
        ``index_in_zip`` counts only the non-rejected images of that
        ZIP and ``partial_index`` is the number of non-rejected images
        in all ZIP files before ``zip_index``.

    Raises:
        IndexError: If ``index`` is out of range.
    """
    total = FILES_TOTAL
    for z, _ in FILE_COUNTS:
        total -= len(rejects[z])

    position = index + total if index < 0 else index
    if position < 0 or position >= total:
        # report the index as requested and the real number of items
        raise IndexError('index {} out of range for {} items'.format(
            index, total
        ))

    partial_index = 0
    for i, (z, count) in enumerate(FILE_COUNTS):
        count -= len(rejects[z])
        if partial_index + count > position:
            return i, position - partial_index, partial_index
        partial_index += count

    # only reachable if FILES_TOTAL and FILE_COUNTS disagree; fail
    # loudly instead of returning None
    raise IndexError('index {} out of range for {} items'.format(
        index, partial_index
    ))

def _filter_zipinfo(infos):
    """
    Return the entries of ``infos`` whose ``filename`` ends with a
    slash followed by one or more lowercase hexadecimal digits.
    Everything else is treated as a non-image member and dropped;
    the order of the remaining entries is preserved.
    """
    hex_name = re.compile(r'/[0-9a-f]+$')
    kept = []
    for entry in infos:
        if hex_name.search(entry.filename):
            kept.append(entry)
    return kept


def _find_member_image(members, rejected, start_image):
    """
    Find the position of an image within the members of a ZIP file.

    Parameters:
        members: Sequence of objects with a ``filename`` attribute of
                 the form ``"<zip name>/<image name>"``.
        rejected: Collection of rejected member positions.
        start_image: Image name to look for. If empty, position 0 is
                     returned.

    Returns:
        Position of the image in ``members``.

    Raises:
        IndexError: If the image is not found or is rejected.
    """
    if not start_image:
        # nothing to search for: start at the first member
        return 0
    for i, m in enumerate(members):
        # ZIP member names always use forward slashes, not os.sep
        parts = m.filename.split('/')
        if len(parts) > 1 and parts[1] == start_image:
            if i in rejected:
                raise IndexError(
                    '{!r} is on the rejected list'.format(m.filename)
                )
            return i
    raise IndexError('{!r} not found'.format(start_image))


def _find_member_index(rejected, start_index):
    """
    Convert an index that counts only non-rejected members into the
    index of that member among all members of a ZIP file.

    Parameters:
        rejected: Collection of rejected member indexes (any order).
        start_index: Index among the non-rejected members.

    Returns:
        Index of the same member among all members.
    """
    # The early exit below is only valid for ascending order, which a
    # set does not guarantee, so sort explicitly.
    for r in sorted(rejected):
        # every rejected member at or before the current position
        # shifts the wanted member one position further back
        if start_index >= r:
            start_index += 1
        else:
            break
    return start_index


def _find_start(
        path,
        rejected,
        start_key='',
        start_index=0
):
    """
    Determine where iteration has to start for a given key or index.

    Parameters:
        path: Directory that contains the image ZIP files.
        rejected: Mapping of ZIP name to the collection of rejected
                  member indexes of that ZIP.
        start_key: Key of the first sample; mutually exclusive with
                   ``start_index``.
        start_index: Index of the first sample among all non-rejected
                     images; mutually exclusive with ``start_key``.

    Returns:
        Tuple ``(zips, member_index, partial_index)``: the names of the
        ZIP files left to read, the member position to start from in
        the first of them, and the number of non-rejected images in
        all ZIP files before it.
        ``(all zips, 0, 0)`` if neither key nor index is given.

    Raises:
        ValueError: If both ``start_key`` and ``start_index`` are set.
        IndexError: If the key or index cannot be found.
    """
    if start_index and start_key:
        raise ValueError('cannot set both start_key and start_index')

    zips = [f for f, _ in FILE_COUNTS]
    # find out which zipfile to start from
    if start_index:
        zip_index, member_index, partial_index = _find_zip_index(
            rejected, start_index
        )
        # Index lookups only need the reject list, so the ZIP file is
        # deliberately not opened here.
        if member_index:
            member_index = _find_member_index(
                rejected[zips[zip_index]], member_index
            )
    elif start_key:
        zip_index, start_image, partial_index = _find_zip_key(
            rejected, zips, start_key
        )
        z = zips[zip_index]
        r = rejected[z]
        with zipfile.ZipFile(pt.join(path, z) + '.zip') as imagezip:
            # filter out non-image members
            members = _filter_zipinfo(imagezip.infolist())
        # an empty image name means "start of this ZIP file"
        if start_image:
            member_index = _find_member_image(members, r, start_image)
        else:
            member_index = 0
    else:
        return zips, 0, 0

    return zips[zip_index:], member_index, partial_index


def _yield_from_zips(
        path,
        zips,
        rejected,
        start_index,
        validator=noop,
):
    """
    Generate the images stored in a sequence of ZIP files.

    Parameters:
        path: Directory that contains the ZIP files.
        zips: Names of the ZIP files (without ``.zip``) in read order.
        rejected: Mapping of ZIP name to the collection of member
                  positions to skip.
        start_index: Member position to start from in the first ZIP;
                     all following ZIP files are read from the start.
        validator: Callable applied to the bytes of every image.

    Yields:
        Tuples ``(validator(image bytes), member name, ZIP name,
        member position)``.
    """
    first_member = start_index
    for zip_name in zips:
        zip_path = pt.join(path, zip_name) + '.zip'
        with zipfile.ZipFile(zip_path) as archive:
            # the very same collection object is consulted on every
            # step, so entries added while iterating are honoured
            skipped = rejected[zip_name]
            # only image members are of interest
            images = _filter_zipinfo(archive.infolist())
            numbered = enumerate(images[first_member:], first_member)
            for position, info in numbered:
                if position not in skipped:
                    name = info.filename
                    data = archive.read(name)
                    yield validator(data), name, zip_name, position
        # only the first ZIP file is entered part-way
        first_member = 0


def _parse_rejected(path, rejected):
    """
    Merge the reject list stored at ``path`` into ``rejected``.

    The file is first read in binary mode and unpacked as msgpack;
    the result must offer ``items()`` that yields ZIP names with
    iterables of member indexes.
    If that raises ``ValueError`` the file is read as UTF-8 text with
    one ``"<zip name> <member index>"`` entry per line instead.

    Parameters:
        path: Path of the reject file, opened with ``open_comp``.
        rejected: Mapping of ZIP name to set of member indexes;
                  updated in place.

    Returns:
        The updated ``rejected`` mapping.
    """
    try:
        with open_comp(path, "rb") as packed:
            loaded = unpack(packed)
        for zip_name, indexes in loaded.items():
            rejected[zip_name].update(indexes)
    except ValueError:
        # not msgpack: fall back to the plain text format
        with open_comp(path, "rt", encoding="utf-8") as lines:
            for entry in lines:
                zip_name, member = entry.strip("\n").split(' ')
                rejected[zip_name].add(int(member))
    return rejected


class DevNull(object):
    """
    Minimal file-like object that ignores everything.

    Stands in for the error file when none is configured, so callers
    can write and close without checking for ``None``.
    """

    def read(self, *_):
        """Accept any arguments and return ``None``."""
        pass

    def write(self, *_):
        """Accept any arguments and discard them."""
        pass

    def close(self):
        """Do nothing."""
        pass


class YFCC100mReader(Reader):
    """
    Special reader for the YFCC100m dataset only.
    It reads images from 10000 ZIP files of roughly 10000 images
    each.

    One pass over the whole dataset was made to filter out irrelevant
    images if one of the following conditions is met:

    - Image is damaged/incomplete.
    - Less than 2600 bytes.
    - Exactly 9218 bytes - a placeholder image from Flickr.
    - Less than 20000 bytes and less than 5% of lines in the image
      have a variance of at least 50.

    Which images are rejected is controlled by the files given as
    ``reject_file_paths``.
    Set this to None or empty list to iterate over the whole dataset.

    Parameters:
        image_packs_dir: Path to directory with image ZIP files.
        validator: Callable
                   ``validator(data: bytes) -> Union[bytes, None]``.
                   Validates images before they are returned.
                   Receives image data and returns data or ``None``.
        reject_file_paths: Paths of reject list files.
                           Their contents are merged.
        error_file: Path of a file that receives one
                    ``"<zip name> <member index>"`` line for every
                    image the validator rejects.
                    ``None`` discards this information.
        error_file_mode: Mode used to open ``error_file``.

    Warning:
        A validating reader cannot be copied and it is strongly
        discouraged to copy readers with ``error_file`` paths.

    Warning:
        Methods ``get``, ``slice``, ``find_index``, ``find_key``,
        ``seek_index``, and ``seek_key`` are considerably slower
        for this reader compared to others.
        Use iterators and large ``slice`` ranges instead.
    """
    # NOTE(review): presumably the attributes Reader.__copy__ leaves
    # out of a copy - confirm against the Reader base class
    _do_not_copy = ('_gen', '_error_file')

    def __init__(
            self,
            image_packs_dir,
            validator=noop,
            reject_file_paths=(REJECTED_PATH,),
            error_file=None,
            error_file_mode='a',
    ):
        super().__init__()
        self._path = image_packs_dir
        if not callable(validator):
            # wrap in a tuple so tuple arguments format correctly
            raise ValueError('validator must be callable, not %r'
                             % (validator,))
        self._validator = validator
        # ZIP name -> set of rejected member indexes
        self._rejected = defaultdict(set)
        for path in reject_file_paths or ():
            self._rejected = _parse_rejected(path, self._rejected)
        self._packer = make_packer()
        self._error_file_args = error_file, error_file_mode
        self._error_file = None
        self.open_error_file_()
        self._gen = None
        self.seek_index(0)

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.__del__()

    def __del__(self):
        # The attribute is missing if __init__ failed early and None
        # if opening the error file failed.
        error_file = getattr(self, '_error_file', None)
        if error_file is not None:
            error_file.close()

    def __len__(self):
        return FILES_TOTAL - sum(len(r) for r in self._rejected.values())

    def __contains__(self, key):
        try:
            self.find_index(key)
            return True
        except (IndexError, ValueError):
            # unknown as well as malformed keys are not contained
            return False

    def __copy__(self):
        if self._validator != noop:
            raise RuntimeError('cannot copy a validating reader')
        reader = super().__copy__()
        reader.open_error_file_()
        reader.seek_index(self._i)
        return reader

    def open_error_file_(self):
        """
        Open the error file configured at construction, or install a
        :py:class:`DevNull` if no error file was given.
        """
        error_file, error_file_mode = self._error_file_args
        if error_file is None:
            self._error_file = DevNull()
        else:
            self._error_file = open(error_file, error_file_mode)

    def _get_next_sample(self, gen):
        """
        Return the next accepted sample from ``gen``.

        Images that are already rejected are skipped.
        Images for which the validator returned ``None`` are added to
        the reject list, logged to the error file and skipped.
        """
        while True:
            sample, key, z, i = next(gen)
            if i in self._rejected[z]:
                continue
            if sample is None:
                self._rejected[z].add(i)
                self._error_file.write('%s %d\n' % (z, i))
            else:
                break
        return ImageData(key, sample)

    def next(self):
        """Return the next sample at the current position."""
        return self._get_next_sample(self._gen)

    __next__ = next

    def rawnext(self):
        """Return the next sample packed as msgpack."""
        return self._packer.pack(self.next())

    def _locate_key(self, key):
        """
        Return ``(zips, member_index, index)`` for ``key``: the ZIP
        files left to read, the member position in the first of them
        and the index of the sample among all non-rejected images.
        """
        zips, start_index, index = _find_start(
            self._path, self._rejected, start_key=key
        )
        # count up to start_index in zip and all samples
        # that are not in rejected to index
        r = self._rejected[zips[0]]
        index += sum(1 for i in range(start_index) if i not in r)
        return zips, start_index, index

    def find_index(self, key):
        """
        Return the index of the sample with the given key.

        Raises:
            IndexError: If the key cannot be found.
            RuntimeError: If the reader validates and the index is
                          not 0.
        """
        _, _, index = self._locate_key(key)

        if index != 0 and self._validator != noop:
            raise RuntimeError('found index may be incorrect while validating')

        # the index among all samples, not the position inside the ZIP
        return index

    def find_key(self, index):
        """
        Return the key of the sample with the given index.

        Raises:
            IndexError: If the index is out of range.
            RuntimeError: If the reader validates and the index is
                          not 0.
        """
        if index != 0 and self._validator != noop:
            raise RuntimeError('found key may be incorrect while validating')

        zips, start_index, _ = _find_start(
            self._path, self._rejected, start_index=index
        )
        gen = _yield_from_zips(
            self._path, zips, self._rejected, start_index, self._validator,
        )
        try:
            _, key, _, _ = next(gen)
        finally:
            # closes the ZIP file held open by the generator
            gen.close()
        return key

    def seek_index(self, index):
        """
        Move the reader to the sample with the given index.

        Raises:
            IndexError: If the index is out of range.
            RuntimeError: If the reader validates and the index is
                          not 0.
        """
        if index != 0 and self._validator != noop:
            raise RuntimeError('can only seek to start while validating')

        zips, start_index, _ = _find_start(
            self._path, self._rejected, start_index=index
        )

        self._gen = _yield_from_zips(
            self._path, zips, self._rejected, start_index, self._validator,
        )
        self._i = index

    def seek_key(self, key):
        """
        Move the reader to the sample with the given key.

        Raises:
            IndexError: If the key cannot be found.
            RuntimeError: If the reader validates and the sample is
                          not the first one.
        """
        zips, start_index, index = self._locate_key(key)

        if index != 0 and self._validator != noop:
            raise RuntimeError('can only seek to start while validating')

        self._gen = _yield_from_zips(
            self._path, zips, self._rejected, start_index, self._validator,
        )
        self._i = index

    def get(self, index, yield_key=False, raw=False, copy=True):
        """
        Return the sample with the given index without moving the
        reader.

        Parameters:
            index: Index of the sample.
            yield_key: If True, return ``(key, sample)``.
            raw: If True, return the sample packed as msgpack.
            copy: Accepted for interface compatibility; unused.

        Raises:
            IndexError: If the index is out of range.
            RuntimeError: If the reader validates and the index is
                          not 0.
        """
        if index != 0 and self._validator != noop:
            raise RuntimeError('can only seek to start while validating')

        zips, start_index, _ = _find_start(
            self._path, self._rejected, start_index=index
        )

        gen = _yield_from_zips(
            self._path, zips, self._rejected, start_index, self._validator,
        )
        try:
            sample = self._get_next_sample(gen)
        finally:
            # closes the ZIP file held open by the generator
            gen.close()

        key = sample['key']
        if raw:
            sample = self._packer.pack(sample)
        if yield_key:
            return key, sample
        return sample

    def _iter_impl(
            self,
            start=None,
            stop=None,
            yield_key=False,
            raw=False,
            copy=True,
            chunk_size=16,
    ):
        """
        Generate the samples with indexes ``start`` to ``stop``.

        Parameters:
            start: First index; ``None`` means the first sample.
            stop: Index after the last sample; ``None`` means the end.
            yield_key: If True, yield ``(key, sample)`` tuples.
            raw: If True, yield samples packed as msgpack.
            copy: Accepted for interface compatibility; unused.
            chunk_size: Accepted for interface compatibility; unused.

        Raises:
            RuntimeError: If the reader validates and iteration does
                          not start at the first sample.
        """
        # resolve None and negative values first, so that the default
        # start is recognised as the first sample below
        start, stop, _ = slice(start, stop).indices(len(self))

        if start != 0 and self._validator != noop:
            raise RuntimeError('can only seek to start while validating')

        zips, start_index, _ = _find_start(
            self._path, self._rejected, start_index=start
        )
        gen = _yield_from_zips(
            self._path, zips, self._rejected, start_index, self._validator,
        )

        if raw:
            pack = self._packer.pack
        else:
            pack = noop

        try:
            for i in range(start, stop):
                try:
                    sample = self._get_next_sample(gen)
                except StopIteration:
                    # Validation can reject images while iterating, so
                    # fewer samples than expected may be left. End
                    # quietly; a StopIteration leaving a generator
                    # would be turned into a RuntimeError.
                    return
                self._i = i
                if yield_key:
                    yield sample['key'], pack(sample)
                else:
                    yield pack(sample)
        finally:
            # closes the ZIP file held open by the generator
            gen.close()

    def slice(self, start, stop=None, yield_key=False, raw=False, copy=True):
        """
        Return a generator over the samples with indexes ``start`` to
        ``stop``; see ``_iter_impl`` for the parameters.
        """
        return self._iter_impl(start, stop, yield_key, raw, copy)


def main():
    """
    Command line entry point.

    Reads every image from the given image packs with a validating
    reader and an empty reject list. Rejected images are written to
    the reject file, if one is given.
    """
    import argparse

    from ..tools import make_printer
    from ..tools import print_over

    parser = argparse.ArgumentParser(
        description='Load and decode every image from given image packs. '
                    'If an image either does not decode properly or does '
                    'not contain useful content, its containing zip file '
                    'and name are written to the reject file.')
    parser.add_argument(
        'image_packs',
        type=str,
        help='path to directory of image zip files',
    )
    parser.add_argument(
        '-r', '--rejectfile',
        type=str,
        help='path to rejected images log file',
    )

    args = parser.parse_args()

    printer = make_printer(total=100000000)
    # start from an empty reject list so every image gets validated
    reader = YFCC100mReader(
        args.image_packs,
        validator=validate_image,
        reject_file_paths=(),
        error_file=args.rejectfile
    )
    for key, data in reader.iter(yield_key=True):
        if data['image'] is None:
            print('rejected', key)
        printer.update()
    print_over(printer.total_updates, 'images passed testing')


if __name__ == '__main__':
    main()