Source code for datadings.reader.reader

from abc import ABCMeta
from abc import abstractmethod

from math import ceil


[docs]class Reader(metaclass=ABCMeta):
    """
    Abstract base class for dataset readers.

    Readers should be used as context managers::

        with Reader(...) as reader:
            for sample in reader:
                [do dataset things]

    Subclasses must implement the following methods:

    * __exit__
    * __len__
    * __contains__
    * find_key
    * find_index
    * get
    * slice
    """
    # attributes that are ignored by __copy__
    _do_not_copy = ()

    def __init__(self):
        self.getitem_max_slice_length = 512
        self.getitem_chunk_size = 64

    @abstractmethod
    def __len__(self):
        pass

    @abstractmethod
    def __contains__(self, key):
        pass

    def __enter__(self):
        return self

    @abstractmethod
    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def __copy__(self):
        cls = self.__class__
        reader = cls.__new__(cls)
        reader.__dict__.update(
            (k, v) for k, v in self.__dict__.items()
            if k not in cls._do_not_copy
        )
        return reader

[docs]    @abstractmethod
    def find_key(self, index):
        """
        Returns the key of the sample with the given index.
        """
        pass

[docs]    @abstractmethod
    def find_index(self, key):
        """
        Returns the index of the sample with the given key.
        """
        pass

[docs]    @abstractmethod
    def get(self, index, yield_key=False, raw=False, copy=True):
        """
        Returns sample at given index.

        ``copy=False`` allows the reader to use zero-copy mechanisms.
        Data may be returned as ``memoryview`` objects rather than ``bytes``.
        This can improve performance, but also drastically increase memory
        consumption, since one sample can keep the whole slice in memory.

        Parameters:
            index: Index of the sample
            yield_key: If True, returns (key, sample)
            raw: If True, returns sample as msgpacked message
            copy: if False, allow the reader to return data as
                  ``memoryview`` objects instead of ``bytes``

        Returns:
            Sample as index.
        """
        pass

[docs]    @abstractmethod
    def slice(self, start, stop=None, yield_key=False, raw=False, copy=True):
        """
        Returns a generator of samples selected by the given slice.

        ``copy=False`` allows the reader to use zero-copy mechanisms.
        Data may be returned as ``memoryview`` objects rather than ``bytes``.
        This can improve performance, but also drastically increase memory
        consumption, since one sample can keep the whole slice in memory.

        Parameters:
            start: start index of slice
            stop: stop index of slice
            yield_key: if True, yield (key, sample)
            raw: if True, returns sample as msgpacked message
            copy: if False, allow the reader to return data as
                  ``memoryview`` objects instead of ``bytes``

        Returns:
            Iterator of selected samples
        """
        pass

    def __getitem__(self, index):
        if isinstance(index, slice):
            start, stop, step = index.indices(len(self))
            if step != 1:
                raise ValueError('step must be 1')
            # use iter if number of samples is large
            if stop - start >= self.getitem_max_slice_length:
                return self.iter(start, stop, chunk_size=self.getitem_chunk_size)
            # otherwise use slice directly
            else:
                return self.slice(start, stop)
        else:
            return self.get(index)

    def _iter_impl(
            self,
            start,
            stop,
            yield_key=False,
            raw=False,
            copy=True,
            chunk_size=16,
    ):
        chunks = int(ceil((stop - start) / chunk_size))
        for c in range(chunks):
            a = c * chunk_size + start
            b = min(stop, a + chunk_size)
            yield from self.slice(a, b, yield_key, raw, copy)

[docs]    def iter(
            self,
            start=None,
            stop=None,
            yield_key=False,
            raw=False,
            copy=True,
            chunk_size=16,
    ):
        """
        Iterate over the dataset.

        ``start`` and ``stop`` behave like the parameters of the
        ``range`` function0.

        ``copy=False`` allows the reader to use zero-copy mechanisms.
        Data may be returned as ``memoryview`` objects rather than ``bytes``.
        This can improve performance, but also drastically increase memory
        consumption, since one sample can keep the whole slice in memory.

        Parameters:
            start: start of range; if None, current index is used
            stop: stop of range
            yield_key: if True, yields (key, sample) pairs.
            raw: if True, yields samples as msgpacked messages.
            copy: if False, allow the reader to return data as
                  ``memoryview`` objects instead of ``bytes``
            chunk_size: number of samples read at once;
                        bigger values can increase throughput,
                        but require more memory

        Returns:
            Iterator
        """
        n = len(self)

        if start is None:
            start = 0
        else:
            if start < 0:
                start += n
            if start < 0 or start >= n:
                raise IndexError(f'index {start} out of range for length {n} reader')

        start, stop, _ = slice(start, stop).indices(len(self))
        yield from self._iter_impl(
            start,
            stop,
            yield_key=yield_key,
            raw=raw,
            copy=copy,
            chunk_size=chunk_size,
        )

    def __iter__(self):
        return self.iter()