from typing import Sequence
from typing import Tuple
from pathlib import Path
import logging
from ..tools import path_append_suffix
from ..tools import hash_string_bytes
from ..tools import hash_string
from ..tools.msgpack import make_unpacker
from ..tools.msgpack import unpack
from ..tools.msgpack import unpackb
from ..tools.msgpack import pack
import numpy as np
from simplebloom import BloomFilter
# File suffixes for the auxiliary files stored next to a dataset's data
# file; helpers below append the correct one via ``path_append_suffix``.
SUFFIX_LEGACY_INDEX = '.index'  # legacy combined key->offset index (msgpack map)
SUFFIX_KEYS = '.keys'  # msgpack-packed list of sample keys
SUFFIX_KEY_HASHES = '.key_hashes'  # 8-byte salt + big-endian uint64 key hashes
SUFFIX_FILTER = '.filter'  # serialized Bloom filter over the keys
SUFFIX_OFFSETS = '.offsets'  # big-endian uint64 sample offsets in the data file
def keys_len(path: Path) -> int:
    """
    Read the dataset length from the keys file.
    Correct suffix is appended if path ends with a different suffix.
    Parameters:
        path: path to data or keys file
    Returns:
        length of dataset
    """
    keys_path = path_append_suffix(path, SUFFIX_KEYS)
    with keys_path.open('rb') as infile:
        # the keys file is one msgpack array; its header alone gives the
        # element count, so only a few bytes need to be read
        unpacker = make_unpacker(infile, read_size=5)
        return unpacker.read_array_header()
def load_keys(path: Path) -> Sequence[str]:
    """
    Load keys from file.
    Correct suffix is appended if path ends with a different suffix.
    Parameters:
        path: path to data or keys file
    Returns:
        list of keys
    """
    keys_path = path_append_suffix(path, SUFFIX_KEYS)
    with keys_path.open('rb') as infile:
        # file content is a single msgpack-packed list of keys
        return unpack(infile)
def load_key_hashes(path: Path) -> Tuple[bytes, Sequence[int]]:
    """
    Load key hashes from file.
    Correct suffix is appended if path ends with a different suffix.
    Parameters:
        path: path to data or key hashes file
    Returns:
        hash salt and list of key hashes
    """
    hashes_path = path_append_suffix(path, SUFFIX_KEY_HASHES)
    with hashes_path.open('rb') as infile:
        # layout: 8-byte salt, then big-endian uint64 hashes to end of file
        salt = infile.read(8)
        raw_hashes = np.fromfile(infile, dtype=np.dtype('>u8'))
    # convert to native-endian uint64 for fast comparisons
    return salt, raw_hashes.astype(np.uint64)
def load_filter(path: Path) -> BloomFilter:
    """
    Load a Bloom filter from file.
    Correct suffix is appended if path ends with a different suffix.
    Parameters:
        path: path to data or filter file
    Returns:
        the Bloom filter
    """
    filter_path = path_append_suffix(path, SUFFIX_FILTER)
    with filter_path.open('rb') as infile:
        return BloomFilter.load(infile)
def load_offsets(path: Path) -> Sequence[int]:
    """
    Load sample offsets from file.
    First value is always 0 and last is size of data file in bytes,
    so ``len(offsets) = len(dataset) + 1``.
    Correct suffix is appended if path ends with a different suffix.
    Parameters:
        path: path to data or offsets file
    Returns:
        sample offsets in data file
    """
    offsets_path = path_append_suffix(path, SUFFIX_OFFSETS)
    # stored big-endian on disk; convert to native uint64 after reading
    big_endian = np.fromfile(offsets_path, dtype=np.dtype('>u8'))
    return big_endian.astype(np.uint64)
def legacy_index_len(path: Path) -> int:
    """
    Read the dataset length from the legacy index file.
    Correct suffix is appended if path ends with a different suffix.
    Parameters:
        path: path to data or index file
    Returns:
        length of dataset
    """
    index_path = path_append_suffix(path, SUFFIX_LEGACY_INDEX)
    with index_path.open('rb') as infile:
        # the legacy index is one msgpack map; its header alone holds the
        # entry count, so only a few bytes are read
        unpacker = make_unpacker(infile, read_size=5)
        return unpacker.read_map_header()
def legacy_load_index(path: Path) -> Tuple[Sequence[str], Sequence[int]]:
    """
    Load legacy index as two lists of keys and offsets.
    Semantics of the returned lists are the same as for
    ``load_keys`` and ``load_offsets``.
    Correct suffix is appended if path ends with a different suffix.
    Parameters:
        path: Path to dataset or index file
    Returns:
        keys and offsets list
    """
    index_path = path_append_suffix(path, SUFFIX_LEGACY_INDEX)
    with index_path.open('rb', 0) as infile:
        raw = infile.read()
    # object_pairs_hook=list keeps key order as stored in the msgpack map
    key_offset_pairs = unpackb(raw, object_hook=None, object_pairs_hook=list)
    keys = [key for key, _ in key_offset_pairs]
    offsets = [offset for _, offset in key_offset_pairs]
    # final offset is the data file size, so len(offsets) == len(keys) + 1
    offsets.append(path.stat().st_size)
    return keys, offsets
def write_offsets(offsets: Sequence[int], path: Path) -> Path:
    """
    Write list of offsets to file.
    Correct suffix is appended if path ends with a different suffix.
    Parameters:
        offsets: list of offsets
        path: path to data or offsets file
    Returns:
        Path that was written to
    """
    out_path = path_append_suffix(path, SUFFIX_OFFSETS)
    # store as big-endian uint64 to match load_offsets
    encoded = np.array(offsets, dtype=np.dtype('>u8'))
    with out_path.open('wb') as outfile:
        outfile.write(memoryview(encoded))
    return out_path
def write_keys(keys: Sequence[str], path: Path) -> Path:
    """
    Write list of keys to file.
    Correct suffix is appended if path ends with a different suffix.
    Parameters:
        keys: list of keys
        path: path to data or keys file
    Returns:
        Path that was written to
    """
    # fix: docstring previously said "list of offsets" — copy-paste error
    path = path_append_suffix(path, SUFFIX_KEYS)
    with path.open('wb') as f:
        # keys are stored as a single msgpack-packed list
        pack(keys, f)
    return path
def hash_keys(keys: Sequence[str], max_tries: int = 1000) -> Tuple[bytes, Sequence[int]]:
    """
    Apply the :py:func:`hash_string` function to the given
    list of keys, so the returned hashes are 64 bit integers.
    All hashes are salted and guaranteed collision free.
    If necessary this method will try different salt values.
    Parameters:
        keys: list of keys
        max_tries: how many different salt values to try
                   to find collision-free hashes
    Returns:
        used salt and list of key hashes
    Raises:
        RuntimeError: if no collision-free salt is found
                      within ``max_tries`` attempts
    """
    hashes = np.zeros(len(keys), dtype=np.dtype('>u8'))
    salt_int = 0
    # change the salt until there are no more hash collisions
    for _ in range(max_tries):
        salt = hash_string_bytes(str(salt_int))
        seen = set()
        for i, key in enumerate(keys):
            h = hash_string(key, salt=salt)
            if h in seen:
                logging.info('hash collision, retry with different salt')
                salt_int += 1
                break
            seen.add(h)
            hashes[i] = h
        else:
            # inner loop finished without collisions
            return salt, hashes
    # fix: previous message lacked the space after the semicolon
    # ("tries;try increasing")
    raise RuntimeError(
        f'hash collisions after {max_tries} tries; '
        'try increasing max_tries'
    )
def write_key_hashes(keys: Sequence[str], path: Path) -> Path:
    """
    Hash list of keys and write result to file.
    See ``hash_keys`` for details on hash method.
    Correct suffix is appended if path ends with a different suffix.
    Parameters:
        keys: list of keys
        path: path to data or offsets file
    Returns:
        Path that was written to
    """
    salt, hashes = hash_keys(keys)
    out_path = path_append_suffix(path, SUFFIX_KEY_HASHES)
    with out_path.open('wb') as outfile:
        # layout: 8-byte salt followed by big-endian uint64 hashes
        outfile.write(salt)
        outfile.write(memoryview(hashes))
    return out_path
def write_filter(keys: Sequence[str], path: Path) -> Path:
    """
    Create a Bloom filter for the given keys and write result to file.
    Correct suffix is appended if path ends with a different suffix.
    Parameters:
        keys: list of keys
        path: path to data or filter file
    Returns:
        Path that was written to
    """
    # capacity of at least 2, even for empty/singleton key lists
    bloom = BloomFilter(max(2, len(keys)))
    for key in keys:
        bloom += key
    out_path = path_append_suffix(path, SUFFIX_FILTER)
    with out_path.open('wb') as outfile:
        bloom.dump(outfile)
    return out_path