Source code for datadings.index

from typing import Sequence
from typing import Tuple

from pathlib import Path
import logging

from ..tools import path_append_suffix
from ..tools import hash_string_bytes
from ..tools import hash_string
from ..tools.msgpack import make_unpacker
from ..tools.msgpack import unpack
from ..tools.msgpack import unpackb
from ..tools.msgpack import pack

import numpy as np
from simplebloom import BloomFilter


SUFFIX_LEGACY_INDEX = '.index'
SUFFIX_KEYS = '.keys'
SUFFIX_KEY_HASHES = '.key_hashes'
SUFFIX_FILTER = '.filter'
SUFFIX_OFFSETS = '.offsets'


def keys_len(path: Path) -> int:
    """
    Read the dataset length from the keys file.

    Correct suffix is appended if path ends with a different suffix.

    Parameters:
        path: path to data or keys file

    Returns:
        length of dataset
    """
    with path_append_suffix(path, SUFFIX_KEYS).open('rb') as f:
        return make_unpacker(f, read_size=5).read_array_header()

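# Usage sketch (the file name below is hypothetical): a msgpack array header
# occupies at most 5 bytes, so keys_len can read the dataset length without
# parsing the key list itself.
#
#     n = keys_len(Path('dataset.msgpack'))
#     assert n == len(load_keys(Path('dataset.msgpack')))
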
def load_keys(path: Path) -> Sequence[str]:
    """
    Load keys from file.

    Correct suffix is appended if path ends with a different suffix.

    Parameters:
        path: path to data or keys file

    Returns:
        list of keys
    """
    with path_append_suffix(path, SUFFIX_KEYS).open('rb') as f:
        return unpack(f)

def load_key_hashes(path: Path) -> Tuple[bytes, Sequence[int]]:
    """
    Load key hashes from file.

    Correct suffix is appended if path ends with a different suffix.

    Parameters:
        path: path to data or key hashes file

    Returns:
        hash salt and list of key hashes
    """
    with path_append_suffix(path, SUFFIX_KEY_HASHES).open('rb') as f:
        salt = f.read(8)
        return salt, np.fromfile(f, dtype=np.dtype('>u8')).astype(np.uint64)

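# Lookup sketch (file name and key are hypothetical): hashing a query key
# with the stored salt and scanning the hash array yields the sample index,
# since hashes are collision free (see hash_keys below).
#
#     salt, hashes = load_key_hashes(Path('dataset.msgpack'))
#     h = hash_string('some-key', salt=salt)
#     matches = np.flatnonzero(hashes == np.uint64(h))
#     index = int(matches[0]) if len(matches) else None  # None: key absent
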
def load_filter(path: Path) -> BloomFilter:
    """
    Load a Bloom filter from file.

    Correct suffix is appended if path ends with a different suffix.

    Parameters:
        path: path to data or filter file

    Returns:
        the Bloom filter
    """
    with path_append_suffix(path, SUFFIX_FILTER).open('rb') as f:
        return BloomFilter.load(f)

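# Pre-check sketch (file name and key are hypothetical): a Bloom filter
# answers "definitely absent" exactly and "maybe present" with a small
# false-positive rate, making it a cheap test before a full key lookup.
#
#     bf = load_filter(Path('dataset.msgpack'))
#     if 'some-key' in bf:
#         pass  # maybe present; confirm via keys or key hashes
#     else:
#         pass  # definitely absent
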
def load_offsets(path: Path) -> Sequence[int]:
    """
    Load sample offsets from file.
    First value is always 0 and last is size of data file in bytes,
    so ``len(offsets) = len(dataset) + 1``.

    Correct suffix is appended if path ends with a different suffix.

    Parameters:
        path: path to data or offsets file

    Returns:
        sample offsets in data file
    """
    return np.fromfile(
        path_append_suffix(path, SUFFIX_OFFSETS),
        dtype=np.dtype('>u8'),
    ).astype(np.uint64)

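# Read sketch (file name and index are hypothetical; samples are assumed to
# be stored as consecutive msgpack records): sample i occupies bytes
# offsets[i]:offsets[i+1] of the data file.
#
#     offsets = load_offsets(Path('dataset.msgpack'))
#     with Path('dataset.msgpack').open('rb') as f:
#         f.seek(offsets[3])
#         sample = unpackb(f.read(int(offsets[4] - offsets[3])))
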
def legacy_index_len(path: Path) -> int:
    """
    Read the dataset length from the legacy index file.

    Correct suffix is appended if path ends with a different suffix.

    Parameters:
        path: path to data or index file

    Returns:
        length of dataset
    """
    with path_append_suffix(path, SUFFIX_LEGACY_INDEX).open('rb') as f:
        return make_unpacker(f, read_size=5).read_map_header()

def legacy_load_index(path: Path) -> Tuple[Sequence[str], Sequence[int]]:
    """
    Load legacy index as two lists of keys and offsets.
    Semantics of the returned lists are the same as for
    ``load_keys`` and ``load_offsets``.

    Correct suffix is appended if path ends with a different suffix.

    Parameters:
        path: Path to dataset or index file

    Returns:
        keys and offsets list
    """
    with path_append_suffix(path, SUFFIX_LEGACY_INDEX).open('rb', 0) as f:
        data = f.read()
    pairs = unpackb(data, object_hook=None, object_pairs_hook=list)
    positions = [p for _, p in pairs]
    positions.append(path.stat().st_size)
    return [k for k, _ in pairs], positions

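# Migration sketch (file name is hypothetical): a legacy '.index' file can be
# converted to the newer per-purpose files by loading it once and re-writing
# its parts with the writer functions defined below.
#
#     keys, offsets = legacy_load_index(Path('dataset.msgpack'))
#     write_keys(keys, Path('dataset.msgpack'))
#     write_offsets(offsets, Path('dataset.msgpack'))
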
def write_offsets(offsets: Sequence[int], path: Path) -> Path:
    """
    Write list of offsets to file.

    Correct suffix is appended if path ends with a different suffix.

    Parameters:
        offsets: list of offsets
        path: path to data or offsets file

    Returns:
        Path that was written to
    """
    offsets = np.array(offsets, dtype=np.dtype('>u8'))
    path = path_append_suffix(path, SUFFIX_OFFSETS)
    with path.open('wb') as f:
        f.write(memoryview(offsets))
    return path

def write_keys(keys: Sequence[str], path: Path) -> Path:
    """
    Write list of keys to file.

    Correct suffix is appended if path ends with a different suffix.

    Parameters:
        keys: list of keys
        path: path to data or keys file

    Returns:
        Path that was written to
    """
    path = path_append_suffix(path, SUFFIX_KEYS)
    with path.open('wb') as f:
        pack(keys, f)
    return path

def hash_keys(
        keys: Sequence[str],
        max_tries: int = 1000,
) -> Tuple[bytes, Sequence[int]]:
    """
    Apply the :py:func:`hash_string` function to the given list of keys,
    so the returned hashes are 64 bit integers.
    All hashes are salted and guaranteed collision free.
    If necessary, this method will try different salt values.

    Parameters:
        keys: list of keys
        max_tries: how many different salt values to try
                   to find collision-free hashes

    Returns:
        used salt and list of key hashes
    """
    hashes = np.zeros(len(keys), dtype=np.dtype('>u8'))
    salt_int = 0
    # change the salt until there are no more hash collisions
    for _ in range(max_tries):
        salt = hash_string_bytes(str(salt_int))
        seen = set()
        for i, key in enumerate(keys):
            h = hash_string(key, salt=salt)
            if h in seen:
                logging.info('hash collision, retry with different salt')
                salt_int += 1
                break
            seen.add(h)
            hashes[i] = h
        else:
            # no collisions for this salt, all hashes are valid
            return salt, hashes
    raise RuntimeError(
        f'hash collisions after {max_tries} tries; '
        'try increasing max_tries'
    )

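# Self-check sketch (keys are made up): the returned hashes are unique by
# construction, so a set of them has the same length as the key list.
#
#     salt, hashes = hash_keys(['a', 'b', 'c'])
#     assert len(set(hashes.tolist())) == 3
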
def write_key_hashes(keys: Sequence[str], path: Path) -> Path:
    """
    Hash list of keys and write result to file.
    See ``hash_keys`` for details on hash method.

    Correct suffix is appended if path ends with a different suffix.

    Parameters:
        keys: list of keys
        path: path to data or key hashes file

    Returns:
        Path that was written to
    """
    salt, hashes = hash_keys(keys)
    path = path_append_suffix(path, SUFFIX_KEY_HASHES)
    with path.open('wb') as f:
        f.write(salt)
        f.write(memoryview(hashes))
    return path

def write_filter(keys: Sequence[str], path: Path) -> Path:
    """
    Create a Bloom filter for the given keys and write result to file.

    Correct suffix is appended if path ends with a different suffix.

    Parameters:
        keys: list of keys
        path: path to data or filter file

    Returns:
        Path that was written to
    """
    bf = BloomFilter(max(2, len(keys)))
    for k in keys:
        bf += k
    path = path_append_suffix(path, SUFFIX_FILTER)
    with path.open('wb') as f:
        bf.dump(f)
    return path

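# End-to-end writing sketch (keys and offsets are made up): given sample keys
# and their byte offsets in a data file, all four index files can be written
# alongside it.
#
#     path = Path('dataset.msgpack')
#     keys = ['sample-0', 'sample-1']
#     offsets = [0, 1234, 2048]  # len(offsets) == len(keys) + 1
#     write_keys(keys, path)
#     write_key_hashes(keys, path)
#     write_offsets(offsets, path)
#     write_filter(keys, path)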