Module hub.core.chunk_engine.read

Source code
import os
import pickle
import numpy as np

from .chunker import join_chunks

from hub import constants
from hub.util.keys import get_meta_key, get_index_map_key

from hub.core.typing import StorageProvider
from typing import Callable, List, Union


def read_tensor_meta(key: str, storage: StorageProvider):
    """Read and unpickle the tensor meta for `key` from `storage`."""
    return pickle.loads(storage[get_meta_key(key)])


def read_dataset_meta(storage: StorageProvider):
    """Read and unpickle the dataset meta from `storage`."""
    return pickle.loads(storage[constants.META_FILENAME])


def read_array(
    key: str,
    storage: StorageProvider,
    array_slice: slice = slice(None),
) -> np.ndarray:
    """Read and join chunks into an array from storage.

    Args:
        key (str): Key for where the chunks, index_map, and meta are located in `storage` relative to its root.
        storage (StorageProvider): StorageProvider for reading the chunks, index_map, and meta.
        array_slice (slice): Slice that represents which samples to read. Defaults to a slice representing all samples.

    Returns:
        np.ndarray: Array containing the sample(s) in the `array_slice` slice.
    """

    # TODO: don't use pickle
    meta = read_tensor_meta(key, storage)
    index_map = pickle.loads(storage[get_index_map_key(key)])

    # TODO: read samples in parallel
    samples = []
    for index_entry in index_map[array_slice]:
        # fetch the raw bytes of every chunk this sample spans
        chunks = []
        for chunk_name in index_entry["chunk_names"]:
            chunk_key = os.path.join(key, "chunks", chunk_name)
            chunk = storage[chunk_key]

            chunks.append(chunk)

        # join the chunks and trim to this sample's byte range
        combined_bytes = join_chunks(
            chunks,
            index_entry["start_byte"],
            index_entry["end_byte"],
        )

        # reinterpret the bytes with the tensor's dtype and restore the sample's shape
        out_array = np.frombuffer(combined_bytes, dtype=meta["dtype"])
        samples.append(out_array.reshape(index_entry["shape"]))

    return np.array(samples)
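
To see how these pieces fit together, here is a minimal end-to-end sketch. It assumes a plain dict can stand in for a dict-like StorageProvider, that join_chunks trims the joined byte stream to the [start_byte, end_byte) range, and that os.path.join produces forward slashes (POSIX); the key and chunk names are hypothetical.

import pickle
import numpy as np

from hub.core.chunk_engine.read import read_array
from hub.util.keys import get_index_map_key, get_meta_key

key = "my_tensor"  # hypothetical tensor key
storage = {}  # stand-in for a StorageProvider (dict-like __getitem__)

sample = np.arange(6, dtype=np.int64).reshape(2, 3)
raw = sample.tobytes()

# One chunk holding the entire sample; the key matches os.path.join(key, "chunks", ...) on POSIX.
storage[key + "/chunks/chunk_0"] = raw

# The index_map entry records which chunks the sample spans, its byte range, and its shape.
storage[get_index_map_key(key)] = pickle.dumps(
    [
        {
            "chunk_names": ["chunk_0"],
            "start_byte": 0,
            "end_byte": len(raw),
            "shape": sample.shape,
        }
    ]
)

# The tensor meta needs at least the dtype that np.frombuffer should use.
storage[get_meta_key(key)] = pickle.dumps({"dtype": "int64"})

out = read_array(key, storage)
print(out.shape)  # (1, 2, 3): one sample of shape (2, 3)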

Functions

def read_array(key: str, storage: StorageProvider, array_slice: slice = slice(None, None, None)) -> numpy.ndarray

Read and join chunks into an array from storage.

Args

key : str
Key for where the chunks, index_map, and meta are located in storage relative to its root.
storage : StorageProvider
StorageProvider for reading the chunks, index_map, and meta.
array_slice : slice
Slice that represents which samples to read. Defaults to a slice representing all samples.

Returns

np.ndarray
Array containing the sample(s) in the array_slice slice.
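
For example, assuming storage already holds a tensor under a hypothetical key "images", a subset of samples can be read with an ordinary slice:

first_ten = read_array("images", storage, slice(0, 10))  # samples 0 through 9
every_other = read_array("images", storage, slice(None, None, 2))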
def read_dataset_meta(storage: StorageProvider)

Read and unpickle the dataset meta from storage.
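
A usage sketch: the dataset meta is whatever object was pickled under constants.META_FILENAME, so the round trip looks like this (the dict contents and the plain-dict storage stand-in are hypothetical):

import pickle
from hub import constants

storage = {}  # stand-in for a StorageProvider
storage[constants.META_FILENAME] = pickle.dumps({"tensors": ["images", "labels"]})

meta = read_dataset_meta(storage)  # -> {"tensors": ["images", "labels"]}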
def read_tensor_meta(key: str, storage: StorageProvider)

Read and unpickle the tensor meta for key from storage.
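
A usage sketch along the same lines, with a hypothetical tensor key and meta contents:

import pickle
from hub.util.keys import get_meta_key

storage = {}  # stand-in for a StorageProvider
storage[get_meta_key("images")] = pickle.dumps({"dtype": "uint8"})

meta = read_tensor_meta("images", storage)  # -> {"dtype": "uint8"}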