Module hub.core.chunk_engine.chunker
Expand source code
import numpy as np
from typing import Generator, Optional, List
from hub.util.exceptions import ChunkSizeTooSmallError
def generate_chunks(
    content_bytes: bytes,
    chunk_size: int,
    bytes_left_in_last_chunk: int = 0,
) -> Generator[bytes, None, None]:
    """Generator function that chunks bytes.

    Chunking is the process of taking the input `content_bytes` and breaking it up into a
    sequence of smaller bytes objects called "chunks". Every chunk after the optional
    "fill" chunk (see `bytes_left_in_last_chunk`) has a size <= `chunk_size`.

    Example:
        content_bytes = b"1234567890123"
        chunk_size = 4
        yields:
            b"1234"
            b"5678"
            b"9012"
            b"3"

    Args:
        content_bytes (bytes): Bytes object with the data to be chunked.
        chunk_size (int): Each individual chunk will be assigned this many bytes maximum.
        bytes_left_in_last_chunk (int): If chunks were created already, `bytes_left_in_last_chunk`
            should be set to `chunk_size - len(last_chunk)`. This is so the generator's
            first output will be enough bytes to fill that chunk up to `chunk_size`.
            The value is not checked against `chunk_size`: if it is larger, the first
            yielded chunk can be longer than `chunk_size`.

    Yields:
        bytes: Chunk of the `content_bytes`. Never empty; has a length on the closed
            interval [1, `chunk_size`] when `bytes_left_in_last_chunk` <= `chunk_size`.
            Nothing is yielded when `content_bytes` is empty.

    Raises:
        ChunkSizeTooSmallError: If `chunk_size` <= 0
        ValueError: If `bytes_left_in_last_chunk` < 0

    Note:
        This is a generator function, so the input validation (and therefore the
        exceptions above) only runs once iteration starts, not when the function is called.
    """

    # validate inputs
    if chunk_size <= 0:
        raise ChunkSizeTooSmallError()
    if bytes_left_in_last_chunk < 0:
        # TODO: move error message to separate file
        raise ValueError("Bytes left in last chunk must be >= 0.")

    num_bytes = len(content_bytes)
    if num_bytes <= 0:
        return

    total_bytes_yielded = 0

    # first, yield just enough bytes to top up the previously written chunk
    # (the amount is provided as `bytes_left_in_last_chunk`)
    if bytes_left_in_last_chunk > 0:
        chunk = content_bytes[:bytes_left_in_last_chunk]
        yield chunk
        # count what was actually yielded: the slice is shorter than requested
        # when `content_bytes` has fewer bytes than `bytes_left_in_last_chunk`
        total_bytes_yielded += len(chunk)

    # yield all new chunks
    while total_bytes_yielded < num_bytes:
        end = total_bytes_yielded + chunk_size
        chunk = content_bytes[total_bytes_yielded:end]
        yield chunk
        total_bytes_yielded += len(chunk)
def join_chunks(chunks: List[bytes], start_byte: int, end_byte: Optional[int]) -> bytes:
    """Given a list of bytes that represent sequential chunks, join them into one bytes object.

    For more on chunking, see the `generate_chunks` method.

    Example:
        chunks = [b"123", b"456", b"789"]
        start_byte = 1
        end_byte = 2
        returns:
            b"2345678"

    Args:
        chunks (list[bytes]): Sequential list of bytes objects that represent chunks.
        start_byte (int): The first chunk in the sequence will ignore the bytes before `start_byte`.
            If 0, all bytes are included.
        end_byte (int, optional): The last chunk in the sequence will ignore the bytes at and after
            `end_byte` (the last byte kept is `chunk[end_byte - 1]`). If None, all bytes are included.

    Notes:
        Bytes are indexed using: chunk[start_byte:end_byte]. That is why `chunk[end_byte]` will not
        be included in `chunk[start_byte:end_byte]`.
        If `len(chunks) == 1`, `start_byte`:`end_byte` will be applied to the same chunk
        (the first and last one).

    Returns:
        bytes: The chunks joined as one bytes object. Empty if `chunks` is empty.
    """

    last_index = len(chunks) - 1
    indexed_chunks = []

    for i, chunk in enumerate(chunks):
        # only the first chunk is trimmed at the front and only the last one at the back;
        # with a single chunk both bounds apply to the same slice
        actual_start_byte = start_byte if i == 0 else 0
        actual_end_byte = end_byte if i == last_index else len(chunk)
        indexed_chunks.append(chunk[actual_start_byte:actual_end_byte])

    return b"".join(indexed_chunks)
Functions
def generate_chunks(content_bytes: bytes, chunk_size: int, bytes_left_in_last_chunk: int = 0) ‑> Generator[bytes, NoneType, NoneType]
-
Generator function that chunks bytes.
Chunking is the process of taking the input `content_bytes` and breaking it up into a sequence of smaller bytes objects called "chunks". The size of each chunk is <= `chunk_size`.
Example
content_bytes = b"1234567890123"
chunk_size = 4
yields:
b"1234"
b"5678"
b"9012"
b"3"
Args
content_bytes
:bytes
- Bytes object with the data to be chunked.
chunk_size
:int
- Each individual chunk will be assigned this many bytes maximum.
bytes_left_in_last_chunk
:int
- If chunks were created already, `bytes_left_in_last_chunk` should be set to `chunk_size - len(last_chunk)`. This is so the generator's first output will be enough bytes to fill that chunk up to `chunk_size`.
Yields
bytes
- Chunk of the `content_bytes`. Will have length on the interval [1, `chunk_size`].
Raises
ChunkSizeTooSmallError
- If `chunk_size` <= 0
ValueError
- If `bytes_left_in_last_chunk` < 0
Expand source code
def generate_chunks(
    content_bytes: bytes,
    chunk_size: int,
    bytes_left_in_last_chunk: int = 0,
) -> Generator[bytes, None, None]:
    """Generator function that chunks bytes.

    Chunking is the process of taking the input `content_bytes` and breaking it up into a
    sequence of smaller bytes objects called "chunks". Every chunk after the optional
    "fill" chunk (see `bytes_left_in_last_chunk`) has a size <= `chunk_size`.

    Example:
        content_bytes = b"1234567890123"
        chunk_size = 4
        yields:
            b"1234"
            b"5678"
            b"9012"
            b"3"

    Args:
        content_bytes (bytes): Bytes object with the data to be chunked.
        chunk_size (int): Each individual chunk will be assigned this many bytes maximum.
        bytes_left_in_last_chunk (int): If chunks were created already, `bytes_left_in_last_chunk`
            should be set to `chunk_size - len(last_chunk)`. This is so the generator's
            first output will be enough bytes to fill that chunk up to `chunk_size`.
            The value is not checked against `chunk_size`: if it is larger, the first
            yielded chunk can be longer than `chunk_size`.

    Yields:
        bytes: Chunk of the `content_bytes`. Never empty; has a length on the closed
            interval [1, `chunk_size`] when `bytes_left_in_last_chunk` <= `chunk_size`.
            Nothing is yielded when `content_bytes` is empty.

    Raises:
        ChunkSizeTooSmallError: If `chunk_size` <= 0
        ValueError: If `bytes_left_in_last_chunk` < 0

    Note:
        This is a generator function, so the input validation (and therefore the
        exceptions above) only runs once iteration starts, not when the function is called.
    """

    # validate inputs
    if chunk_size <= 0:
        raise ChunkSizeTooSmallError()
    if bytes_left_in_last_chunk < 0:
        # TODO: move error message to separate file
        raise ValueError("Bytes left in last chunk must be >= 0.")

    num_bytes = len(content_bytes)
    if num_bytes <= 0:
        return

    total_bytes_yielded = 0

    # first, yield just enough bytes to top up the previously written chunk
    # (the amount is provided as `bytes_left_in_last_chunk`)
    if bytes_left_in_last_chunk > 0:
        chunk = content_bytes[:bytes_left_in_last_chunk]
        yield chunk
        # count what was actually yielded: the slice is shorter than requested
        # when `content_bytes` has fewer bytes than `bytes_left_in_last_chunk`
        total_bytes_yielded += len(chunk)

    # yield all new chunks
    while total_bytes_yielded < num_bytes:
        end = total_bytes_yielded + chunk_size
        chunk = content_bytes[total_bytes_yielded:end]
        yield chunk
        total_bytes_yielded += len(chunk)
def join_chunks(chunks: List[bytes], start_byte: int, end_byte: int) ‑> bytes
-
Given a list of bytes that represent sequential chunks, join them into one bytes object. For more on chunking, see the `generate_chunks()` method.
Example
chunks = [b"123", b"456", b"789"]
start_byte = 1
end_byte = 2
returns:
b"2345678"
Args
chunks
:list[bytes]
- Sequential list of bytes objects that represent chunks.
start_byte
:int
- The first chunk in the sequence will ignore the bytes before `start_byte`. If 0, all bytes are included.
end_byte
:int
- The last chunk in the sequence will ignore the bytes at and after `end_byte`. If None, all bytes are included.
Notes
Bytes are indexed using: chunk[start_byte:end_byte]. That is why `chunk[end_byte]` will not be included in `chunk[start_byte:end_byte]`. If `len(chunks) == 1`, `start_byte`:`end_byte` will be applied to the same chunk (the first and last one).
Returns
bytes
- The chunks joined as one bytes object.
Expand source code
def join_chunks(chunks: List[bytes], start_byte: int, end_byte: Optional[int]) -> bytes:
    """Given a list of bytes that represent sequential chunks, join them into one bytes object.

    For more on chunking, see the `generate_chunks` method.

    Example:
        chunks = [b"123", b"456", b"789"]
        start_byte = 1
        end_byte = 2
        returns:
            b"2345678"

    Args:
        chunks (list[bytes]): Sequential list of bytes objects that represent chunks.
        start_byte (int): The first chunk in the sequence will ignore the bytes before `start_byte`.
            If 0, all bytes are included.
        end_byte (int, optional): The last chunk in the sequence will ignore the bytes at and after
            `end_byte` (the last byte kept is `chunk[end_byte - 1]`). If None, all bytes are included.

    Notes:
        Bytes are indexed using: chunk[start_byte:end_byte]. That is why `chunk[end_byte]` will not
        be included in `chunk[start_byte:end_byte]`.
        If `len(chunks) == 1`, `start_byte`:`end_byte` will be applied to the same chunk
        (the first and last one).

    Returns:
        bytes: The chunks joined as one bytes object. Empty if `chunks` is empty.
    """

    last_index = len(chunks) - 1
    indexed_chunks = []

    for i, chunk in enumerate(chunks):
        # only the first chunk is trimmed at the front and only the last one at the back;
        # with a single chunk both bounds apply to the same slice
        actual_start_byte = start_byte if i == 0 else 0
        actual_end_byte = end_byte if i == last_index else len(chunk)
        indexed_chunks.append(chunk[actual_start_byte:actual_end_byte])

    return b"".join(indexed_chunks)