Module hub.core.chunk_engine.tests.test_chunker
Expand source code
import pytest
from hub.core.chunk_engine import generate_chunks, join_chunks
from typing import List, Optional, Tuple
# chunk_size,bytes_batch,expected_chunks
PERFECT_FIT: Tuple = (
(1, [b"1"], [b"1"]),
(1, [b"1", b"2"], [b"1", b"2"]),
(1, [b"1", b"2", b"3"], [b"1", b"2", b"3"]),
(1, [b"1234"], [b"1", b"2", b"3", b"4"]),
(4, [b"1234"], [b"1234"]),
(4, [b"1234", b"5678"], [b"1234", b"5678"]),
(4, [b"12345678"], [b"1234", b"5678"]),
(10, [b"12", b"3456", b"78", b"9", b"0"], [b"1234567890"]),
)
# chunk_size,bytes_batch,expected_chunks
PARTIAL_FIT: Tuple = (
(1, [b""], []),
(2, [b"1"], [b"1"]),
(2, [b"123"], [b"12", b"3"]),
(4, [b"123"], [b"123"]),
(4, [b"1234567"], [b"1234", b"567"]),
(4, [b"1234567"], [b"1234", b"567"]),
(8, [b"1", b"2", b"3", b"4", b"5", b"6", b"7"], [b"1234567"]),
)
# chunks,start_byte,end_byte,expected_bytes
JOIN_CHUNKS: Tuple = (
([], 0, None, b""),
([b"123"], 0, None, b"123"),
([b"123"], 0, 2, b"12"),
([b"123"], 0, 1, b"1"),
([b"1", b"2", b"3"], 0, None, b"123"),
# end_byte=2 which is larger than the length of the last chunk (=1)
([b"1", b"2", b"3"], 0, 2, b"123"),
([b"1", b"2", b"3456789"], 0, 2, b"1234"),
([b"1", b"23", b"4567890"], 0, 2, b"12345"),
([b"1234567890"], 0, 5, b"12345"),
([b"12345678", b"12345678", b"12345678"], 2, 5, b"3456781234567812345"),
)
@pytest.mark.parametrize("chunk_size,bytes_batch,expected_chunks", PERFECT_FIT)
def test_generate_perfect_fit(
chunk_size: int, bytes_batch: List[bytes], expected_chunks: List[bytes]
):
"""All output chunks should be equal in length to `chunk_size`."""
run_generate_chunks_test(chunk_size, bytes_batch, expected_chunks)
@pytest.mark.parametrize("chunk_size,bytes_batch,expected_chunks", PARTIAL_FIT)
def test_generate_partial_fit(
chunk_size: int, bytes_batch: List[bytes], expected_chunks: List[bytes]
):
"""Output chunks may differ in length to `chunk_size`."""
run_generate_chunks_test(chunk_size, bytes_batch, expected_chunks)
@pytest.mark.parametrize("chunks,start_byte,end_byte,expected_bytes", JOIN_CHUNKS)
def test_join_chunks(
chunks: List[bytes], start_byte: int, end_byte: int, expected_bytes: bytes
):
actual_bytes = join_chunks(chunks, start_byte, end_byte)
assert actual_bytes == expected_bytes
def run_generate_chunks_test(
chunk_size: int, bytes_batch: List[bytes], expected_chunks: List[bytes]
):
"""
This method iterates through the `chunk_generator(...)` & keeps a running list of the chunks.
When a chunk is yielded, it either adds it to the end of the previous chunk yielded (if it
exists & is not full) or creates a new chunk.
"""
actual_chunks: List[bytearray] = []
bytes_left_in_last_chunk: int = 0
for bytes_object in bytes_batch:
for chunk_bytes in generate_chunks(
bytes_object,
chunk_size,
bytes_left_in_last_chunk=bytes_left_in_last_chunk,
):
chunk = bytearray(chunk_bytes)
# fill last chunk if possible, otherwise create new chunk
if len(actual_chunks) <= 0 or len(actual_chunks[-1]) >= chunk_size:
actual_chunks.append(chunk)
else:
actual_chunks[-1].extend(chunk)
bytes_left_in_last_chunk = chunk_size - len(chunk)
assert actual_chunks == expected_chunks
Functions
def run_generate_chunks_test(chunk_size: int, bytes_batch: List[bytes], expected_chunks: List[bytes])
-
This method iterates through the
chunk_generator(…)
& keeps a running list of the chunks.When a chunk is yielded, it either adds it to the end of the previous chunk yielded (if it exists & is not full) or creates a new chunk.
Expand source code
def run_generate_chunks_test( chunk_size: int, bytes_batch: List[bytes], expected_chunks: List[bytes] ): """ This method iterates through the `chunk_generator(...)` & keeps a running list of the chunks. When a chunk is yielded, it either adds it to the end of the previous chunk yielded (if it exists & is not full) or creates a new chunk. """ actual_chunks: List[bytearray] = [] bytes_left_in_last_chunk: int = 0 for bytes_object in bytes_batch: for chunk_bytes in generate_chunks( bytes_object, chunk_size, bytes_left_in_last_chunk=bytes_left_in_last_chunk, ): chunk = bytearray(chunk_bytes) # fill last chunk if possible, otherwise create new chunk if len(actual_chunks) <= 0 or len(actual_chunks[-1]) >= chunk_size: actual_chunks.append(chunk) else: actual_chunks[-1].extend(chunk) bytes_left_in_last_chunk = chunk_size - len(chunk) assert actual_chunks == expected_chunks
def test_generate_partial_fit(chunk_size: int, bytes_batch: List[bytes], expected_chunks: List[bytes])
-
Output chunks may differ in length to
chunk_size
.Expand source code
@pytest.mark.parametrize("chunk_size,bytes_batch,expected_chunks", PARTIAL_FIT) def test_generate_partial_fit( chunk_size: int, bytes_batch: List[bytes], expected_chunks: List[bytes] ): """Output chunks may differ in length to `chunk_size`.""" run_generate_chunks_test(chunk_size, bytes_batch, expected_chunks)
def test_generate_perfect_fit(chunk_size: int, bytes_batch: List[bytes], expected_chunks: List[bytes])
-
All output chunks should be equal in length to
chunk_size
.Expand source code
@pytest.mark.parametrize("chunk_size,bytes_batch,expected_chunks", PERFECT_FIT) def test_generate_perfect_fit( chunk_size: int, bytes_batch: List[bytes], expected_chunks: List[bytes] ): """All output chunks should be equal in length to `chunk_size`.""" run_generate_chunks_test(chunk_size, bytes_batch, expected_chunks)
def test_join_chunks(chunks: List[bytes], start_byte: int, end_byte: int, expected_bytes: bytes)
-
Expand source code
@pytest.mark.parametrize("chunks,start_byte,end_byte,expected_bytes", JOIN_CHUNKS) def test_join_chunks( chunks: List[bytes], start_byte: int, end_byte: int, expected_bytes: bytes ): actual_bytes = join_chunks(chunks, start_byte, end_byte) assert actual_bytes == expected_bytes