"""
Try to estimate whether zstd's internal multithreading is useful
within the context of compressing borgbackup chunks.

The chunks of data yielded by borg's chunker are usually ~2 MiB
in size (if the file is larger than that). Smaller files often
yield only one chunk containing the whole file content, and such
chunks can be arbitrarily small, down to 1 byte.
"""
import os
import random
import time

from zstd import ZSTD_compress
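# ZSTD_compress(data, level, threads) is provided by the "zstd" PyPI
# package (an assumption: a version recent enough to expose the threads
# parameter); threads > 0 enables zstd's internal multithreading for a
# single compress call.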


def generate_random_data():
    # return chunks sized roughly like borg's, around 2 MiB on average.
    # use fully random data, assuming that incompressible input leads to
    # maximum compression time (and thus gives multi-threading the best
    # chance to show an advantage).
    size = random.randint(512 * 1024, 4096 * 1024)  # 0.5 .. 4 MiB
    return os.urandom(size)


def generate_large_data():
    # yield random chunks until ~10 GiB of data have been produced in total.
    max_length = 10 * 2**30
    generated_length = 0
    while generated_length < max_length:
        data = generate_random_data()
        yield data
        generated_length += len(data)


def compress_large_data(level, workers):
    # compress the whole stream, return total original / compressed lengths.
    dlen = clen = 0
    for data in generate_large_data():
        data_compressed = ZSTD_compress(data, level, workers)
        dlen += len(data)
        clen += len(data_compressed)
    return dlen, clen


def measure_execution_time(level, workers):
    # use perf_counter for wall-clock benchmark timing.
    start_time = time.perf_counter()
    dlen, clen = compress_large_data(level=level, workers=workers)
    end_time = time.perf_counter()
    execution_time = end_time - start_time
    print(f"level: {level}, workers: {workers}, "
          f"len: {dlen / 1e9:.3f} GB, len compressed: {clen / 1e9:.3f} GB, "
          f"time: {execution_time:.1f}s")
for level in [1, 3, 6, 10, 15, 20]:
    for workers in [0, 1, 2, 4]:
        measure_execution_time(level, workers)