pyzstd-mt-test.py

"""

Try to estimate if zstd's internal multithreading is useful

within the context of compressing borgbackup chunks.

The chunks of data yielded by borg's chunker are usually ~2MiB

in size (if the file is larger than that). Smaller files often

only yield 1 chunk with the whole file content in it and these

chunks can be arbitrarily small, down to 1B.

"""

import os
import random
import time

from pyzstd import CParameter, compress, zstd_support_multithread

def generate_random_data():
    # return chunks with a size around 2 MiB, like borg's chunker.
    # return fully random data, assuming that this leads to
    # maximum compression time (and thus gives multi-threading
    # an advantage).
    size = random.randint(512 * 1024, 4096 * 1024)  # 0.5 .. 4 MiB
    return os.urandom(size)
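
# Hedged side note (not used by the benchmark below): os.urandom() output is
# essentially incompressible, so zstd's match finder may give up on it quickly.
# To also exercise the compressible-data path, a hypothetical generator like
# this one (repeating a small random block) could be swapped in:
def generate_compressible_data():
    block = os.urandom(64 * 1024)
    repeats = random.randint(8, 64)  # 0.5 .. 4 MiB total, same range as above
    return block * repeats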

def generate_large_data():
    max_length = 10 * 2**30
    generated_length = 0
    while generated_length < max_length:
        data = generate_random_data()
        yield data
        generated_length += len(data)

def compress_large_data(level=1, workers=0):
    options = {CParameter.compressionLevel: level,
               CParameter.nbWorkers: workers,
               CParameter.jobSize: 512 * 1024,  # 512 KiB is the minimum jobSize
               }
    dlen = clen = 0
    for data in generate_large_data():
        data_compressed = compress(data, options)
        dlen += len(data)
        clen += len(data_compressed)
    return dlen, clen
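
# Minimal sketch of a variant (an assumption, not part of the original test):
# the one-shot compress() above sets up a fresh compression context, including
# its worker threads, for every chunk. Reusing one ZstdCompressor and ending a
# frame per chunk should amortize that startup cost:
def compress_large_data_reused(level=1, workers=0):
    from pyzstd import ZstdCompressor  # local import keeps this sketch self-contained
    options = {CParameter.compressionLevel: level,
               CParameter.nbWorkers: workers,
               CParameter.jobSize: 512 * 1024,
               }
    c = ZstdCompressor(options)
    dlen = clen = 0
    for data in generate_large_data():
        # FLUSH_FRAME closes a complete zstd frame per chunk, matching the
        # framing behavior of the one-shot compress()
        data_compressed = c.compress(data, c.FLUSH_FRAME)
        dlen += len(data)
        clen += len(data_compressed)
    return dlen, clen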

def measure_execution_time(level, workers=0):
    # workers == 0 means not to create worker threads
    # workers == 1 means threading, but only 1 worker
    # workers == 2+ means real multithreading, N workers
    start_time = time.time()
    dlen, clen = compress_large_data(level=level, workers=workers)
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"level: {level}, workers: {workers}, len: {dlen/1e9:.3f}GB, "
          f"len compressed: {clen/1e9:.3f}GB, time: {execution_time:.3f}s")

assert zstd_support_multithread

for level in [1, 3, 6, 10, 15, 20]:
    for workers in [0, 1, 2, 4]:
        measure_execution_time(level, workers)