"""
Try to estimate whether zstd's internal multithreading is useful
within the context of compressing borgbackup chunks.

The chunks of data yielded by borg's chunker are usually ~2 MiB
in size (if the file is larger than that). Smaller files often
yield only one chunk containing the whole file content, and such
chunks can be arbitrarily small, down to 1 byte.
"""
import os
import random
import time

from zstd import ZSTD_compress
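# ZSTD_compress(data, level, threads) is provided by the "zstd" PyPI
# package (an assumption: a version recent enough to expose the threads
# parameter); threads > 0 enables zstd's internal multithreading for a
# single compress call.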


def generate_random_data():
    # return chunks sized roughly like borg's, around 2 MiB on average.
    # use fully random data, assuming that incompressible input leads to
    # maximum compression time (and thus gives multi-threading the best
    # chance to show an advantage).
    size = random.randint(512 * 1024, 4096 * 1024)  # 0.5 .. 4 MiB
    return os.urandom(size)


def generate_large_data():
    # yield random chunks until ~10 GiB of data have been produced in total.
    max_length = 10 * 2**30
    generated_length = 0
    while generated_length < max_length:
        data = generate_random_data()
        yield data
        generated_length += len(data)


def compress_large_data(level, workers):
    # compress the whole stream, return total original / compressed lengths.
    dlen = clen = 0
    for data in generate_large_data():
        data_compressed = ZSTD_compress(data, level, workers)
        dlen += len(data)
        clen += len(data_compressed)
    return dlen, clen


def measure_execution_time(level, workers):
    # use perf_counter for wall-clock benchmark timing.
    start_time = time.perf_counter()
    dlen, clen = compress_large_data(level=level, workers=workers)
    end_time = time.perf_counter()
    execution_time = end_time - start_time
    print(f"level: {level}, workers: {workers}, "
          f"len: {dlen / 1e9:.3f} GB, len compressed: {clen / 1e9:.3f} GB, "
          f"time: {execution_time:.1f}s")
for level in [1, 3, 6, 10, 15, 20]:
    for workers in [0, 1, 2, 4]:
        measure_execution_time(level, workers)