"""
Try to estimate if zstd's internal multithreading is useful
within the context of compressing borgbackup chunks.
The chunks of data yielded by borg's chunker are usually ~2MiB
in size (if the file is larger than that). Smaller files often
only yield 1 chunk with the whole file content in it and these
chunks can be arbitrarily small, down to 1B.
"""
import os
import random
import time
from pyzstd import CParameter, compress, zstd_support_multithread
def generate_random_data():
    """Return one chunk of incompressible random bytes.

    Chunk sizes are drawn uniformly from 0.5 MiB .. 4 MiB, i.e. around
    the ~2 MiB chunks borg's chunker typically produces (see module
    docstring). Fully random data is assumed to maximize compression
    time and thus give multi-threading the best possible advantage.
    """
    # NOTE: upper bound is 4096 KiB == 4 MiB (an earlier comment here
    # incorrectly said 2 MiB).
    size = random.randint(512 * 1024, 4096 * 1024)  # 0.5 MiB .. 4 MiB
    return os.urandom(size)
def generate_large_data():
    """Yield random chunks until roughly 10 GiB in total have been produced."""
    target = 10 * 2**30  # stop once ~10 GiB have been yielded
    produced = 0
    while produced < target:
        chunk = generate_random_data()
        yield chunk
        produced += len(chunk)
def compress_large_data(level=1, workers=0):
    """Compress ~10 GiB of random chunks with the given zstd settings.

    Returns a (raw_bytes, compressed_bytes) tuple of total sizes.
    """
    options = {
        CParameter.compressionLevel: level,
        CParameter.nbWorkers: workers,
        CParameter.jobSize: 512 * 1024,  # min==512kiB
    }
    total_raw = 0
    total_compressed = 0
    for chunk in generate_large_data():
        compressed = compress(chunk, options)
        total_raw += len(chunk)
        total_compressed += len(compressed)
    return total_raw, total_compressed
def measure_execution_time(level, workers=0):
    """Compress ~10 GiB with the given settings and print a timing summary.

    workers == 0 means not to create worker threads
    workers == 1 means threading, but only 1 worker
    workers == 2+ means real multithreading, N workers
    """
    # perf_counter is a monotonic, high-resolution clock unaffected by
    # system clock adjustments -- the right tool for measuring durations
    # (time.time can jump, e.g. on NTP corrections, skewing benchmarks).
    start_time = time.perf_counter()
    dlen, clen = compress_large_data(level=level, workers=workers)
    end_time = time.perf_counter()
    execution_time = end_time - start_time
    print(f"level: {level}, workers: {workers}, len: {dlen/1e9:.3f}GB, len compressed: {clen/1e9:.3f}GB, time: {execution_time:.3f}s")
# The benchmark is meaningless if zstd was built without multithreading
# support. Use an explicit check instead of `assert`, which would be
# silently stripped under `python -O`.
if not zstd_support_multithread:
    raise SystemExit("pyzstd reports no zstd multithreading support")

# Sweep compression levels and worker counts to compare single- vs
# multi-threaded compression throughput.
for level in [1, 3, 6, 10, 15, 20]:
    for workers in [0, 1, 2, 4]:
        measure_execution_time(level, workers)