""" Try to estimate if zstd's internal multithreading is useful within the context of compressing borgbackup chunks. The chunks of data yielded by borg's chunker are usually ~2MiB in size (if the file is larger than that). Smaller files often only yield 1 chunk with the whole file content in it and these chunks can be arbitrarily small, down to 1B. """ import os import random import time from zstd import ZSTD_compress def generate_random_data(): # return chunks with a size around 2MiB. # return fully random data assuming that this leads to # maximum compression time (and thus giving multi-threading # an advantage). size = random.randint(512 * 1024, 4096 * 1024) # 0.5 .. 2 MiB return os.urandom(size) def generate_large_data(): max_length = 10 * 2**30 generated_length = 0 while generated_length < max_length: data = generate_random_data() yield data generated_length += len(data) def compress_large_data(level, workers): dlen = clen = 0 for data in generate_large_data(): data_compressed = ZSTD_compress(data, level, workers) dlen += len(data) clen += len(data_compressed) return dlen, clen def measure_execution_time(level, workers): start_time = time.time() dlen, clen = compress_large_data(level=level, workers=workers) end_time = time.time() execution_time = end_time - start_time print(f"level: {level}, workers: {workers}, len: {dlen/1e9:.3f}GB, len compressed: {clen/1e9:.3f}GB, time: {execution_time:.1f}s") for level in [1, 3, 6, 10, 15, 20]: for workers in [0, 1, 2, 4]: measure_execution_time(level, workers)