"""
Try to estimate if zstd's internal multithreading is useful within the
context of compressing borgbackup chunks.

The chunks of data yielded by borg's chunker are usually ~2MiB in size
(if the file is larger than that). Smaller files often only yield 1 chunk
with the whole file content in it and these chunks can be arbitrarily small,
down to 1B.
"""

import os
import random
import time

from pyzstd import CParameter, compress, zstd_support_multithread


def generate_random_data():
    """Return one chunk of random bytes, sized to resemble a borg chunk.

    The length is drawn uniformly from 0.5 MiB to 4 MiB (both bounds
    inclusive), so chunks average a bit over 2 MiB.

    Fully random data is returned on the assumption that this leads to
    maximum compression time (and thus gives multi-threading an advantage).
    """
    size = random.randint(512 * 1024, 4096 * 1024)  # 0.5 .. 4 MiB
    return os.urandom(size)


def generate_large_data():
    """Yield random chunks until at least 10 GiB have been produced.

    The limit is checked before each chunk is generated, so the total
    yielded may exceed 10 GiB by up to one chunk.
    """
    max_length = 10 * 2**30  # 10 GiB
    generated_length = 0
    while generated_length < max_length:
        data = generate_random_data()
        yield data
        generated_length += len(data)


def compress_large_data(level=1, workers=0):
    """Compress every chunk from generate_large_data() on its own.

    Each chunk is compressed by a separate compress() call, mirroring how
    individual chunks would be compressed one at a time.

    level: zstd compression level.
    workers: value passed as zstd's nbWorkers parameter.

    Returns a tuple (dlen, clen): total uncompressed and total compressed
    size, both in bytes.
    """
    options = {
        CParameter.compressionLevel: level,
        CParameter.nbWorkers: workers,
        CParameter.jobSize: 512 * 1024,  # min==512kiB
    }
    dlen = clen = 0
    for data in generate_large_data():
        data_compressed = compress(data, options)
        dlen += len(data)
        clen += len(data_compressed)
    return dlen, clen


def measure_execution_time(level, workers=0):
    """Run one benchmark pass and print sizes and elapsed time.

    workers == 0 means not to create worker threads
    workers == 1 means threading, but only 1 worker
    workers == 2+ means real multithreading, N workers

    NOTE: the measured interval covers all of compress_large_data(), which
    also generates the random input, so the reported time is not pure
    compression time.
    """
    # perf_counter() is meant for measuring intervals; time.time() follows
    # the wall clock and can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    dlen, clen = compress_large_data(level=level, workers=workers)
    end_time = time.perf_counter()
    execution_time = end_time - start_time
    print(
        f"level: {level}, workers: {workers}, "
        f"len: {dlen/1e9:.3f}GB, len compressed: {clen/1e9:.3f}GB, "
        f"time: {execution_time:.3f}s"
    )


def main():
    """Benchmark a grid of compression levels and worker counts."""
    # An explicit check instead of `assert`, which is stripped under
    # `python -O` and would let the benchmark run without multithreading.
    if not zstd_support_multithread:
        raise SystemExit("the underlying zstd library was built without multithreading support")

    for level in [1, 3, 6, 10, 15, 20]:
        for workers in [0, 1, 2, 4]:
            measure_execution_time(level, workers)


if __name__ == "__main__":
    main()