"""

Try to estimate if zstd's internal multithreading is useful

within the context of compressing borgbackup chunks.

The chunks of data yielded by borg's chunker are usually ~2MiB

in size (if the file is larger than that). Smaller files often

only yield 1 chunk with the whole file content in it and these

chunks can be arbitrarily small, down to 1B.

"""

import os
import random
import time

from zstd import ZSTD_compress  # ZSTD_compress(data, level, threads), e.g. from the "zstd" PyPI package

def generate_random_data():
    # return chunks with a size around 2MiB.
    # return fully random data assuming that this leads to
    # maximum compression time (and thus giving multi-threading
    # an advantage).
    size = random.randint(512 * 1024, 4096 * 1024)  # 0.5 .. 4 MiB, ~2.25 MiB on average
    return os.urandom(size)

def generate_large_data():
    # yield random chunks until ~10 GiB have been generated in total
    max_length = 10 * 2**30
    generated_length = 0
    while generated_length < max_length:
        data = generate_random_data()
        yield data
        generated_length += len(data)

def compress_large_data(level, workers):
    # compress ~10 GiB of chunks, return (uncompressed, compressed) byte counts
    dlen = clen = 0
    for data in generate_large_data():
        data_compressed = ZSTD_compress(data, level, workers)
        dlen += len(data)
        clen += len(data_compressed)
    return dlen, clen

def measure_execution_time(level, workers):
    start_time = time.time()
    dlen, clen = compress_large_data(level=level, workers=workers)
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"level: {level}, workers: {workers}, len: {dlen/1e9:.3f}GB, len compressed: {clen/1e9:.3f}GB, time: {execution_time:.1f}s")

# workers=0: single-threaded; >=1 enables zstd's internal multithreading with that many worker threads
for level in [1, 3, 6, 10, 15, 20]:
    for workers in [0, 1, 2, 4]:
        measure_execution_time(level, workers)