How to process large files in parallel, byte by byte
김승목
2025. 5. 15. 17:50
The idea: copy the input file to the output path, memory-map the copy, and let a pool of worker threads XOR fixed-size chunks of the map in place.

import os
import mmap
import shutil
import concurrent.futures

def xor_chunk(start, size, mm, xor_key):
    # XOR every byte of this chunk in place through the shared memory map.
    for i in range(start, start + size):
        mm[i] ^= xor_key

def xor_file_mmap_parallel(input_path, output_path, xor_key=3,
                           chunk_size=1024 * 1024, max_workers=4):
    total_size = os.path.getsize(input_path)
    # Stream-copy the input to the output path without loading the
    # whole file into memory at once.
    shutil.copyfile(input_path, output_path)
    with open(output_path, 'r+b') as f:
        # Map the whole file read/write (note: mmap cannot map an empty file).
        mm = mmap.mmap(f.fileno(), 0)
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = []
            for start in range(0, total_size, chunk_size):
                size = min(chunk_size, total_size - start)
                futures.append(executor.submit(xor_chunk, start, size, mm, xor_key))
            for i, future in enumerate(concurrent.futures.as_completed(futures), 1):
                future.result()  # re-raise any exception from the worker
                print(f"\r{i}/{len(futures)} chunks processed", end='', flush=True)
        mm.flush()  # push dirty pages back to disk
        mm.close()
    print(f"\nDone. Total {total_size} bytes processed.")
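To try it, note that XOR with the same key is its own inverse ((b ^ k) ^ k == b), so running the function a second time on its own output restores the original file. The file names below are placeholders:

if __name__ == '__main__':
    xor_file_mmap_parallel('input.bin', 'input.xored.bin', xor_key=3)
    # Applying the same key again recovers the original bytes.
    xor_file_mmap_parallel('input.xored.bin', 'roundtrip.bin', xor_key=3)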
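One caveat: on CPython, the per-byte loop in xor_chunk holds the GIL, so the thread pool mostly overlaps page-fault I/O rather than spreading CPU work across cores. A process pool sidesteps the GIL. Below is a minimal sketch of that variant; the names xor_chunk_in_process and xor_file_mmap_processes are my own, and each worker re-maps the file itself because mmap objects cannot be pickled across process boundaries:

import os
import mmap
from concurrent.futures import ProcessPoolExecutor

def xor_chunk_in_process(path, start, size, xor_key):
    # Each worker opens and maps the file on its own; mmap handles
    # cannot be sent to a child process.
    with open(path, 'r+b') as f:
        mm = mmap.mmap(f.fileno(), 0)
        # Slice assignment rewrites the whole chunk in one step instead
        # of a per-byte loop.
        mm[start:start + size] = bytes(b ^ xor_key for b in mm[start:start + size])
        mm.flush()
        mm.close()

def xor_file_mmap_processes(path, xor_key=3, chunk_size=1024 * 1024, max_workers=4):
    # Modifies the file at `path` in place, chunk by chunk, across processes.
    # When run as a script, call this under an `if __name__ == '__main__':`
    # guard so spawned workers can import the module cleanly.
    total_size = os.path.getsize(path)
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(xor_chunk_in_process, path, start,
                                   min(chunk_size, total_size - start), xor_key)
                   for start in range(0, total_size, chunk_size)]
        for future in futures:
            future.result()  # surface any worker exception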