etc

How to process large files in parallel, byte by byte

김승목 2025. 5. 15. 17:50

import concurrent.futures
import mmap
import os
import shutil

def xor_chunk(start, size, mm, xor_key):
    """XOR *size* bytes of *mm* in place, starting at offset *start*.

    Parameters:
        start: first byte offset to transform.
        size: number of bytes to transform.
        mm: any mutable byte buffer supporting slice read/assignment
            (mmap.mmap, bytearray, memoryview of a writable buffer).
        xor_key: int in [0, 255]; each byte b becomes b ^ xor_key.
    """
    # One C-level translate() pass instead of a per-byte Python loop:
    # the original `mm[i] ^= key` loop is orders of magnitude slower and
    # holds the GIL for the whole chunk, serializing the thread pool.
    # The 256-entry table maps every byte value b to b ^ xor_key.
    table = bytes(b ^ xor_key for b in range(256))
    mm[start:start + size] = mm[start:start + size].translate(table)

def xor_file_mmap_parallel(input_path, output_path, xor_key=3, chunk_size=1024 * 1024, max_workers=4):
    """XOR every byte of *input_path* with *xor_key*, writing to *output_path*.

    The input is first copied to the output, then the copy is memory-mapped
    and mutated in place, one chunk per worker-pool task. Prints progress to
    stdout as chunks complete.

    Parameters:
        input_path: path of the file to read.
        output_path: path of the file to create/overwrite with the result.
        xor_key: int in [0, 255] applied to each byte (default 3).
        chunk_size: bytes handled per worker task (default 1 MiB).
        max_workers: thread-pool size (default 4).

    Raises:
        OSError: if the input cannot be read or the output cannot be written.
        Any exception raised inside a worker is propagated (not swallowed).
    """
    total_size = os.path.getsize(input_path)

    # Stream-copy instead of fin.read(): reading the whole file into one
    # bytes object defeats the point of mmap for large files.
    shutil.copyfile(input_path, output_path)

    # mmap.mmap(fd, 0) raises ValueError on an empty file — nothing to XOR.
    if total_size == 0:
        print(f"\nDone. Total {total_size} bytes processed.")
        return

    # 256-entry table mapping b -> b ^ xor_key; one C-level translate()
    # pass per chunk instead of a per-byte Python loop.
    table = bytes(b ^ xor_key for b in range(256))

    def _xor_range(mm, start, size):
        # XOR mm[start:start+size] in place with a single translate() call.
        mm[start:start + size] = mm[start:start + size].translate(table)

    with open(output_path, 'r+b') as f:
        # mmap as a context manager: the map is closed even if a worker fails.
        with mmap.mmap(f.fileno(), 0) as mm:
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = [
                    executor.submit(_xor_range, mm, start, min(chunk_size, total_size - start))
                    for start in range(0, total_size, chunk_size)
                ]
                for i, future in enumerate(concurrent.futures.as_completed(futures), 1):
                    future.result()  # re-raise worker exceptions instead of ignoring them
                    print(f"\r{i}/{len(futures)} chunks processed", end='', flush=True)
            mm.flush()

    print(f"\nDone. Total {total_size} bytes processed.")