# Source code for multi_threading_file_compression

#!/usr/bin/env python3

import sys
import os
from pymzml.utils.utils import index_gzip
import pymzml
import glob
import multiprocessing


def main(folder, num_cpus=1):
    """
    Creates indexed gzip mzML files from all mzML files in the given
    folder using a given number of threads.

    Usage:

        python multi_threading_file_compression.py <folder> <threads>

    Note:

        If the number of threads is larger than the number of actual
        possible threads, all possible threads will be used.

    Args:
        folder (str): path of a folder containing ``*.mzML`` files.
        num_cpus (int or str): requested number of worker processes;
            command-line strings are converted with ``int()``.
    """
    # Clamp the requested worker count to the available CPU cores
    # (equivalent to the original if/else, but in one expression).
    num_cpus = min(int(num_cpus), multiprocessing.cpu_count())

    mzml_job_list = []
    for mzml_path in glob.glob(os.path.join(folder, '*.mzML')):
        out_path = '{0}.gz'.format(mzml_path)
        if os.path.exists(out_path):
            # Never overwrite an already-compressed output file.
            print('Skipping: {0}'.format(mzml_path))
            continue
        mzml_job_list.append((mzml_path, out_path))

    print(
        'Compressing {0} mzML files using {1} threads'.format(
            len(mzml_job_list), num_cpus
        )
    )
    mp_pool = multiprocessing.Pool(num_cpus)
    try:
        mp_pool.starmap(compress_file, mzml_job_list)
    finally:
        # BUG FIX: the original called close() but never join(), so the
        # parent could report completion (or exit) before the workers
        # finished. close() + join() waits for all jobs to drain.
        mp_pool.close()
        mp_pool.join()
    print('Done')
    return
def compress_file(file_path, out_path):
    """
    Compress a single mzML file into an indexed gzip file.

    Args:
        file_path (str): path to the input mzML file.
        out_path (str): path of the indexed gzip file to write.
    """
    print('Working on file {0}'.format(file_path))
    # BUG FIX: open in binary mode — in text mode, tell() returns an
    # opaque cookie that is not guaranteed to be a byte count; 'rb'
    # gives the real file size in bytes, which determines how many
    # digits an offset entry in the index needs.
    with open(file_path, 'rb') as fin:
        fin.seek(0, os.SEEK_END)
        max_offset_len = fin.tell()
    # +10 presumably leaves headroom for extra index entries beyond the
    # spectrum count — TODO confirm against index_gzip's requirements.
    max_spec_no = pymzml.run.Reader(file_path).get_spectrum_count() + 10
    index_gzip(
        file_path,
        out_path,
        max_idx=max_spec_no,
        idx_len=len(str(max_offset_len)),
    )
    print('Wrote file {0}'.format(out_path))
    return


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print(main.__doc__)
        # Use sys.exit() — bare exit() is an interactive-only helper
        # injected by the site module and may be absent.
        sys.exit()
    else:
        main(*sys.argv[1:])