# Source code for multi_threading_file_compression
#!/usr/bin/env python3
import sys
import os
from pymzml.utils.utils import index_gzip
import pymzml
import glob
import multiprocessing
def main(folder, num_cpus=1):
    """
    Creates indexed gzip mzML files from all mzMLs files in the given folder
    using a given number of threads.
    Usage:
    python multi_threading_file_compression.py <folder> <threads>
    Note:
    If the number of threads is larger than the number of actual possible
    threads, all possible threads will be used.
    """
    # Clamp the requested worker count to the number of CPUs actually
    # available; num_cpus may arrive as a string from sys.argv.
    max_cpus = multiprocessing.cpu_count()
    num_cpus = min(int(num_cpus), max_cpus)

    # Build the (input, output) job list, skipping files that already
    # have a compressed counterpart from a previous run.
    mzml_job_list = []
    for mzml_path in glob.glob(os.path.join(folder, '*.mzML')):
        out_path = '{0}.gz'.format(mzml_path)
        if os.path.exists(out_path):
            print('Skipping: {0}'.format(mzml_path))
            continue
        mzml_job_list.append(
            (
                mzml_path,
                out_path
            )
        )

    print(
        'Compressing {0} mzML files using {1} threads'.format(
            len(mzml_job_list),
            num_cpus
        )
    )
    mp_pool = multiprocessing.Pool(
        num_cpus
    )
    try:
        mp_pool.starmap(
            compress_file,
            mzml_job_list
        )
    finally:
        # close() alone does not wait for the workers; join() reaps them
        # so the process exits cleanly even if a job raised.
        mp_pool.close()
        mp_pool.join()
    print('Done')
    return
def compress_file(file_path, out_path):
    """
    Compress a single mzML file into an indexed gzip file.

    Args:
        file_path (str): path of the input mzML file
        out_path (str): path the indexed gzip output is written to
    """
    print('Working on file {0}'.format(file_path))
    # Byte size of the input file; its decimal width bounds the width of
    # every offset stored in the gzip index. getsize avoids opening the
    # file in text mode just to seek/tell its length.
    max_offset_len = os.path.getsize(file_path)
    # +10 leaves head-room in the index beyond the reported spectrum count.
    max_spec_no = pymzml.run.Reader(file_path).get_spectrum_count() + 10
    index_gzip(
        file_path,
        out_path,
        max_idx=max_spec_no,
        idx_len=len(str(max_offset_len))
    )
    print('Wrote file {0}'.format(out_path))
    return
if __name__ == '__main__':
    # Called without arguments: print the usage text from main's docstring.
    if len(sys.argv) < 2:
        print(main.__doc__)
        # sys.exit() is the documented way to terminate a script; the bare
        # exit() builtin is a site-module convenience and may be absent.
        sys.exit()
    else:
        main(*sys.argv[1:])