Source code for pymzml.file_classes.standardMzml

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Interface for uncompressed mzML files.

@author: Manuel Koesters
"""

import bisect
import codecs
import re
from xml.etree.ElementTree import XML, iterparse

from .. import spec
from .. import regex_patterns


class StandardMzml(object):
    """
    Random access wrapper for plain (uncompressed) mzML files.

    The wrapper keeps a text-mode handle on the file (``file_handler``) and
    a dictionary (``offset_dict``) mapping identifiers to byte offsets.
    Offsets are stored as tuples whose first element is the start offset of
    the ``<spectrum>``/``<chromatogram>`` element; entries written by
    :meth:`_search_linear` additionally carry the end offset.
    """

    def __init__(self, path, encoding, build_index_from_scratch=False):
        """
        Initalize Wrapper object for standard mzML files.

        Arguments:
            path (str)     : path to the file
            encoding (str) : encoding of the file

        Keyword Arguments:
            build_index_from_scratch (bool): if True, scan the whole file
                once and fill ``offset_dict`` with the offsets found
        """
        self.path = path
        self.file_handler = codecs.open(
            path,
            mode='r',
            encoding=encoding
        )
        self.offset_dict = dict()
        self.spec_open = regex_patterns.SPECTRUM_OPEN_PATTERN
        self.spec_close = regex_patterns.SPECTRUM_CLOSE_PATTERN
        if build_index_from_scratch is True:
            # context manager: the handle is released even if indexing fails
            with open(path, 'rb') as seeker:
                self._build_index_from_scratch(seeker)

    def __getitem__(self, identifier):
        """
        Access the item with id 'identifier'.

        Lookup order: the special id ``'TIC'`` (any case) is resolved by
        parsing the file for the chromatogram with id ``TIC``; identifiers
        present in ``offset_dict`` are read directly from their offset;
        other strings are searched for in the file; everything else is
        handed to ``self._interpol_search``.

        Arguments:
            identifier (str or int): native id of the item to access

        Returns:
            spectrum: ``spec.Spectrum`` or ``spec.Chromatogram`` built from
            the element found, or None if the indexed offset points at
            neither kind of element

        Raises:
            StopIteration: if 'TIC' was requested but the file contains no
                chromatogram with that id (kept for backward compatibility)
        """
        #############################################################################
        # DOES NOT HOLD IF NUMBERS DONT START WITH ONE AND/OR DONT INCREASE BY ONE  #
        # TODO FIXME                                                                #
        #############################################################################
        self.file_handler.seek(0)

        if str(identifier).upper() == 'TIC':
            for _event, element in iterparse(self.file_handler, events=['end']):
                if (element.tag.endswith('}chromatogram')
                        and element.get('id') == 'TIC'):
                    return spec.Chromatogram(
                        element,
                        measured_precision=5e-6
                    )
            raise StopIteration

        if identifier in self.offset_dict:
            entry = self.offset_dict[identifier]
            # Entries are normally tuples; tolerate a bare int offset so an
            # externally filled offset_dict does not crash the lookup.
            start = entry[0] if isinstance(entry, (tuple, list)) else entry
            with open(self.path, 'rb') as seeker:
                seeker.seek(start)
                start, end = self._read_to_spec_end(seeker)
            self.file_handler.seek(start, 0)
            data = self.file_handler.read(end - start)
            if data.startswith('<spectrum'):
                return spec.Spectrum(
                    XML(data),
                    measured_precision=5e-6
                )
            if data.startswith('<chromatogram'):
                return spec.Chromatogram(
                    XML(data)
                )
            return None

        if isinstance(identifier, str):
            return self._search_string_identifier(
                identifier
            )

        # NOTE(review): _interpol_search is not defined in the part of the
        # class visible here - confirm it exists, otherwise this branch
        # raises AttributeError.
        return self._interpol_search(identifier)

    def _build_index(self, from_scratch=False):
        """
        Build an index.

        A list of offsets to which a file pointer can seek
        directly to access a particular spectrum or chromatogram without
        parsing the entire file. The offsets are read from the index list
        at the end of the file and stored in ``self.offset_dict``.

        Args:
            from_scratch(bool): Whether or not to force building the index
                from scratch, by parsing the file, if no existing index can
                be found.

        Returns:
            None. The result is stored in ``self.offset_dict``;
            ``self.offset_dict['TIC']`` is the TIC offset as int, or None
            if no TIC offset was found in the scanned tail of the file.
        """
        index_list_offset_pattern = re.compile(
            b'<indexListOffset>(?P<indexListOffset>[0-9]*)</indexListOffset>'
        )
        # NOTE(review): the bracket expression is a character class (one
        # character out of the listed set), not an alternation of the two
        # attribute names. Kept unchanged on purpose.
        chromatogram_offset_pattern = re.compile(
            b'(?P<WTF>[nativeID|idRef])="TIC">(?P<offset>[0-9]*)</offset'
        )
        spectrum_index_pattern = regex_patterns.SPECTRUM_INDEX_PATTERN
        sim_index_pattern = regex_patterns.SIM_INDEX_PATTERN

        self.offset_dict['TIC'] = None
        index_list_offset = None

        with open(self.path, 'rb') as seeker:
            # Scan a growing tail of the file (1 kB .. 9 kB) for the TIC
            # offset and the position of the index list.
            for kbytes in range(1, 10):
                try:
                    seeker.seek(-1024 * kbytes, 2)
                except OSError:
                    # File is smaller than the requested tail
                    break
                for line in seeker:
                    match = chromatogram_offset_pattern.search(line)
                    if match and match.group('offset'):
                        self.offset_dict['TIC'] = int(
                            bytes.decode(match.group('offset'))
                        )
                    match = index_list_offset_pattern.search(line)
                    if match and match.group('indexListOffset'):
                        index_list_offset = int(
                            match.group('indexListOffset').decode('utf-8')
                        )
                if (index_list_offset is not None
                        and self.offset_dict['TIC'] is not None):
                    break

            if index_list_offset is not None:
                # Jumping to index list and slurpin all specOffsets
                seeker.seek(index_list_offset, 0)
                for line in seeker:
                    match_spec = spectrum_index_pattern.search(line)
                    if match_spec and match_spec.group('nativeID') == b'':
                        match_spec = None
                    match_sim = sim_index_pattern.search(line)
                    if match_spec:
                        offset = int(bytes.decode(match_spec.group('offset')))
                        native_id = int(
                            bytes.decode(match_spec.group('nativeID'))
                        )
                        # one-element tuple: __getitem__ reads entry[0]
                        self.offset_dict[native_id] = (offset,)
                    elif match_sim:
                        offset = int(bytes.decode(match_sim.group('offset')))
                        native_id = bytes.decode(match_sim.group('nativeID'))
                        try:
                            native_id = int(
                                regex_patterns.SPECTRUM_ID_PATTERN.search(
                                    native_id
                                ).group(1)
                            )
                        except AttributeError:
                            # match is None and has no attribute group,
                            # so use the whole string as ID
                            pass
                        self.offset_dict[native_id] = (offset,)
            elif from_scratch is True:
                # No index list in the file: fall back to a full scan, as
                # the from_scratch argument asks for.
                self._build_index_from_scratch(seeker)

    def _build_index_from_scratch(self, seeker):
        """
        Build an index of spectra/chromatogram data with offsets by parsing
        the file.

        The file is read in chunks and scanned with regular expressions for
        ``<chromatogram ... id="...">`` and ``<spectrum ... id="...">``.
        No XML parser is used because the exact byte position of every
        element is needed. Every id found is stored in ``self.offset_dict``
        as ``(start_offset,)``. A warning is printed if the number of
        elements found differs from the ``count`` attributes of the
        spectrumList / chromatogramList elements.

        Args:
            seeker: file object opened in binary mode

        Returns:
            None
        """
        chunksize = 8192
        lookback_size = 100

        # raw byte literals: "\s" must reach the regex engine untouched
        chromexp = re.compile(rb'<\s*chromatogram[^>]*id="([^"]*)"')
        chromcntexp = re.compile(rb'<\s*chromatogramList\s*count="([^"]*)"')
        specexp = re.compile(rb'<\s*spectrum[^>]*id="([^"]*)"')
        speccntexp = re.compile(rb'<\s*spectrumList\s*count="([^"]*)"')

        chrom_positions = {}
        spec_positions = {}
        chromcnt = 0
        speccnt = 0

        seeker.seek(0)
        prev_chunk = b''
        while True:
            offset = seeker.tell()
            chunk = seeker.read(chunksize)
            if not chunk:
                break
            # Prepend the tail of the previous chunk so that a tag cut in
            # two by the chunk border is still seen in one piece. The tail
            # is analysed twice, which only rewrites identical entries.
            if prev_chunk:
                tail = prev_chunk[-lookback_size:]
                chunk = tail + chunk
                offset -= len(tail)
            prev_chunk = chunk

            for m in chromexp.finditer(chunk):
                chrom_positions[m.group(1).decode('utf-8')] = offset + m.start()
            for m in specexp.finditer(chunk):
                spec_positions[m.group(1).decode('utf-8')] = offset + m.start()

            # total counts announced by the file, compared below
            m = chromcntexp.search(chunk)
            if m is not None:
                chromcnt = int(m.group(1))
            m = speccntexp.search(chunk)
            if m is not None:
                speccnt = int(m.group(1))

        if (chromcnt != len(chrom_positions)
                or speccnt != len(spec_positions)):
            print(
                '[ Warning ] Found {spec_count} spectra '
                'and {chrom_count} chromatograms\n'
                '[ Warning ] However Spectrum index list shows {speccnt} and '
                'Chromatogram index list shows {chromcnt} entries'.format(
                    spec_count=len(spec_positions),
                    chrom_count=len(chrom_positions),
                    speccnt=speccnt,
                    chromcnt=chromcnt
                )
            )
            print(
                '[ Warning ] Updating offset dict with found offsets '
                'but some might be still missing\n'
                '[ Warning ] This may happen because your is file truncated'
            )

        # spectra are merged last, so they win over a chromatogram with the
        # same id
        positions = dict(chrom_positions)
        positions.update(spec_positions)
        for key, position in sorted(positions.items(), key=lambda kv: kv[1]):
            self.offset_dict[key] = (position,)
        return

    def _read_to_spec_end(self, seeker, chunks_to_read=8):
        """
        Read from current seeker position to the end of the next spectrum
        or chromatogram tag and return start and end postition.

        All data read is accumulated, so a closing tag located in the very
        first chunk, or cut in two by a chunk border, is found as well.

        Args:
            seeker (_io.BufferedReader): Reader instance used in calling
                function
            chunks_to_read (int): number of 512 byte units read per step

        Returns:
            positions (tuple): tuple with start and end postion of the
            element

        Raises:
            EOFError: if the end of the file is reached before a closing
                tag was found
        """
        chunk_size = 512 * chunks_to_read
        start_pos = seeker.tell()
        buffered = b''
        while True:
            chunk = seeker.read(chunk_size)
            tag_end, seeker = self._read_until_tag_end(seeker)
            buffered += chunk + tag_end

            spec_match = regex_patterns.SPECTRUM_CLOSE_PATTERN.search(buffered)
            chrom_match = regex_patterns.CHROMATOGRAM_CLOSE_PATTERN.search(
                buffered
            )
            ends = [m.end() for m in (spec_match, chrom_match) if m]
            if ends:
                # the first closing tag after start_pos ends the element
                return (start_pos, start_pos + min(ends))
            if not chunk and not tag_end:
                raise EOFError(
                    'reached end of file before a closing spectrum or '
                    'chromatogram tag was found'
                )

    def _search_linear(self, seeker, index, chunk_size=8):
        """
        Fallback to linear search if interpolated search fails.

        Walks through the file spectrum by spectrum starting at the current
        seeker position. The offsets of every spectrum passed are cached in
        ``self.offset_dict`` as ``(start, end)``. The seeker is closed
        before this method returns or raises on end of file.

        Args:
            seeker: file object opened in binary mode
            index (int): number at the end of the spectrum id to look for
            chunk_size (int): number of 512 byte units read per step

        Returns:
            spec.Spectrum: the spectrum whose id ends with ``index``

        Raises:
            KeyError: if the end of the file is reached without a match
            EOFError: if the file ends inside a spectrum element
        """
        total_chunk_size = chunk_size * 512
        while True:
            file_pointer = seeker.tell()
            data = seeker.read(total_chunk_size)
            if not data:
                # end of file: the original loop would spin forever here
                seeker.close()
                raise KeyError(index)
            string, seeker = self._read_until_tag_end(seeker)
            data += string

            spec_start = self.spec_open.search(data)
            if spec_start is None:
                continue

            spec_start_offset = file_pointer + spec_start.start()
            seeker.seek(spec_start_offset)
            current_index = int(
                re.search(
                    b'[0-9]*$',
                    spec_start.group('id')
                ).group()
            )

            spec_end = self.spec_close.search(data, spec_start.start())
            while spec_end is None:
                # closing tag not in this chunk: keep reading, starting
                # again at the spectrum start the seeker was moved to
                file_pointer = seeker.tell()
                data = seeker.read(total_chunk_size)
                if not data:
                    seeker.close()
                    raise EOFError(
                        'reached end of file inside a spectrum element'
                    )
                string, seeker = self._read_until_tag_end(seeker)
                data += string
                spec_end = self.spec_close.search(data)
            spec_end_offset = file_pointer + spec_end.end()

            self.offset_dict[current_index] = (
                spec_start_offset,
                spec_end_offset
            )
            seeker.seek(spec_end_offset)

            if current_index == index:
                seeker.seek(spec_start_offset)
                spec_string = seeker.read(
                    spec_end_offset - spec_start_offset
                )
                xml_string = XML(spec_string)
                seeker.close()
                return spec.Spectrum(
                    xml_string,
                    measured_precision=5e-6
                )

    def _search_string_identifier(self, search_string, chunk_size=8):
        """
        Search the file for a spectrum whose id contains ``search_string``
        or a chromatogram whose id equals it.

        Args:
            search_string (str): text to look for; it is matched literally
            chunk_size (int): number of 512 byte units read per step

        Returns:
            ``spec.Spectrum`` or ``spec.Chromatogram`` for the first match

        Raises:
            Exception: if the end of the file is reached without a match
        """
        total_chunk_size = chunk_size * 512
        # NOTE: This needs to go intp regex_patterns.py
        # re.escape keeps characters such as "[" or "+" inside an id from
        # being interpreted as regex syntax.
        regex_string = re.compile(
            r'<\s*spectrum[^>]*index="[0-9]+"\sid="({0})"\sdefaultArrayLength="[0-9]+">'.format(
                ''.join(['.*', re.escape(search_string), '.*'])
            ).encode()
        )
        search_bytes = search_string.encode()

        # NOTE(review): an opening tag cut in two by a chunk border is
        # longer than what _read_until_tag_end can stitch together and may
        # be missed - confirm whether that matters for callers.
        with open(self.path, 'rb') as seeker:
            while True:
                file_pointer = seeker.tell()
                data = seeker.read(total_chunk_size)
                string, seeker = self._read_until_tag_end(
                    seeker, byte_mode=True
                )
                data += string

                spec_start = regex_string.search(data)
                if spec_start and search_bytes in spec_start.group(1):
                    seeker.seek(file_pointer + spec_start.start())
                    start, end = self._read_to_spec_end(seeker)
                    seeker.seek(start)
                    spec_string = seeker.read(end - start)
                    return spec.Spectrum(
                        XML(spec_string),
                        measured_precision=5e-6
                    )

                # look at every chromatogram tag of the chunk, not only
                # the first one
                for chrom_start in regex_patterns.CHROMO_OPEN_PATTERN.finditer(data):
                    if search_bytes == chrom_start.group(1):
                        seeker.seek(file_pointer + chrom_start.start())
                        start, end = self._read_to_spec_end(seeker)
                        seeker.seek(start)
                        chrom_string = seeker.read(end - start)
                        return spec.Chromatogram(XML(chrom_string))

                if len(data) == 0:
                    raise Exception('cant find specified string')

    def _read_until_tag_end(self, seeker, max_search_len=12, byte_mode=False):
        """
        Help make sure no splitted text appear in chunked data, so regex
        always find <spectrum ...> and </spectrum>

        Reads single bytes until ``>``, ``<`` or a blank was read, the end
        of the file is reached or ``max_search_len`` bytes were read.

        Args:
            seeker: file object opened in binary mode
            max_search_len (int): maximum number of bytes to read
            byte_mode (bool): not used

        Returns:
            tuple: the bytes read (including the stop character) and the
            seeker
        """
        collected = []
        for _ in range(max_search_len):
            curr_byte = seeker.read(1)
            if not curr_byte:
                # end of file, nothing more to read
                break
            collected.append(curr_byte)
            if curr_byte in (b'>', b'<', b' '):
                break
        return b''.join(collected), seeker

    def read(self, size=-1):
        """
        Read data from the file handler created in ``__init__``.

        Keyword Arguments:
            size (int): passed on to the read method of the file handler,
                -1 to read to end of file

        Returns:
            data (str): data returned by the file handler
        """
        return self.file_handler.read(size)

    def close(self):
        """
        Close the file handler created in ``__init__``.
        """
        self.file_handler.close()
# When the module is executed as a script it only prints its own docstring.
if __name__ == '__main__': print(__doc__)