Source code for pacbio_data_processing.ipd

#######################################################################
#
# Copyright (C) 2021 David Palao
#
# This file is part of PacBio data processing.
#
#  PacBioDataProcessing is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  PacBio data processing is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with PacBioDataProcessing. If not, see <http://www.gnu.org/licenses/>.
#
#######################################################################

import subprocess
import functools
import logging
from concurrent import futures
from typing import Union, Optional
from collections.abc import Generator
from pathlib import Path

from .bam_utils import WorkUnitGenerator, MoleculeWorkUnit
from .constants import GFF_SUF


class UnknownErrorIpdSummary(Exception):
    """Error signalling that ``ipdSummary`` failed for an unidentified reason.

    Within this module it is raised by ``old_ipd_summary`` when the
    external program finishes with a non-zero return code.
    """
def ipd_summary(
        molecule: MoleculeWorkUnit,
        fasta: Union[str, Path],
        program: Union[str, Path],
        nprocs: int,
        mod_types_comma_sep: str,
        ipd_model: Optional[Union[str, Path]],
        skip_if_present: bool
) -> MoleculeWorkUnit:
    r"""Lowest level interface to ``ipdSummary``: all calls to that
    program are expected to be done through this function.

    It runs ``ipdSummary`` with an input bam file like this::

        ipdSummary blasr.pMA683.subreads.bam --reference pMA683.fa\
          --identify m6A --gff blasr.pMA683.subreads.476.bam.gff

    As a result of this, a gff file is created. The path of that file
    is computed as ``src_bam_path.with_suffix(GFF_SUF)`` and stored in
    the ``gff_path`` attribute of the target molecule.

    :param molecule: pair ``(molecule_id, molecule)``; the second item
        must provide a ``src_bam_path`` attribute (a path object) and
        receives the ``gff_path`` attribute.
    :param fasta: reference file passed with ``--reference``.
    :param program: the ``ipdSummary`` executable to run.
    :param nprocs: value passed with ``--numWorkers``.
    :param mod_types_comma_sep: value passed with ``--identify``.
    :param ipd_model: if truthy, passed with ``--ipdModel``.
    :param skip_if_present: if true and the gff file already exists,
        the external program is not executed.
    :return: the same ``(molecule_id, molecule)`` pair, with
        ``molecule.gff_path`` set.

    A non-zero return code of the program is logged as an error but no
    exception is raised: ``gff_path`` is set in any case.
    """
    molecule_id, target = molecule
    bam = target.src_bam_path
    output = bam.with_suffix(GFF_SUF)
    if skip_if_present and output.is_file():
        logging.debug(f"Modification file '{output}' already present!")
    else:
        cmd = (
            program, bam, "--reference", fasta,
            "--identify", mod_types_comma_sep,
            "--numWorkers", str(nprocs),
            "--gff", output,
        )
        if ipd_model:
            cmd = cmd + ("--ipdModel", ipd_model)
        proc = subprocess.run(cmd)
        if proc.returncode != 0:
            # Best effort: report the failure, do not interrupt the
            # processing of the remaining molecules.
            logging.error(
                f"[{program}][{output}] returncode: {proc.returncode}")
    target.gff_path = output
    return (molecule_id, target)
def old_ipd_summary(
        molecule, fasta, program, nprocs, mod_types_comma_sep, ipd_model,
        skip_if_present, max_retries: Optional[int] = None):
    """Former interface to ``ipdSummary`` working directly on a bam path.

    The output file name is the input name with ``GFF_SUF`` appended to
    its suffix. The external program is executed again and again until
    it succeeds (return code 0), logging an error after each failure.

    :param molecule: path object of the input bam file (its ``suffix``
        attribute and ``with_suffix`` method are used).
    :param fasta: reference file passed with ``--reference``.
    :param program: the ``ipdSummary`` executable to run.
    :param nprocs: value passed with ``--numWorkers``.
    :param mod_types_comma_sep: value passed with ``--identify``.
    :param ipd_model: if truthy, passed with ``--ipdModel``.
    :param skip_if_present: if true and the output file already exists,
        the external program is not executed.
    :param max_retries: maximum number of retries after the first
        failed attempt. ``None`` (the default) retries without limit,
        which is the historical behaviour.
    :return: path of the gff file.
    :raises UnknownErrorIpdSummary: only if ``max_retries`` is given
        and all the attempts failed.
    """
    out_name = molecule.with_suffix(molecule.suffix+GFF_SUF)
    exe = (
        program, molecule, "--reference", fasta,
        "--identify", mod_types_comma_sep,
        # Command line items must be strings or paths: an int here makes
        # ``subprocess.run`` raise TypeError on every attempt.
        "--numWorkers", str(nprocs),
        "--gff", out_name,
    )
    if ipd_model:
        exe = exe + ("--ipdModel", ipd_model)
    if skip_if_present and out_name.is_file():
        logging.debug(f"Modification file '{out_name}' already present!")
        return out_name
    attempts = 0
    while True:
        attempts += 1
        try:
            proc = subprocess.run(exe)
            if proc.returncode != 0:
                raise UnknownErrorIpdSummary(
                    f"returncode: {proc.returncode}")
        except Exception as e:
            msg = str(e)
            logging.error(f"[{program}][{out_name}] {msg}")
            if max_retries is not None and attempts > max_retries:
                raise UnknownErrorIpdSummary(msg) from e
        else:
            logging.info(f"Modification file '{out_name}' generated")
            break
    return out_name
def multi_ipd_summary_direct(
        molecules: WorkUnitGenerator,
        fasta: Union[str, Path],
        program: Union[str, Path],
        num_ipds: int,
        nprocs_per_ipd: int,
        modification_types: str,
        ipd_model: Optional[str] = None,
        skip_if_present: bool = False
) -> Generator[MoleculeWorkUnit, None, None]:
    """Generator that yields the work units returned by ``ipd_summary``,
    one per input molecule, as their gff files are produced.
    Serial implementation (one file produced after the other).

    :param molecules: iterable of work units handed to ``ipd_summary``.
    :param fasta: forwarded to ``ipd_summary``.
    :param program: forwarded to ``ipd_summary``.
    :param num_ipds: not used by this serial implementation; the
        signature matches ``multi_ipd_summary_threads``.
    :param nprocs_per_ipd: forwarded to ``ipd_summary`` as ``nprocs``.
    :param modification_types: its items are joined with commas before
        being forwarded (NOTE(review): annotated as ``str`` but joined
        item by item; presumably a sequence of names -- confirm with
        the callers).
    :param ipd_model: forwarded to ``ipd_summary``.
    :param skip_if_present: forwarded to ``ipd_summary``.
    """
    mod_types_comma_sep = ",".join(modification_types)
    for molecule in molecules:
        yield ipd_summary(
            molecule,
            fasta=fasta,
            program=program,
            nprocs=nprocs_per_ipd,
            mod_types_comma_sep=mod_types_comma_sep,
            ipd_model=ipd_model,
            skip_if_present=skip_if_present,
        )
def multi_ipd_summary_threads(
        molecules: WorkUnitGenerator,
        fasta: Union[str, Path],
        program: Union[str, Path],
        num_ipds: int,
        nprocs_per_ipd: int,
        modification_types: str,
        ipd_model: Optional[str] = None,
        skip_if_present: bool = False
) -> Generator[MoleculeWorkUnit, None, None]:
    """Generator that yields the work units returned by ``ipd_summary``
    while the gff files are produced in parallel.
    Implementation driven by a pool of threads.

    Results are yielded in the order of ``molecules`` (the order kept
    by ``Executor.map``).

    :param molecules: iterable of work units handed to ``ipd_summary``.
    :param fasta: forwarded to ``ipd_summary``.
    :param program: forwarded to ``ipd_summary``.
    :param num_ipds: maximum number of worker threads in the pool.
    :param nprocs_per_ipd: forwarded to ``ipd_summary`` as ``nprocs``.
    :param modification_types: its items are joined with commas before
        being forwarded (NOTE(review): annotated as ``str`` but joined
        item by item; presumably a sequence of names -- confirm with
        the callers).
    :param ipd_model: forwarded to ``ipd_summary``.
    :param skip_if_present: forwarded to ``ipd_summary``.
    """
    mod_types_comma_sep = ",".join(modification_types)
    partial_ipd_summary = functools.partial(
        ipd_summary,
        fasta=fasta,
        program=program,
        nprocs=nprocs_per_ipd,
        mod_types_comma_sep=mod_types_comma_sep,
        ipd_model=ipd_model,
        skip_if_present=skip_if_present,
    )
    # The context manager shuts the pool down when the generator is
    # exhausted, closed or interrupted by an error, so that the worker
    # threads are not leaked.
    with futures.ThreadPoolExecutor(max_workers=num_ipds) as exe:
        yield from exe.map(partial_ipd_summary, molecules)
# Implementation exposed under the generic name ``multi_ipd_summary``:
# the variant based on a pool of threads.
multi_ipd_summary = multi_ipd_summary_threads