Source code for pacbio_data_processing.ipd

#######################################################################
#
# Copyright (C) 2021, 2022 David Palao
#
# This file is part of PacBio data processing.
#
#  PacBioDataProcessing is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  PacBio data processing is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with PacBioDataProcessing. If not, see <http://www.gnu.org/licenses/>.
#
#######################################################################

import subprocess
import functools
import logging
from concurrent import futures
from typing import Union, Optional
from collections.abc import Generator
from pathlib import Path

from .bam_utils import WorkUnitGenerator, MoleculeWorkUnit
from .constants import (
    GFF_SUF, DEFAULT_IPDSUMMARY_PROGRAM, HOWTO_INSTALL_IPDSUMMARY
)


[docs]class UnknownErrorIpdSummary(Exception): ...
[docs]class MissingIpdSummaryError(FileNotFoundError): def __str__(self): msg = super().__str__() msg += "\nIt can be installed with:\n" msg += HOWTO_INSTALL_IPDSUMMARY return msg
[docs]def ipd_summary( molecule: MoleculeWorkUnit, fasta: Union[str, Path], program: Path, nprocs: int, mod_types_comma_sep: str, ipd_model: Union[str, Path], skip_if_present: bool ) -> Optional[MoleculeWorkUnit]: """Lowest level interface to ``ipdSummary``: all calls to that program are expected to be done through this function. It runs ``ipdSummary`` with an input bam file like this:: ipdSummary blasr.pMA683.subreads.bam --reference pMA683.fa\ --identify m6A --gff blasr.pMA683.subreads.476.bam.gff As a result of this, a gff file is created. This function sets an attribute in the target Molecule with the path to that file. If the process went well (``ipdSummary`` returns ``0``), the input ``MoleculeWorkUnit`` is returned, otherwise the molecule is tagged as being problematic (``had_processing_problems`` is set to ``True``) and ``None`` is returned. Missing features: * skip_if_present """ molecule_id, molecule = molecule bam = molecule.src_bam_path output = bam.with_suffix(GFF_SUF) cmd = ( str(program), bam, "--reference", fasta, "--identify", mod_types_comma_sep, "--numWorkers", str(nprocs), "--gff", output ) if ipd_model: cmd = cmd + ("--ipdModel", ipd_model) # if skip_if_present and output.is_file(): # logging.debug(f"[{program.name}] Modification file '{output}' already present!") try: process_res = subprocess.run(cmd, capture_output=True) except FileNotFoundError as e: if Path(e.filename).name == DEFAULT_IPDSUMMARY_PROGRAM: raise MissingIpdSummaryError(*e.args, e.filename) else: raise if process_res.returncode == 0: expected_call = " ".join(str(_) for _ in cmd) molecule.gff_path = output out_msg = process_res.stdout.decode() logging.debug( f"[{program.name}] Called as follows:") logging.debug(f"[{program.name}] '{expected_call}'") logging.debug(f"[{program.name}] Output:") logging.debug(f"[{program.name}] {out_msg}") return (molecule_id, molecule) else: molecule.had_processing_problems = True logging.error( f"[{program.name}] Molecule {molecule_id} could not be processed" ) err_msg = process_res.stderr.decode().strip() logging.debug(f"[{program.name}] The reported error was:") logging.debug(f"[{program.name}] '{err_msg}'")
[docs]def multi_ipd_summary_direct( molecules: WorkUnitGenerator, fasta: Union[str, Path], program: Union[str, Path], num_ipds: int, nprocs_per_ipd: int, modification_types: str, ipd_model: Optional[str] = None, skip_if_present: bool = False ) -> Generator[WorkUnitGenerator, None, None]: """Generator that yields ``MoleculeWorkUnit`` resulting from ``ipd_summary`` (``None`` results are skipped). Serial implementation (one file produced after the other). """ mod_types_comma_sep = ",".join(modification_types) for molecule in molecules: ipd_result = ipd_summary( molecule, fasta=fasta, program=program, nprocs=nprocs_per_ipd, mod_types_comma_sep=mod_types_comma_sep, ipd_model=ipd_model, skip_if_present=skip_if_present ) if ipd_result is not None: yield ipd_result
[docs]def multi_ipd_summary_threads( molecules: WorkUnitGenerator, fasta: Union[str, Path], program: Union[str, Path], num_ipds: int, nprocs_per_ipd: int, modification_types: str, ipd_model: Optional[str] = None, skip_if_present: bool = False ) -> Generator[WorkUnitGenerator, None, None]: """Generator that yields ``MoleculeWorkUnit`` resulting from ``ipd_summary`` (``None`` results are skipped). Parallel implementation driven by a pool of threads. """ mod_types_comma_sep = ",".join(modification_types) partial_ipd_summary = functools.partial( ipd_summary, fasta=fasta, program=program, nprocs=nprocs_per_ipd, mod_types_comma_sep=mod_types_comma_sep, ipd_model=ipd_model, skip_if_present=skip_if_present, ) exe = futures.ThreadPoolExecutor(max_workers=num_ipds) for ipd_result in exe.map(partial_ipd_summary, molecules): if ipd_result is not None: yield ipd_result
multi_ipd_summary = multi_ipd_summary_threads