# Source code for pacbio_data_processing.cigar
#######################################################################
#
# Copyright (C) 2021 David Palao
#
# This file is part of PacBio data processing.
#
# PacBioDataProcessing is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# PacBio data processing is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with PacBioDataProcessing. If not, see <http://www.gnu.org/licenses/>.
#
#######################################################################
"""This module provides basic '*re-invented*' functionality to handle
Cigars.
A Cigar describes the differences between two sequences by providing a
series of operations that one has to apply to one sequence to obtain the
other one. For instance, given these two sequences:
sequence 1 (e.g. from the reference)::

    AAGTTCCGCAAATT

and

sequence 2 (e.g. from the aligner)::

    AAGCTCCCGCAATT

The Cigar that brings us from sequence 1 to sequence 2 is::

    3=1X3=1I4=1D2=

where the numbers refer to the amount of letters and the symbols'
meaning can be found in the table below. Therefore the Cigar in the
example is a shorthand for:
*3 equal bases followed by 1 replacement followed by 3 equal bases*
*followed by 1 insertion followed by 4 equal bases*
*followed by 1 deletion followed by 2 equal bases*

+--------+-------------+
| symbol | meaning     |
+========+=============+
| =      | equal       |
+--------+-------------+
| I      | insertion   |
+--------+-------------+
| D      | deletion    |
+--------+-------------+
| X      | replacement |
+--------+-------------+
| S      | soft clip   |
+--------+-------------+
| H      | hard clip   |
+--------+-------------+
"""
import re
# Operation symbols recognised when splitting a Cigar string into items.
_SYMBOLS = "=IDSXH"


class Cigar:
    """Read-only view over a Cigar string.

    The string is split into *items*, each one made of a run of digits
    followed by exactly one of the symbols in ``_SYMBOLS`` (for example
    ``"3="`` or ``"1X"``). Iterating over a ``Cigar`` yields those items
    in order of appearance; any text that does not match that pattern is
    skipped.

    Two ``Cigar`` instances are equal when their underlying strings are
    equal, and equal instances share the same hash.
    """

    def __init__(self, incigar) -> None:
        """Store the string form of *incigar*.

        :param incigar: the Cigar; it is converted with :py:class:`str`.
        """
        self._incigar = str(incigar)

    def __len__(self) -> int:
        """Return the number of items found in the Cigar."""
        # Count lazily instead of materialising a list of all the items.
        return sum(1 for _ in self)

    def __iter__(self):
        """Return an iterator over the items of the Cigar, as strings."""
        return (
            match.group()
            for match in re.finditer(rf"\d+[{_SYMBOLS}]", self._incigar)
        )

    def __repr__(self) -> str:
        return f"Cigar({self._incigar!r})"

    def __eq__(self, other: object) -> bool:
        # Let Python try the reflected comparison (and eventually fall
        # back to ``False``) instead of failing with AttributeError when
        # *other* is not a Cigar.
        if not isinstance(other, Cigar):
            return NotImplemented
        return self._incigar == other._incigar

    def __hash__(self) -> int:
        # Defining __eq__ alone would make instances unhashable; hash the
        # same value that __eq__ compares so both stay consistent.
        return hash(self._incigar)

    @property
    def number_pb_diffs(self) -> int:
        """Sum of the counts of all items whose symbol is not ``=``."""
        return sum(int(item[:-1]) for item in self if item[-1] != "=")

    @property
    def number_diff_items(self) -> int:
        """Number of items whose symbol is not ``=``."""
        return sum(1 for item in self if item[-1] != "=")

    @property
    def number_diff_types(self) -> int:
        """Number of distinct symbols, other than ``=``, in the Cigar."""
        return len({item[-1] for item in self if item[-1] != "="})

    @property
    def number_pbs(self) -> int:
        """Sum of the counts of the items with symbol ``=``, ``X``, ``I``,
        ``S`` or ``H``. Items with symbol ``D`` are not counted.
        """
        return sum(int(item[:-1]) for item in self if item[-1] in "=XISH")

    @property
    def diff_ratio(self) -> float:
        """difference ratio: ``1`` means that *each* base is different;
        ``0`` means that all the bases are equal.

        It is computed as ``number_pb_diffs/number_pbs``.

        :raises ZeroDivisionError: if ``number_pbs`` is ``0`` (e.g. for
            an empty Cigar).
        """
        return self.number_pb_diffs/self.number_pbs

    @property
    def sim_ratio(self) -> float:
        """similarity ratio: ``1`` means that all the bases are equal;
        ``0`` means that *each* base is different.
        This is computed from :py:meth:`diff_ratio`.
        """
        return 1-self.diff_ratio