Coverage for src/extratools_core/seqtools.py: 0%

67 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-04 05:36 -0700

1import math 

2from collections.abc import Callable, Iterable, Sequence 

3from functools import cache 

4from itertools import chain, repeat 

5 

6from toolz.itertoolz import count, sliding_window 

7 

8from .common import iter_to_seq 

9from .typing import Comparable 

10 

11 

12def best_subseq[T]( 

13 a: Iterable[T], 

14 key: Callable[[Iterable[T]], Comparable], 

15) -> Iterable[T]: 

16 s: Sequence = iter_to_seq(a) 

17 

18 return max( 

19 chain([[]], ( 

20 s[i:j] 

21 for i in range(len(s)) 

22 for j in range(i + 1, len(s) + 1) 

23 )), 

24 key=key, 

25 ) 

26 

27 

28def common_subseq[T](a: Iterable[T], b: Iterable[T]) -> Iterable[T]: 

29 @cache 

30 # Find the start pos in list `a` 

31 def align_rec(alen: int, blen: int) -> int: 

32 if alen == 0 or blen == 0 or aseq[alen - 1] != bseq[blen - 1]: 

33 return alen 

34 

35 return align_rec(alen - 1, blen - 1) 

36 

37 aseq: Sequence[T] = iter_to_seq(a) 

38 bseq: Sequence[T] = iter_to_seq(b) 

39 

40 for k in range(*max( 

41 ( 

42 (align_rec(i, j), i) 

43 for i in range(len(aseq) + 1) 

44 for j in range(len(bseq) + 1) 

45 ), 

46 key=lambda x: x[1] - x[0], 

47 )): 

48 yield aseq[k] 

49 

50 

51def is_subseq[T](a: Iterable[T], b: Iterable[T]) -> bool: 

52 aseq: Sequence[T] = iter_to_seq(a) 

53 return count(common_subseq(aseq, b)) == len(aseq) 

54 

55 

56def best_subseq_with_gaps[T]( 

57 a: Iterable[T], 

58 key: Callable[[Iterable[T]], Comparable], 

59) -> Iterable[T]: 

60 def find(alen: int) -> tuple[Comparable, list[T]]: 

61 if alen == 0: 

62 return (key([]), []) 

63 

64 prevcost: Comparable 

65 prevseq: list[T] 

66 prevcost, prevseq = find(alen - 1) 

67 

68 currseq: list[T] = [*prevseq, b[alen - 1]] 

69 

70 return max( 

71 (prevcost, prevseq), 

72 (key(currseq), currseq), 

73 key=lambda x: x[0], 

74 ) 

75 

76 b: Sequence[T] = iter_to_seq(a) 

77 return find(len(b))[1] 

78 

79 

80def common_subseq_with_gaps[T](a: Iterable[T], b: Iterable[T]) -> Iterable[T]: 

81 alignment: tuple[float, tuple[Iterable[T | None], Iterable[T | None]]] | None = align(a, b) 

82 if alignment is None: 

83 # Alignment cannot be `None` as we do not have cost bound 

84 raise RuntimeError 

85 

86 return ( 

87 x 

88 for x, y in zip( 

89 *(alignment[1]), 

90 strict=False, 

91 ) 

92 # Actually x and y cannot both be None 

93 if x is not None and x == y 

94 ) 

95 

96 

97def is_subseq_with_gaps[T](a: Iterable[T], b: Iterable[T]) -> bool: 

98 aseq: Sequence[T] = iter_to_seq(a) 

99 return count(common_subseq_with_gaps(aseq, b)) == len(aseq) 

100 

101 

102def align[T]( 

103 a: Iterable[T], 

104 b: Iterable[T], 

105 *, 

106 cost: Callable[[T, T], float] | None = None, 

107 bound: float = math.inf, 

108 default: T = None, 

109) -> tuple[float, tuple[Iterable[T | None], Iterable[T | None]]] | None: 

110 def merge( 

111 prev: tuple[float, tuple[Sequence[T | None], Sequence[T | None]]] | None, 

112 curr: tuple[T, T], 

113 ) -> tuple[float, tuple[Sequence[T | None], Sequence[T | None]]] | None: 

114 if prev is None: 

115 return None 

116 

117 prevcost: float 

118 l: Sequence[T | None] 

119 r: Sequence[T | None] 

120 prevcost, (l, r) = prev 

121 x: T 

122 y: T 

123 x, y = curr 

124 

125 currcost: float = prevcost + costfunc(x, y) 

126 if currcost > bound: 

127 return None 

128 

129 return currcost, ([*l, x], [*r, y]) 

130 

131 @cache 

132 def align_rec(alen: int, blen: int) -> tuple[ 

133 float, 

134 tuple[Sequence[T | None], Sequence[T | None]], 

135 ] | None: 

136 if alen == 0 or blen == 0: 

137 res: tuple[Sequence[T], Sequence[T]] = ( 

138 [default] * blen, bseq[:blen], 

139 ) if alen == 0 else ( 

140 aseq[:alen], [default] * alen, 

141 ) 

142 

143 return ( 

144 sum(costfunc(x, y) for x, y in zip(*res, strict=False)), 

145 res, 

146 ) 

147 

148 return min( 

149 ( 

150 merge(align_rec(alen - 1, blen), (aseq[alen - 1], default)), 

151 merge(align_rec(alen, blen - 1), (default, bseq[blen - 1])), 

152 merge(align_rec(alen - 1, blen - 1), (aseq[alen - 1], bseq[blen - 1])), 

153 ), 

154 key=lambda x: x[0] if x else math.inf, 

155 default=None, 

156 ) 

157 

158 aseq: Sequence[T] = iter_to_seq(a) 

159 bseq: Sequence[T] = iter_to_seq(b) 

160 

161 if cost is None: 

162 costfunc: Callable[[T, T], float] = lambda x, y: 0 if x == y else 1 

163 

164 return align_rec(len(aseq), len(bseq)) 

165 

166 

167def sorted_by_rank[T]( 

168 data: Iterable[T], 

169 ranks: Iterable[Comparable], 

170 *, 

171 _reverse: bool = False, 

172) -> Iterable[T]: 

173 return [ 

174 v for _, v in sorted( 

175 zip(ranks, data, strict=True), 

176 key=lambda x: x[0], 

177 reverse=_reverse, 

178 ) 

179 ] 

180 

181 

182def seq_to_grams[T]( 

183 seq: Iterable[T], 

184 n: int, 

185 pad: T | None = None, 

186) -> Iterable[Iterable[T]]: 

187 if pad is not None: 

188 seq = chain(repeat(pad, n - 1), seq, repeat(pad, n - 1)) 

189 

190 return sliding_window(n, seq)