Coverage for src/html2json/__init__.py: 97%

76 statements  

« prev     ^ index     » next       coverage.py v7.8.1, created at 2025-05-27 21:19 -0700

1from __future__ import annotations 

2 

3import json 

4import re 

5from re import Match, Pattern 

6from typing import Any, cast 

7 

8from pyquery import PyQuery 

9 

# Shape of a collection template: output keys mapped to extraction rules.
Template = dict[str, Any]

# Shape of the collected result.
Data = dict[str, Any]

# Parses one cleaner expression:
#   - optional `s` prefix (`mode`) selects substitution,
#   - the first non-word character (`sep`) is the delimiter,
#   - `search` is the text up to the next delimiter,
#   - an optional `sub` part and trailing `g` (`flag`) may follow.
__CLEANER_REGEX: Pattern[str] = re.compile(r"(?P<mode>s)?(?P<sep>\W)(?P<search>(?:(?!(?P=sep)).)*)(?P=sep)(?:(?P<sub>(?:(?!(?P=sep)).)*)(?P=sep)(?P<flag>g)?)?")  # noqa: E501


# Recognizes a selector ending in the `::text` pseudo-element; the optional
# `selector` group captures the element selector (with its trailing spaces).
__TEXT_NODES_SELECTOR_REGEX: Pattern[str] = re.compile(r"(?P<selector>.+ +)?::text")

# A run of whitespace that contains at least one newline.
__NEW_LINE_WHITESPACES_REGEX: Pattern[str] = re.compile(r"\s*\n+\s*")
# Non-newline whitespace at the very start or very end of the string.
__LEADING_TRAILING_WHITESPACES_REGEX: Pattern[str] = re.compile(r"^[^\S\n]+|[^\S\n]+$")

20 

21 

def __get_tags(
    root: PyQuery,
    selector: str | None = None,
) -> PyQuery | None:
    """Return the elements under ``root`` matched by ``selector``.

    Args:
        root: Element set to search from.
        selector: Selector handed to ``root.find``. When falsy, ``root``
            itself is used as the result.

    Returns:
        The matched elements, or ``None`` when ``find`` rejects the selector
        or when the resulting set is empty.
    """
    if not selector:
        tags: PyQuery = root
    else:
        try:
            tags = root.find(selector)
        except Exception:  # noqa: BLE001
            # Invalid selector. `Exception` (not a bare `except`) so that
            # `KeyboardInterrupt` / `SystemExit` still propagate.
            return None

    # Non-matching selector
    if len(tags) == 0:
        return None

    return tags

36 

37 

def __clean(
    v: str,
    cleaners: list[str] | None = None,
) -> str:
    """Run ``v`` through each cleaner expression, in order.

    A cleaner whose ``mode`` is ``s`` replaces ``search`` with ``sub`` (only
    the first occurrence unless the ``g`` flag is present). Any other cleaner
    replaces ``v`` with the first portion of it that matches ``search``.

    Args:
        v: Text to clean.
        cleaners: Cleaner expressions; ``None`` or empty leaves ``v`` as is.

    Returns:
        The cleaned text.

    Raises:
        AttributeError: If a cleaner is not recognized by the cleaner regex,
            or if an extraction cleaner's ``search`` does not match ``v``
            (neither match result is checked before use).
    """
    if not cleaners:
        return v

    for cleaner in cleaners:
        parsed: Match = cast("Match", __CLEANER_REGEX.match(cleaner))

        if parsed.group("mode") == "s":
            # `count=0` means "replace every occurrence" for `re.sub`
            limit = 0 if parsed.group("flag") == "g" else 1
            v = re.sub(
                parsed.group("search"),
                parsed.group("sub"),
                v,
                count=limit,
            )
        else:
            found = cast("Match", re.search(parsed.group("search"), v))
            v = found.group(0)

    return v

56 

57 

def __extract_text_nodes(
    root: PyQuery,
    selector: str | None = None,
    cleaners: list[str] | None = None,
) -> str | list[str] | None:
    """Extract the direct text nodes of the elements matched by ``selector``.

    Each text node is normalized before cleaning: whitespace runs containing
    a newline collapse to a single ``"\\n"``, then leading/trailing
    non-newline whitespace is replaced via the module's edge-whitespace regex.

    Args:
        root: Element set to search from.
        selector: Element selector; falsy means ``root`` itself.
        cleaners: Cleaner expressions applied to every text node.

    Returns:
        ``None`` when nothing matches or the matched elements have no text
        nodes, a single string when exactly one text node is found, otherwise
        the list of strings.
    """
    tags: PyQuery | None = __get_tags(root, selector)
    if not tags:
        return None

    results: list[str] = [
        __clean(
            __LEADING_TRAILING_WHITESPACES_REGEX.sub(
                r" ",
                __NEW_LINE_WHITESPACES_REGEX.sub(r"\n", node),
            ),
            cleaners,
        )
        # Must use `.items()` which returns `PyQuery` objects
        for tag in tags.items()
        for node in tag.contents()
        # `.contents()` mixes elements and text; only text comes back as `str`
        if isinstance(node, str)
    ]

    # Matched elements may contain no text nodes at all; indexing `results[0]`
    # unconditionally would raise `IndexError` in that case.
    if not results:
        return None

    return results if len(results) > 1 else results[0]

84 

85 

def __extract(
    root: PyQuery,
    selector: str | None = None,
    prop: str | None = None,
    cleaners: list[str] | None = None,
) -> str | list[str] | None:
    """Extract text or an attribute from the elements matched by ``selector``.

    Args:
        root: Element set to search from.
        selector: CSS selector; falsy means ``root`` itself. A selector ending
            in ``::text`` extracts text nodes instead of element text.
        prop: Attribute name to read; when falsy the element text is used.
            Not supported together with ``::text``.
        cleaners: Cleaner expressions applied to every extracted value.

    Returns:
        ``None`` when nothing matches (or ``prop`` is combined with
        ``::text``), a single string for one result, otherwise a list.
    """
    # CSS standard does not support text node yet
    # https://github.com/w3c/csswg-drafts/issues/2208
    # Ideally, we should customize `cssselect` to add support for this new pseudo-class
    # https://cssselect.readthedocs.io/en/latest/#customizing-the-translation
    if selector and (
        text_nodes_selector_match := __TEXT_NODES_SELECTOR_REGEX.fullmatch(selector)
    ):
        if prop:
            return None

        # The `selector` group is optional: a bare "::text" leaves it `None`,
        # which previously crashed on `.strip()`. Treat that as "text nodes of
        # the root itself".
        element_selector: str = (
            text_nodes_selector_match.group("selector") or ""
        ).strip()

        return __extract_text_nodes(
            root,
            element_selector or None,
            cleaners,
        )

    tags: PyQuery | None = __get_tags(root, selector)
    if tags is None:
        return None

    results: list[str] = []

    # Must use `.items()` which returns `PyQuery` objects
    for tag in tags.items():
        # NOTE(review): `str()` turns a `None` from `attr` (presumably a
        # missing attribute) into the literal "None" -- confirm this is wanted.
        v: str = str(
            tag.attr(prop) if prop
            else tag.text(),
        ).strip()

        results.append(__clean(v, cleaners))

    return results if len(results) > 1 else results[0]

124 

125 

def __collect_keys(root: PyQuery, key_template: str) -> list[str]:
    """Resolve a template key into the concrete output key(s).

    A key wrapped in ``[...]`` is parsed as a JSON array whose items are
    passed to ``__extract`` as ``selector, prop, cleaners``; every extracted
    value becomes a key. Any other key is used literally.

    Args:
        root: Element set that dynamic keys are extracted from.
        key_template: Literal key or JSON-array extraction rule.

    Returns:
        The list of keys; empty when a dynamic key extracts nothing.
    """
    # `startswith`/`endswith` rather than indexing, so an empty key is treated
    # as a literal key and does not raise `IndexError`.
    if key_template.startswith("[") and key_template.endswith("]"):
        keys: str | list[str] = __extract(root, *json.loads(key_template)) or []
        return keys if isinstance(keys, list) else [keys]

    return [key_template]

132 

133 

def __expand_template(root: PyQuery, template: Template) -> Template:
    """Return ``template`` with every key replaced by its resolved key(s).

    Each template key is resolved through ``__collect_keys``; the rule is
    repeated under every resulting key. A later entry overwrites an earlier
    one that resolves to the same key.
    """
    expanded: Template = {}

    for key_template, rule in template.items():
        for resolved_key in __collect_keys(root, key_template):
            expanded[resolved_key] = rule

    return expanded

140 

141 

def collect(html: str, template: Template) -> Data:
    """Collect structured data from ``html`` as described by ``template``.

    Each (expanded) template key maps to one of these rules:

    - a ``dict``: nested template, evaluated against the same root;
    - ``[[selector, sub_template]]``: a list with one object per element
      matched by ``selector``;
    - ``[selector, sub_template]`` with a ``dict`` sub-template: one object
      evaluated against the elements matched by ``selector``;
    - any other list: passed to ``__extract`` as ``selector, prop, cleaners``
      (an empty list extracts from the root itself).

    Selectors are run through ``str.format(key=<output key>)``, so ``{key}``
    is replaced by the output key. Rules that are neither ``dict`` nor
    ``list`` are skipped.

    Args:
        html: HTML document to parse.
        template: Collection template.

    Returns:
        The collected data, mirroring the template's structure.
    """
    def scope(root: PyQuery, selector_template: str | None, key: str) -> PyQuery:
        # An empty/missing sub-selector means "stay on the current root";
        # passing `None` to `find` would fail instead.
        sub_selector = selector_template.format(key=key) if selector_template else None
        return root.find(sub_selector) if sub_selector else root

    def collect_rec(root: PyQuery, template: Template, data: Data) -> None:
        for (t, s) in __expand_template(root, template).items():
            if isinstance(s, dict):
                data[t] = {}
                collect_rec(root, s, data[t])
            elif isinstance(s, list):
                if len(s) == 1 and isinstance(s[0], list):
                    sub_selector, sub_template = s[0]

                    data[t] = []
                    # Must use `.items()` which returns `PyQuery` objects
                    for sub_root in scope(root, sub_selector, t).items():
                        data[t].append({})
                        collect_rec(sub_root, sub_template, data[t][-1])
                elif len(s) == 2 and isinstance(s[1], dict):
                    sub_selector, sub_template = s[0], s[1]

                    data[t] = {}
                    collect_rec(scope(root, sub_selector, t), sub_template, data[t])
                else:
                    data[t] = (
                        __extract(root, s[0].format(key=t) if s[0] else None, *s[1:]) if s
                        else __extract(root)
                    )

    data: Data = {}
    collect_rec(PyQuery(html), template, data)

    return data