Coverage for src/html2json/__init__.py: 97%
76 statements
« prev ^ index » next coverage.py v7.8.1, created at 2025-05-27 21:19 -0700
« prev ^ index » next coverage.py v7.8.1, created at 2025-05-27 21:19 -0700
1from __future__ import annotations
3import json
4import re
5from re import Match, Pattern
6from typing import Any, cast
8from pyquery import PyQuery
# A template maps output keys (or JSON-encoded key selectors) to extraction
# rules; the collected data mirrors that shape with extracted values.
Template = dict[str, Any]
Data = dict[str, Any]

# Cleaner syntax, parsed into named groups:
#   mode   - optional leading "s", selecting substitution
#   sep    - any single non-word character used as the delimiter
#   search - regular expression, free of the delimiter
#   sub    - optional replacement text, free of the delimiter
#   flag   - optional trailing "g", meaning replace every occurrence
__CLEANER_REGEX: Pattern = re.compile(
    r"(?P<mode>s)?(?P<sep>\W)(?P<search>(?:(?!(?P=sep)).)*)(?P=sep)(?:(?P<sub>(?:(?!(?P=sep)).)*)(?P=sep)(?P<flag>g)?)?",  # noqa: E501
)

# A selector ending in the `::text` pseudo-element, optionally preceded by a
# parent selector that is separated from it by at least one space
__TEXT_NODES_SELECTOR_REGEX: Pattern = re.compile(
    r"(?P<selector>.+ +)?::text",
)

# One or more line breaks together with any whitespace around them
__NEW_LINE_WHITESPACES_REGEX: Pattern = re.compile(
    r"\s*\n+\s*",
)

# A run of non-newline whitespace at the very start or the very end
__LEADING_TRAILING_WHITESPACES_REGEX: Pattern = re.compile(
    r"^[^\S\n]+|[^\S\n]+$",
)
def __get_tags(
    root: PyQuery,
    selector: str | None = None,
) -> PyQuery | None:
    """Select the elements under ``root`` that match ``selector``.

    Args:
        root: Element set to search from.
        selector: Selector handed to ``root.find``. When it is ``None`` or
            empty, ``root`` itself is the selection.

    Returns:
        The selection, or ``None`` when it is empty or when the lookup
        raised an exception (e.g. an invalid selector).
    """
    try:
        tags: PyQuery = root.find(selector) if selector else root
    except Exception:  # noqa: BLE001
        # Invalid selector. Only `Exception` is swallowed so that
        # `KeyboardInterrupt` and `SystemExit` still propagate.
        return None

    # Non-matching selector
    if len(tags) == 0:
        return None

    return tags
def __clean(
    v: str,
    cleaners: list[str] | None = None,
) -> str:
    """Run ``v`` through each cleaner in order and return the result.

    Each cleaner is parsed with ``__CLEANER_REGEX``. A cleaner that starts
    with ``s`` (e.g. ``s/search/sub/`` or ``s/search/sub/g``) substitutes the
    first match of ``search``, or every match when the ``g`` flag is given.
    Any other cleaner (e.g. ``/search/``) replaces the value with the first
    match of ``search``.

    Args:
        v: Value to clean.
        cleaners: Cleaner expressions; ``None`` or empty leaves ``v`` as is.

    Returns:
        The cleaned value.

    Raises:
        ValueError: If a cleaner cannot be parsed, a substitution cleaner
            has no replacement part, or an extraction cleaner matches
            nothing in the current value.
        re.error: If ``search`` is not a valid regular expression.
    """
    for c in cleaners or []:
        m: Match | None = __CLEANER_REGEX.match(c)
        if m is None:
            raise ValueError(f"Invalid cleaner: {c!r}")

        search: str = m.group("search")

        if m.group("mode") == "s":
            sub: str | None = m.group("sub")
            if sub is None:
                # e.g. "s/foo/": the replacement part is optional in the
                # grammar, but a substitution cannot run without it
                raise ValueError(f"Cleaner {c!r} has no replacement")

            v = re.sub(
                search,
                sub,
                v,
                # `count=0` means "replace all occurrences"
                count=(0 if m.group("flag") == "g" else 1),
            )
            continue

        found: Match | None = re.search(search, v)
        if found is None:
            raise ValueError(f"Cleaner {c!r} does not match {v!r}")

        v = found.group(0)

    return v
def __extract_text_nodes(
    root: PyQuery,
    selector: str | None = None,
    cleaners: list[str] | None = None,
) -> str | list[str] | None:
    """Extract the text items contained in the selected elements.

    For every selected element, the ``str`` items of ``tag.contents()`` are
    kept. Each is normalised (line breaks and their surrounding whitespace
    collapse to a single ``"\\n"``; a leading or trailing run of other
    whitespace collapses to a single space) and then passed to ``__clean``.

    Args:
        root: Element set to search from.
        selector: Selector of the parent elements; falsy means ``root``.
        cleaners: Cleaner expressions applied to every text item.

    Returns:
        ``None`` when nothing is selected or no text item is found, the
        single text when exactly one is found, otherwise the list of texts.
    """
    tags: PyQuery | None = __get_tags(root, selector)
    if not tags:
        return None

    results: list[str] = []

    # Must use `.items()` which returns `PyQuery` objects
    for tag in tags.items():
        results.extend(
            __clean(
                __LEADING_TRAILING_WHITESPACES_REGEX.sub(
                    r" ",
                    __NEW_LINE_WHITESPACES_REGEX.sub(r"\n", e),
                ),
                cleaners,
            )
            for e in tag.contents()
            if isinstance(e, str)
        )

    # Selected elements may hold no text item at all; indexing `results[0]`
    # would then raise `IndexError`
    if not results:
        return None

    return results if len(results) > 1 else results[0]
def __extract(
    root: PyQuery,
    selector: str | None = None,
    prop: str | None = None,
    cleaners: list[str] | None = None,
) -> str | list[str] | None:
    """Extract cleaned text or attribute values from the selected elements.

    Args:
        root: Element set to search from.
        selector: Selector of the elements; falsy means ``root`` itself. A
            selector of the form ``"<parent> ::text"`` (or a bare
            ``"::text"``) extracts text items instead of elements.
        prop: Attribute to read; when falsy, the element text is used.
        cleaners: Cleaner expressions applied to every extracted value.

    Returns:
        ``None`` when nothing is selected (or when ``prop`` is combined with
        a ``::text`` selector), the single value when exactly one element is
        selected, otherwise the list of values.
    """
    # CSS standard does not support text node yet
    # https://github.com/w3c/csswg-drafts/issues/2208
    # Ideally, we should customize `cssselect` to add support for this new pseudo-class
    # https://cssselect.readthedocs.io/en/latest/#customizing-the-translation
    if selector and (
        text_nodes_selector_match := __TEXT_NODES_SELECTOR_REGEX.fullmatch(selector)
    ):
        # Text nodes have no attributes
        if prop:
            return None

        # The `selector` group is optional: a bare "::text" leaves it unset,
        # in which case the text items of `root` itself are extracted
        parent_selector: str = (
            text_nodes_selector_match.group("selector") or ""
        ).strip()

        return __extract_text_nodes(
            root,
            parent_selector or None,
            cleaners,
        )

    tags: PyQuery | None = __get_tags(root, selector)
    if tags is None:
        return None

    results: list[str] = []

    # Must use `.items()` which returns `PyQuery` objects
    for tag in tags.items():
        # NOTE(review): if `tag.attr(prop)` returns `None` for a missing
        # attribute, `str()` turns it into the literal "None" - confirm
        # whether that is intended
        v: str = str(
            tag.attr(prop) if prop
            else tag.text(),
        ).strip()

        results.append(__clean(v, cleaners))

    return results if len(results) > 1 else results[0]
def __collect_keys(root: PyQuery, key_template: str) -> list[str]:
    """Resolve a template key into the concrete keys it stands for.

    A key written as a JSON array (``[...]``) is decoded and its items are
    passed to ``__extract`` as ``selector``, ``prop`` and ``cleaners``; the
    extracted values become the keys. Any other key is used verbatim.

    Args:
        root: Element set the key selector is evaluated against.
        key_template: Literal key or JSON-encoded extraction arguments.

    Returns:
        The resolved keys; empty when the extraction yields nothing.
    """
    # `startswith`/`endswith` also cope with an empty key, where indexing
    # `key_template[0]` would raise `IndexError`
    if key_template.startswith("[") and key_template.endswith("]"):
        keys: str | list[str] = __extract(root, *json.loads(key_template)) or []
        return keys if isinstance(keys, list) else [keys]

    return [key_template]
def __expand_template(root: PyQuery, template: Template) -> Template:
    """Replace every template key by the concrete keys it resolves to.

    Each key is resolved with ``__collect_keys`` against ``root``; every
    resulting key is mapped to the value of the key it came from. When two
    entries resolve to the same key, the later one wins.
    """
    expanded: Template = {}

    for key_template, value in template.items():
        for key in __collect_keys(root, key_template):
            expanded[key] = value

    return expanded
def collect(html: str, template: Template) -> Data:
    """Collect data from ``html`` following the shape of ``template``.

    Template keys are first expanded with ``__expand_template``. Each value
    then decides how the entry is filled:

    * a ``dict`` is a nested template collected against the same root;
    * ``[[selector, sub_template]]`` yields a list with one collected dict
      per element matched by ``selector``;
    * ``[selector, sub_template]`` (with a ``dict`` sub-template) yields one
      dict collected against the elements matched by ``selector``;
    * any other list is passed to ``__extract`` as ``selector``, ``prop``
      and ``cleaners``; an empty list extracts from the root itself;
    * values of any other type are ignored.

    Selectors may contain ``{key}``, which is replaced by the entry's key.

    Args:
        html: HTML document to parse.
        template: Template describing the data to collect.

    Returns:
        The collected data.
    """
    def select(root: PyQuery, selector: str | None) -> PyQuery:
        # A missing selector means "stay on the current root", the same
        # convention `__get_tags` follows; `find` needs a real selector
        return root.find(selector) if selector else root

    def collect_rec(root: PyQuery, template: Template, data: Data) -> None:
        for (t, s) in __expand_template(root, template).items():
            if isinstance(s, dict):
                data[t] = {}
                collect_rec(root, s, data[t])
            elif isinstance(s, list):
                if len(s) == 1 and isinstance(s[0], list):
                    sub_selector, sub_template = s[0]
                    sub_selector = sub_selector.format(key=t) if sub_selector else None

                    data[t] = []
                    # Must use `.items()` which returns `PyQuery` objects
                    for sub_root in select(root, sub_selector).items():
                        data[t].append({})
                        collect_rec(sub_root, sub_template, data[t][-1])
                elif len(s) == 2 and isinstance(s[1], dict):
                    sub_selector, sub_template = s[0], s[1]
                    sub_selector = sub_selector.format(key=t) if sub_selector else None

                    data[t] = {}
                    collect_rec(select(root, sub_selector), sub_template, data[t])
                else:
                    data[t] = (
                        __extract(root, s[0].format(key=t) if s[0] else None, *s[1:]) if s
                        else __extract(root)
                    )

    data: Data = {}
    collect_rec(PyQuery(html), template, data)

    return data