Coverage for src/html2json/html2json.py: 0%
46 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-25 18:24 -0700
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-25 18:24 -0700
1import re
2from re import Match, Pattern
3from typing import Any, cast
5from pyquery import PyQuery
# A template maps each output key to an extraction rule; `collect` accepts a
# nested template (dict) or a list-shaped rule as the value.
Template = dict[str, Any]

# Extracted result; `collect` builds it with the same keys as the template.
Data = dict[str, Any]

# Parser for the cleaner mini-language, e.g. "s/search/sub/g" or "/search/".
# The separator can be any single non-word character and must not occur
# inside the search or replacement parts.  The pieces below concatenate to a
# single pattern string.
CLEANER_REGEX: Pattern[str] = re.compile(
    r"(?P<mode>s)?"  # optional "s" prefix selects substitute mode
    r"(?P<sep>\W)"  # separator character, reused via (?P=sep)
    r"(?P<search>(?:(?!(?P=sep)).)*)"  # search regex: anything but the separator
    r"(?P=sep)"
    # Optional tail: replacement text, closing separator, optional "g" flag.
    r"(?:(?P<sub>(?:(?!(?P=sep)).)*)(?P=sep)(?P<flag>g)?)?"
)
def __extract(
    root: PyQuery,
    selector: str | None,
    prop: str | None,
    cleaners: list[str],
) -> str | None:
    """Extract one string value from ``root`` and run it through ``cleaners``.

    Args:
        root: Element set to search in.
        selector: CSS selector applied below ``root``; when falsy, ``root``
            itself is used.
        prop: Attribute name to read; when falsy, the element text is used.
        cleaners: Cleaner expressions parsed by ``CLEANER_REGEX`` and applied
            in order.  ``s<sep>search<sep>sub<sep>[g]`` substitutes the first
            match (every match with ``g``); ``<sep>search<sep>`` replaces the
            value with the first match of ``search``.  ``None`` is treated as
            an empty list.

    Returns:
        The stripped, cleaned value, or ``None`` when the selector is invalid
        or matches nothing, the requested attribute is missing, or an
        extraction cleaner finds no match.

    Raises:
        ValueError: If a cleaner expression is malformed.
    """
    try:
        tag = root.find(selector) if selector else root
    except Exception:  # noqa: BLE001
        # Invalid selector.  Kept broad because the selector is user-supplied
        # and the failure is reported as "no value"; unlike a bare ``except``
        # this lets KeyboardInterrupt / SystemExit propagate.
        return None

    # Non-matching selector
    if not tag:
        return None

    v: str
    if prop:
        attr = tag.attr(prop)
        if attr is None:
            # Missing attribute: report "no value" rather than the string "None".
            return None
        v = str(attr)
    else:
        # Prefer the element's own direct text nodes; fall back to the full
        # text (descendants included) when there are none.
        v = "".join(c for c in tag.contents() if isinstance(c, str))
        if not v:
            v = str(tag.text())
    v = v.strip()

    for cleaner in cleaners or ():
        m = CLEANER_REGEX.match(cleaner)
        if m is None:
            raise ValueError(f"Invalid cleaner expression: {cleaner!r}")

        search = m.group("search")
        if m.group("mode") == "s":
            sub = m.group("sub")
            if sub is None:
                raise ValueError(
                    f"Substitute cleaner is missing its replacement part: {cleaner!r}"
                )
            # re.sub treats count=0 as "replace all".
            v = re.sub(search, sub, v, count=0 if m.group("flag") == "g" else 1)
        else:
            found = re.search(search, v)
            if found is None:
                # Nothing to extract from this value.
                return None
            v = found.group(0)

    return v
def collect(html: str, template: Template) -> Data:
    """Extract structured data from ``html`` as described by ``template``.

    Each template value is a rule, interpreted by its shape:

    * ``dict`` -- nested template, evaluated against the same root.
    * ``[[selector, sub_template]]`` -- list of results, one per element
      matched by ``selector``, each evaluated with ``sub_template``.
    * ``[selector, sub_template_dict]`` -- nested template evaluated against
      the elements matched by ``selector``.
    * ``[selector, prop, cleaners]`` -- single value produced by ``__extract``.

    Rules of any other shape are skipped and their key is left out of the
    result.

    Args:
        html: HTML document text.
        template: Mapping of output keys to rules.

    Returns:
        A dict holding one entry per recognised rule in ``template``.
    """

    def collect_rec(root: PyQuery, template: Template) -> Data:
        data: Data = {}
        for key, rule in template.items():
            if isinstance(rule, dict):
                data[key] = collect_rec(root, rule)
            elif isinstance(rule, list):
                if len(rule) == 1 and isinstance(rule[0], list):
                    sub_selector, sub_template = rule[0]
                    # .items() yields PyQuery wrappers; iterating the result
                    # directly would yield raw lxml elements, which lack the
                    # PyQuery API the recursion relies on.
                    data[key] = [
                        collect_rec(sub_root, sub_template)
                        for sub_root in root.find(sub_selector).items()
                    ]
                elif len(rule) == 2 and isinstance(rule[1], dict):
                    sub_selector, sub_template = rule
                    data[key] = collect_rec(root.find(sub_selector), sub_template)
                elif len(rule) == 3:
                    data[key] = __extract(root, *rule)
        return data

    return collect_rec(PyQuery(html), template)