Coverage for src/extratools_html/__init__.py: 50%
101 statements
« prev ^ index » next coverage.py v7.8.1, created at 2025-05-29 18:41 -0700
1from __future__ import annotations
3import asyncio
4import ssl
5from collections.abc import Iterable
6from contextlib import suppress
7from datetime import UTC, datetime, timedelta
8from enum import StrEnum
9from http import HTTPStatus
10from typing import Any, cast
11from urllib.parse import urlparse
13import backoff
14import httpx
15import minify_html
16import truststore
17from blob_dict.blob import StrBlob
18from blob_dict.dict.path import LocalPath, PathBlobDict
19from extratools_core.typing import PathLike
20from html2text import HTML2Text
22with suppress(ImportError):
23 from playwright.async_api import Browser, async_playwright, expect
24 from playwright.async_api import TimeoutError as PlaywrightTimeoutError
26from .cleanup import cleanup_page
# Retry limits for the `backoff` decorator on `download_page_async`:
# at most MAX_TRIES attempts, within MAX_TIMEOUT seconds total.
MAX_TRIES: int = 3
MAX_TIMEOUT: int = 60
# Per-request timeout (seconds) for the plain-HTTP fetch path
REQUEST_TIMEOUT: int = 10
# In milliseconds
PRE_ACTION_TIMEOUT: int = 10 * 1_000
# TLS context backed by the OS trust store (used by httpx below)
ctx = truststore.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
# TODO: Make cache path/TTL configurable via configuration file (in TOML).
# It will also allow support for other non-local path (like `CloudPath`).
CACHE_PATH: PathLike = LocalPath("~/.http-cache").expanduser()
CACHE_TTL: timedelta = timedelta(days=1)
# On-disk cache of downloaded page HTML, keyed by `get_cache_key()`
cache = PathBlobDict(CACHE_PATH, blob_class=StrBlob)
cache.create()
class PageElementAction(StrEnum):
    """Pre-actions that can be applied to a page element before scraping."""

    CLICK = "click"
    TO_BE_VISIBLE = "to_be_visible"
async def __download_via_request(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    """Fetch `page_url` over plain HTTP and return the response body.

    Returns `None` on HTTP 429 so the caller's backoff logic can retry.
    Raises for any other non-success status.
    """
    headers: dict[str, str] = {}
    if user_agent:
        headers["User-Agent"] = user_agent

    # https://www.python-httpx.org/advanced/ssl/
    async with httpx.AsyncClient(verify=ctx) as client:
        resp: httpx.Response = await client.get(
            page_url,
            follow_redirects=True,
            timeout=REQUEST_TIMEOUT,
            headers=headers,
        )

        if resp.status_code == HTTPStatus.TOO_MANY_REQUESTS:
            # It also triggers backoff if necessary
            return None

        resp.raise_for_status()

        return resp.text
78async def __download_via_browser(
79 page_url: str,
80 *,
81 user_agent: str | None = None,
82 pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
83) -> str | None:
84 async with async_playwright() as playwright:
85 browser: Browser = await playwright.chromium.launch()
86 await browser.new_context(
87 user_agent=user_agent,
88 )
90 page = await browser.new_page()
91 await page.route(
92 "**/*",
93 lambda route: (
94 route.abort()
95 # https://playwright.dev/python/docs/api/class-request#request-resource-type
96 if route.request.resource_type in {
97 "font",
98 "image",
99 "media",
100 }
101 else route.continue_()
102 ),
103 )
104 response = await page.goto(page_url)
105 if not response:
106 return None
107 if response.status == HTTPStatus.TOO_MANY_REQUESTS:
108 # It also triggers backoff if necessary
109 return None
111 for selector, action in pre_actions or []:
112 with suppress(AssertionError, PlaywrightTimeoutError):
113 match action:
114 case PageElementAction.CLICK:
115 await page.locator(selector).click(
116 timeout=PRE_ACTION_TIMEOUT,
117 # Allow click even current element is covered by other elements.
118 # Otherwise, other pre-actions are needed before this pre-action
119 # to dismiss those covering elements.
120 # However, it is possible that dismissing those covering elements
121 # is necessary logic for page to function properly.
122 force=True,
123 )
124 case PageElementAction.TO_BE_VISIBLE:
125 await expect(page.locator(selector)).to_be_visible(
126 timeout=PRE_ACTION_TIMEOUT,
127 )
129 html: str = await page.content()
131 await browser.close()
133 return html
def get_cache_key(page_url: str) -> str:
    """Map `page_url` to a relative filesystem path used as its cache key.

    Layout is `<netloc>/<path>[?<query>]`. A `?` is appended as a default
    "filename" for directory-style paths so the key never ends in `/`,
    and `:` in the netloc is replaced since it is reserved on some filesystems.
    """
    parse_result = urlparse(page_url)

    # Need to handle reserved characters for filename
    # https://en.wikipedia.org/wiki/Filename#Reserved_characters_and_words
    # (renamed from `dir`, which shadowed the builtin)
    host_dir: str = parse_result.netloc.replace(":", "_")

    path: str = parse_result.path or "/"
    # Add default filename
    if path.endswith("/"):
        path += "?"

    if parse_result.query:
        if not path.endswith("/?"):
            path += "?"
        path += parse_result.query

    return host_dir + path
def is_cache_expired(cache_key: str) -> bool:
    """Tell whether the cache entry for `cache_key` is missing or stale."""
    entry: PathLike = CACHE_PATH / cache_key
    if not entry.exists():
        # A missing entry counts as expired so the caller re-downloads
        return True

    modified_at = datetime.fromtimestamp(entry.stat().st_mtime, UTC)
    age = datetime.now(UTC) - modified_at
    return age > CACHE_TTL
@backoff.on_predicate(
    backoff.expo,
    max_tries=MAX_TRIES,
    max_time=MAX_TIMEOUT,
)
async def download_page_async(
    page_url: str,
    *,
    cleanup: bool = False,
    text_only: bool = False,
    minify: bool = True,
    user_agent: str | None = None,
    use_browser: bool = False,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
    use_cache: bool = True,
) -> str | None:
    """Download `page_url` (optionally via headless browser), with caching.

    Returns the page HTML — minified, cleaned, and/or converted to text per
    the flags — or `None` on failure/HTTP 429, which triggers the backoff
    decorator to retry.
    """
    cache_key: str = get_cache_key(page_url)

    page_html: str | None
    from_cache: bool = use_cache and not is_cache_expired(cache_key)
    if from_cache:
        page_html = cast("StrBlob", cache[cache_key]).as_str()
    elif use_browser:
        page_html = await __download_via_browser(
            page_url,
            user_agent=user_agent,
            pre_actions=pre_actions,
        )
    else:
        page_html = await __download_via_request(
            page_url,
            user_agent=user_agent,
        )
    if page_html is None:
        return None

    if not from_cache:
        # FIX: only persist freshly downloaded content. The original rewrote
        # the entry on every cache hit too, refreshing its mtime and turning
        # the fixed TTL into a sliding window — a frequently-read entry would
        # never expire and could serve stale content indefinitely.
        cache[cache_key] = StrBlob(page_html)

    if minify:
        page_html = minify_html.minify(page_html)

    if cleanup:
        page_html = await cleanup_page(page_html)

    if text_only:
        converter = HTML2Text()
        converter.ignore_images = True
        converter.ignore_links = True
        return converter.handle(page_html)

    return page_html
def download_page(
    page_url: str,
    **kwargs: Any,
) -> str | None:
    """Synchronous convenience wrapper around `download_page_async`.

    All keyword arguments are forwarded unchanged.
    """
    coro = download_page_async(page_url, **kwargs)
    return asyncio.run(coro)