Coverage for src/extratools_html/__init__.py: 42%
67 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-05-03 21:00 -0700
1from __future__ import annotations
3import asyncio
4from collections.abc import Iterable
5from contextlib import suppress
6from enum import StrEnum
7from http import HTTPStatus
8from typing import Any
10import backoff
11import httpx
12import truststore
13from html2text import HTML2Text
15with suppress(ImportError):
16 from playwright.async_api import Browser, async_playwright, expect
17 from playwright.async_api import TimeoutError as PlaywrightTimeoutError
19from .cleanup import cleanup_page
21truststore.inject_into_ssl()
# Maximum retry attempts for a download (consumed by the backoff decorator below)
MAX_TRIES: int = 3
# Maximum total time in seconds spent across all backoff retries
MAX_TIMEOUT: int = 60
# Per-request timeout in seconds for the plain-HTTP download path
REQUEST_TIMEOUT: int = 10
# In milliseconds (passed to Playwright locator/expect timeouts)
PRE_ACTION_TIMEOUT: int = 10 * 1_000
class PageElementAction(StrEnum):
    """Action to perform on a page element before the page HTML is captured."""

    # Click the element (e.g. to dismiss an overlay or load more content)
    CLICK = "click"
    # Wait for the element to become visible
    TO_BE_VISIBLE = "to_be_visible"
async def __download_via_request(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    """Fetch the page at *page_url* with a plain HTTP GET.

    Returns the response body as text, or ``None`` on HTTP 429 so the
    caller's backoff logic can retry. Raises for other error statuses.
    """
    request_headers: dict[str, str] = {}
    if user_agent:
        request_headers["User-Agent"] = user_agent

    async with httpx.AsyncClient() as client:
        resp = await client.get(
            page_url,
            follow_redirects=True,
            timeout=REQUEST_TIMEOUT,
            headers=request_headers,
        )

    if resp.status_code == HTTPStatus.TOO_MANY_REQUESTS:
        # Returning None triggers backoff in the caller if necessary
        return None

    resp.raise_for_status()

    return resp.text
62async def __download_via_browser(
63 page_url: str,
64 *,
65 user_agent: str | None = None,
66 pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
67) -> str | None:
68 async with async_playwright() as playwright:
69 browser: Browser = await playwright.chromium.launch()
70 await browser.new_context(
71 user_agent=user_agent,
72 )
74 page = await browser.new_page()
75 response = await page.goto(page_url)
76 if not response:
77 return None
78 if response.status == HTTPStatus.TOO_MANY_REQUESTS:
79 # It also triggers backoff if necessary
80 return None
82 for selector, action in pre_actions or []:
83 with suppress(AssertionError, PlaywrightTimeoutError):
84 match action:
85 case PageElementAction.CLICK:
86 await page.locator(selector).click(
87 timeout=PRE_ACTION_TIMEOUT,
88 # Allow click even current element is covered by other elements.
89 # Otherwise, other pre-actions are needed before this pre-action
90 # to dismiss those covering elements.
91 # However, it is possible that dismissing those covering elements
92 # is necessary logic for page to function properly.
93 force=True,
94 )
95 case PageElementAction.TO_BE_VISIBLE:
96 await expect(page.locator(selector)).to_be_visible(
97 timeout=PRE_ACTION_TIMEOUT,
98 )
100 html: str = await page.content()
102 await browser.close()
104 return html
@backoff.on_predicate(
    backoff.expo,
    max_tries=MAX_TRIES,
    max_time=MAX_TIMEOUT,
)
async def download_page_async(
    page_url: str,
    *,
    cleanup: bool = False,
    text_only: bool = False,
    user_agent: str | None = None,
    use_browser: bool = False,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
) -> str | None:
    """Download the page at *page_url*, with retries via backoff.

    Chooses the browser path when *use_browser* is set, else a plain HTTP
    request. Optionally cleans up the HTML (*cleanup*) and/or converts it to
    plain text (*text_only*). Returns ``None`` on failure, which the
    ``on_predicate`` decorator treats as a retryable outcome.
    """
    html: str | None
    if use_browser:
        html = await __download_via_browser(
            page_url,
            user_agent=user_agent,
            pre_actions=pre_actions,
        )
    else:
        html = await __download_via_request(
            page_url,
            user_agent=user_agent,
        )
    if html is None:
        return None

    if cleanup:
        html = await cleanup_page(html)

    if not text_only:
        return html

    converter = HTML2Text()
    converter.ignore_images = True
    converter.ignore_links = True
    return converter.handle(html)
def download_page(
    image_url: str,
    **kwargs: Any,
) -> str | None:
    """Synchronous wrapper around :func:`download_page_async`.

    NOTE(review): the first parameter is named ``image_url`` but is forwarded
    as the page URL — presumably a copy/paste slip; renaming it would break
    keyword callers, so it is kept as-is.
    """
    coro = download_page_async(image_url, **kwargs)
    return asyncio.run(coro)