Coverage for src/extratools_html/__init__.py: 41%
51 statements
coverage.py v7.8.0, created at 2025-04-28 19:41 -0700
from __future__ import annotations

import asyncio
from contextlib import suppress
from http import HTTPStatus
from typing import Any

import backoff
import httpx
import truststore
from html2text import HTML2Text

with suppress(ImportError):
    from playwright.async_api import Browser, async_playwright

from .cleanup import cleanup_page

truststore.inject_into_ssl()

MAX_TRIES: int = 3
MAX_TIMEOUT: int = 60
REQUEST_TIMEOUT: int = 10
async def __download_via_request(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    async with httpx.AsyncClient() as client:
        response: httpx.Response = await client.get(
            page_url,
            follow_redirects=True,
            timeout=REQUEST_TIMEOUT,
            headers=(
                {
                    "User-Agent": user_agent,
                } if user_agent
                else {}
            ),
        )

    if response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
        # Returning None triggers a backoff retry if necessary
        return None

    response.raise_for_status()

    return response.text
async def __download_via_browser(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    async with async_playwright() as playwright:
        browser: Browser = await playwright.chromium.launch()
        # Create the page from this context so the user agent is actually applied
        context = await browser.new_context(
            user_agent=user_agent,
        )

        page = await context.new_page()
        response = await page.goto(page_url)
        if not response:
            return None
        if response.status == HTTPStatus.TOO_MANY_REQUESTS:
            # Returning None triggers a backoff retry if necessary
            return None

        html: str = await page.content()

        await browser.close()

    return html
# Retries with exponential backoff when the call returns a falsy value
# (e.g. None on HTTP 429)
@backoff.on_predicate(
    backoff.expo,
    max_tries=MAX_TRIES,
    max_time=MAX_TIMEOUT,
)
async def download_page_async(
    page_url: str,
    *,
    cleanup: bool = False,
    text_only: bool = False,
    user_agent: str | None = None,
    use_browser: bool = False,
) -> str | None:
    page_html: str | None = await (
        __download_via_browser if use_browser
        else __download_via_request
    )(page_url, user_agent=user_agent)
    if page_html is None:
        return page_html

    if cleanup:
        page_html = await cleanup_page(page_html)

    if text_only:
        h = HTML2Text()
        h.ignore_images = True
        h.ignore_links = True
        return h.handle(page_html)

    return page_html
def download_page(
    page_url: str,
    **kwargs: Any,
) -> str | None:
    return asyncio.run(download_page_async(
        page_url,
        **kwargs,
    ))
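
Usage sketch (not part of the measured module above): a minimal example assuming the package is importable as extratools_html and, for the browser path, that the optional Playwright dependency is installed; the URL is a placeholder.

import asyncio

from extratools_html import download_page, download_page_async


def sync_example() -> None:
    # Synchronous wrapper: runs its own event loop via asyncio.run()
    text = download_page("https://example.com", cleanup=True, text_only=True)
    print(text is not None)


async def async_example() -> None:
    # Async API, useful when already inside an event loop;
    # use_browser=True renders the page with Playwright's Chromium
    html = await download_page_async("https://example.com", use_browser=True)
    print(html is not None)


if __name__ == "__main__":
    sync_example()
    asyncio.run(async_example())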