Coverage for src/extratools_html/__init__.py: 39%
49 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-17 05:28 -0700
1from __future__ import annotations
3import asyncio
4from contextlib import suppress
5from http import HTTPStatus
6from typing import Any
8import backoff
9import httpx
10from html2text import HTML2Text
12with suppress(ImportError):
13 from playwright.async_api import Browser, async_playwright
15from .cleanup import cleanup_page
17MAX_TRIES: int = 3
18MAX_TIMEOUT: int = 60
19REQUEST_TIMEOUT: int = 10
async def __download_via_request(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    """Fetch *page_url* over plain HTTP and return the response body.

    Returns ``None`` on HTTP 429 (Too Many Requests) so the caller's
    backoff predicate can retry; raises for any other error status.
    """
    request_headers: dict[str, str] = {}
    if user_agent:
        request_headers["User-Agent"] = user_agent

    async with httpx.AsyncClient() as client:
        response: httpx.Response = await client.get(
            page_url,
            follow_redirects=True,
            timeout=REQUEST_TIMEOUT,
            headers=request_headers,
        )

        # 429 is signalled as None so the backoff decorator on the
        # caller treats it as a retryable outcome.
        if response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
            return None

        response.raise_for_status()

        return response.text
async def __download_via_browser(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    """Render *page_url* in headless Chromium and return the page HTML.

    Returns ``None`` when navigation yields no response or on HTTP 429
    (Too Many Requests), so the caller's backoff predicate can retry.
    """
    async with async_playwright() as playwright:
        browser: Browser = await playwright.chromium.launch()
        try:
            # BUG FIX: previously the context returned by new_context()
            # was discarded and the page was opened via browser.new_page(),
            # which uses a fresh default context — the custom user agent
            # was never applied. Open the page on the configured context.
            context = await browser.new_context(
                user_agent=user_agent,
            )
            page = await context.new_page()

            response = await page.goto(page_url)
            if not response:
                return None
            if response.status == HTTPStatus.TOO_MANY_REQUESTS:
                # It also triggers backoff if necessary
                return None

            return await page.content()
        finally:
            # Close the browser on every path, including the early
            # returns above (previously leaked until playwright teardown).
            await browser.close()
@backoff.on_predicate(
    backoff.expo,
    max_tries=MAX_TRIES,
    max_time=MAX_TIMEOUT,
)
async def download_page_async(
    page_url: str,
    *,
    cleanup: bool = False,
    text_only: bool = False,
    user_agent: str | None = None,
    use_browser: bool = False,
) -> str | None:
    """Download *page_url*, optionally cleaning the HTML or converting it to text.

    A ``None`` result (e.g. on HTTP 429) triggers the ``backoff``
    predicate and the download is retried with exponential delay.
    """
    downloader = __download_via_browser if use_browser else __download_via_request
    page_html: str | None = await downloader(page_url, user_agent=user_agent)
    if page_html is None:
        return None

    if cleanup:
        page_html = await cleanup_page(page_html)

    if not text_only:
        return page_html

    converter = HTML2Text()
    converter.ignore_images = True
    converter.ignore_links = True
    return converter.handle(page_html)
def download_page(
    image_url: str,
    **kwargs: Any,
) -> str | None:
    """Synchronous wrapper around :func:`download_page_async`.

    NOTE(review): the parameter is named ``image_url`` but is forwarded
    as a page URL — likely a copy/paste misnomer; the name is kept so
    keyword callers are not broken.
    """
    coro = download_page_async(image_url, **kwargs)
    return asyncio.run(coro)