Coverage for src/extratools_html/__init__.py: 50%

101 statements  

coverage.py v7.8.1, created at 2025-05-29 18:41 -0700

from __future__ import annotations

import asyncio
import ssl
from collections.abc import Iterable
from contextlib import suppress
from datetime import UTC, datetime, timedelta
from enum import StrEnum
from http import HTTPStatus
from typing import Any, cast
from urllib.parse import urlparse

import backoff
import httpx
import minify_html
import truststore
from blob_dict.blob import StrBlob
from blob_dict.dict.path import LocalPath, PathBlobDict
from extratools_core.typing import PathLike
from html2text import HTML2Text

with suppress(ImportError):
    from playwright.async_api import Browser, async_playwright, expect
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

from .cleanup import cleanup_page

MAX_TRIES: int = 3
# In seconds
MAX_TIMEOUT: int = 60
REQUEST_TIMEOUT: int = 10
# In milliseconds
PRE_ACTION_TIMEOUT: int = 10 * 1_000

ctx = truststore.SSLContext(ssl.PROTOCOL_TLS_CLIENT)

# TODO: Make cache path/TTL configurable via a configuration file (in TOML).
# That would also allow supporting non-local paths (like `CloudPath`).
CACHE_PATH: PathLike = LocalPath("~/.http-cache").expanduser()
CACHE_TTL: timedelta = timedelta(days=1)

cache = PathBlobDict(CACHE_PATH, blob_class=StrBlob)
cache.create()


class PageElementAction(StrEnum):
    CLICK = "click"
    TO_BE_VISIBLE = "to_be_visible"
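
# Example (a sketch; the selectors are hypothetical): each pre-action pairs a
# CSS selector with a `PageElementAction`, and pre-actions run in order before
# the page content is captured:
#
#     pre_actions = [
#         ("#cookie-banner button.accept", PageElementAction.CLICK),
#         ("main article", PageElementAction.TO_BE_VISIBLE),
#     ]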


async def __download_via_request(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    # https://www.python-httpx.org/advanced/ssl/
    async with httpx.AsyncClient(verify=ctx) as client:
        response: httpx.Response = await client.get(
            page_url,
            follow_redirects=True,
            timeout=REQUEST_TIMEOUT,
            headers=(
                {
                    "User-Agent": user_agent,
                } if user_agent
                else {}
            ),
        )

        if response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
            # Returning None also triggers backoff if necessary
            return None

        response.raise_for_status()

        return response.text


async def __download_via_browser(
    page_url: str,
    *,
    user_agent: str | None = None,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
) -> str | None:
    async with async_playwright() as playwright:
        browser: Browser = await playwright.chromium.launch()
        # Create the page from a browser context so that the custom
        # user agent (if any) actually applies to the page.
        context = await browser.new_context(
            user_agent=user_agent,
        )

        page = await context.new_page()
        await page.route(
            "**/*",
            lambda route: (
                route.abort()
                # https://playwright.dev/python/docs/api/class-request#request-resource-type
                if route.request.resource_type in {
                    "font",
                    "image",
                    "media",
                }
                else route.continue_()
            ),
        )
        response = await page.goto(page_url)
        if not response:
            return None
        if response.status == HTTPStatus.TOO_MANY_REQUESTS:
            # Returning None also triggers backoff if necessary
            return None

        for selector, action in pre_actions or []:
            with suppress(AssertionError, PlaywrightTimeoutError):
                match action:
                    case PageElementAction.CLICK:
                        await page.locator(selector).click(
                            timeout=PRE_ACTION_TIMEOUT,
                            # Allow clicking even if this element is covered
                            # by other elements. Otherwise, extra pre-actions
                            # would be needed before this one to dismiss the
                            # covering elements. However, dismissing those
                            # covering elements may still be necessary for
                            # the page to function properly.
                            force=True,
                        )
                    case PageElementAction.TO_BE_VISIBLE:
                        await expect(page.locator(selector)).to_be_visible(
                            timeout=PRE_ACTION_TIMEOUT,
                        )

        html: str = await page.content()

        await browser.close()

        return html


def get_cache_key(page_url: str) -> str:
    parse_result = urlparse(page_url)

    # Handle characters that are reserved in filenames
    # https://en.wikipedia.org/wiki/Filename#Reserved_characters_and_words
    dir_name: str = parse_result.netloc.replace(":", "_")

    path: str = parse_result.path or "/"
    # Use "?" as the default filename for directory paths
    if path.endswith("/"):
        path += "?"

    if parse_result.query:
        if not path.endswith("/?"):
            path += "?"
        path += parse_result.query

    return dir_name + path
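
# Example mappings (a sketch; the URLs are hypothetical):
#
#     https://example.com:8080/a/b?x=1  ->  example.com_8080/a/b?x=1
#     https://example.com/a/            ->  example.com/a/?
#     https://example.com               ->  example.com/?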


def is_cache_expired(cache_key: str) -> bool:
    path: PathLike = CACHE_PATH / cache_key
    if not path.exists():
        return True

    return (
        datetime.now(UTC)
        - datetime.fromtimestamp(path.stat().st_mtime, UTC)
        > CACHE_TTL
    )


@backoff.on_predicate(
    backoff.expo,
    max_tries=MAX_TRIES,
    max_time=MAX_TIMEOUT,
)
async def download_page_async(
    page_url: str,
    *,
    cleanup: bool = False,
    text_only: bool = False,
    minify: bool = True,
    user_agent: str | None = None,
    use_browser: bool = False,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
    use_cache: bool = True,
) -> str | None:
    page_html: str | None
    cache_key: str = get_cache_key(page_url)

    if use_cache and not is_cache_expired(cache_key):
        page_html = cast("StrBlob", cache[cache_key]).as_str()
    else:
        if use_browser:
            page_html = await __download_via_browser(
                page_url,
                user_agent=user_agent,
                pre_actions=pre_actions,
            )
        else:
            page_html = await __download_via_request(
                page_url,
                user_agent=user_agent,
            )
        if page_html is None:
            return None

        # Write back only on a fresh download, so that a cache hit does not
        # refresh the entry's mtime and silently extend its TTL.
        cache[cache_key] = StrBlob(page_html)

    if minify:
        page_html = minify_html.minify(page_html)

    if cleanup:
        page_html = await cleanup_page(page_html)

    if text_only:
        h = HTML2Text()
        h.ignore_images = True
        h.ignore_links = True
        return h.handle(page_html)

    return page_html


def download_page(
    page_url: str,
    **kwargs: Any,
) -> str | None:
    return asyncio.run(download_page_async(
        page_url,
        **kwargs,
    ))
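

# Example usage (a sketch; the URL is hypothetical). This downloads via a
# headless browser, converts the page to plain text, and caches the raw HTML
# under CACHE_PATH for CACHE_TTL:
#
#     text: str | None = download_page(
#         "https://example.com/articles/",
#         use_browser=True,
#         text_only=True,
#     )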