from __future__ import annotations

import asyncio
from collections.abc import Iterable
from contextlib import suppress
from enum import StrEnum
from http import HTTPStatus
from typing import Any

import backoff
import httpx
import truststore
from html2text import HTML2Text

with suppress(ImportError):
    from playwright.async_api import Browser, async_playwright, expect
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

from .cleanup import cleanup_page

truststore.inject_into_ssl()


MAX_TRIES: int = 3
# In seconds
MAX_TIMEOUT: int = 60
# In seconds
REQUEST_TIMEOUT: int = 10
# In milliseconds
PRE_ACTION_TIMEOUT: int = 10 * 1_000



class PageElementAction(StrEnum):
    CLICK = "click"
    TO_BE_VISIBLE = "to_be_visible"



async def __download_via_request(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    async with httpx.AsyncClient() as client:
        response: httpx.Response = await client.get(
            page_url,
            follow_redirects=True,
            timeout=REQUEST_TIMEOUT,
            headers=(
                {
                    "User-Agent": user_agent,
                } if user_agent
                else {}
            ),
        )

        if response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
            # Returning None also triggers backoff if necessary
            return None

        response.raise_for_status()

        return response.text



async def __download_via_browser(
    page_url: str,
    *,
    user_agent: str | None = None,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
) -> str | None:
    async with async_playwright() as playwright:
        browser: Browser = await playwright.chromium.launch()
        context = await browser.new_context(
            user_agent=user_agent,
        )

        page = await context.new_page()
        response = await page.goto(page_url)
        if not response:
            return None
        if response.status == HTTPStatus.TOO_MANY_REQUESTS:
            # Returning None also triggers backoff if necessary
            return None

        for selector, action in pre_actions or []:
            with suppress(AssertionError, PlaywrightTimeoutError):
                match action:
                    case PageElementAction.CLICK:
                        await page.locator(selector).click(
                            timeout=PRE_ACTION_TIMEOUT,
                            # Allow clicking even when this element is covered
                            # by other elements. Otherwise, extra pre-actions
                            # would be needed before this one to dismiss the
                            # covering elements. However, dismissing those
                            # covering elements may be necessary for the page
                            # to function properly.
                            force=True,
                        )
                    case PageElementAction.TO_BE_VISIBLE:
                        await expect(page.locator(selector)).to_be_visible(
                            timeout=PRE_ACTION_TIMEOUT,
                        )

        html: str = await page.content()

        await browser.close()

        return html



@backoff.on_predicate(
    backoff.expo,
    max_tries=MAX_TRIES,
    max_time=MAX_TIMEOUT,
)
async def download_page_async(
    page_url: str,
    *,
    cleanup: bool = False,
    text_only: bool = False,
    user_agent: str | None = None,
    use_browser: bool = False,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
) -> str | None:
    page_html: str | None
    if use_browser:
        page_html = await __download_via_browser(
            page_url,
            user_agent=user_agent,
            pre_actions=pre_actions,
        )
    else:
        page_html = await __download_via_request(
            page_url,
            user_agent=user_agent,
        )
    if page_html is None:
        return None

    if cleanup:
        page_html = await cleanup_page(page_html)

    if text_only:
        h = HTML2Text()
        h.ignore_images = True
        h.ignore_links = True
        return h.handle(page_html)

    return page_html



def download_page(
    page_url: str,
    **kwargs: Any,
) -> str | None:
    return asyncio.run(download_page_async(
        page_url,
        **kwargs,
    ))
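

# A minimal usage sketch, not part of the original module: it shows both the
# plain-request path and the browser path with pre-actions. The URL and the
# "#cookie-accept" / "#main-content" selectors below are hypothetical
# placeholders; substitute ones that exist on your target page.
if __name__ == "__main__":
    # Plain HTTP fetch via httpx, converted to text
    plain_text: str | None = download_page(
        "https://example.com",
        text_only=True,
    )
    print(plain_text)

    # Browser path: requires playwright to be installed, plus its Chromium
    # binaries (`playwright install chromium`). Clicks a consent button, then
    # waits for the main content to become visible before capturing the HTML.
    rendered_html: str | None = download_page(
        "https://example.com",
        use_browser=True,
        pre_actions=[
            ("#cookie-accept", PageElementAction.CLICK),
            ("#main-content", PageElementAction.TO_BE_VISIBLE),
        ],
    )
    print(rendered_html)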