Coverage for src/extratools_html/__init__.py: 41%

51 statements  

coverage.py v7.8.0, created at 2025-04-28 19:41 -0700
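"""Helpers for downloading a web page as HTML or plain text.

A page can be fetched either with a plain httpx request or, optionally, by
rendering it in a headless Playwright browser; HTTP 429 responses are retried
with exponential backoff.
"""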

from __future__ import annotations

import asyncio
from contextlib import suppress
from http import HTTPStatus
from typing import Any

import backoff
import httpx
import truststore
from html2text import HTML2Text

with suppress(ImportError):
    from playwright.async_api import Browser, async_playwright

from .cleanup import cleanup_page

truststore.inject_into_ssl()

MAX_TRIES: int = 3
MAX_TIMEOUT: int = 60
REQUEST_TIMEOUT: int = 10


async def __download_via_request(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    """Download the page at `page_url` via a plain HTTP request."""

    async with httpx.AsyncClient() as client:
        response: httpx.Response = await client.get(
            page_url,
            follow_redirects=True,
            timeout=REQUEST_TIMEOUT,
            headers=(
                {
                    "User-Agent": user_agent,
                } if user_agent
                else {}
            ),
        )

        if response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
            # Returning `None` also triggers a backoff retry when necessary
            return None

        response.raise_for_status()

        return response.text


async def __download_via_browser(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    """Download the page at `page_url` by rendering it in a headless browser."""

    async with async_playwright() as playwright:
        browser: Browser = await playwright.chromium.launch()
        # Create the page from the context so the custom user agent is applied
        context = await browser.new_context(
            user_agent=user_agent,
        )

        page = await context.new_page()
        response = await page.goto(page_url)
        if not response:
            return None
        if response.status == HTTPStatus.TOO_MANY_REQUESTS:
            # Returning `None` also triggers a backoff retry when necessary
            return None

        html: str = await page.content()

        await browser.close()

        return html


@backoff.on_predicate(
    backoff.expo,
    max_tries=MAX_TRIES,
    max_time=MAX_TIMEOUT,
)
async def download_page_async(
    page_url: str,
    *,
    cleanup: bool = False,
    text_only: bool = False,
    user_agent: str | None = None,
    use_browser: bool = False,
) -> str | None:
    """Download a page, optionally cleaning it up or converting it to text.

    Retried with exponential backoff while the result is falsey
    (e.g. `None` after an HTTP 429 response).
    """

    page_html: str | None = await (
        __download_via_browser if use_browser
        else __download_via_request
    )(page_url, user_agent=user_agent)
    if page_html is None:
        return page_html

    if cleanup:
        page_html = await cleanup_page(page_html)

    if text_only:
        h = HTML2Text()
        h.ignore_images = True
        h.ignore_links = True
        return h.handle(page_html)

    return page_html


def download_page(
    page_url: str,
    **kwargs: Any,
) -> str | None:
    """Synchronous wrapper around `download_page_async`."""

    return asyncio.run(download_page_async(
        page_url,
        **kwargs,
    ))
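
A minimal usage sketch (not part of the module above; it assumes the package is installed as extratools_html and that the target URL is reachable):

from extratools_html import download_page

text = download_page(
    "https://example.com",
    cleanup=True,
    text_only=True,
)
print(text)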