Coverage for src/extratools_html/__init__.py: 39%

49 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-17 05:28 -0700

1from __future__ import annotations 

2 

3import asyncio 

4from contextlib import suppress 

5from http import HTTPStatus 

6from typing import Any 

7 

8import backoff 

9import httpx 

10from html2text import HTML2Text 

11 

12with suppress(ImportError): 

13 from playwright.async_api import Browser, async_playwright 

14 

15from .cleanup import cleanup_page 

16 

# Retry budget handed to the backoff decorator on ``download_page_async``:
# upper bound on attempts (``max_tries``) ...
MAX_TRIES: int = 3
# ... and upper bound on total retry time (``max_time``).
MAX_TIMEOUT: int = 60
# Timeout passed to ``httpx`` for each plain HTTP request.
REQUEST_TIMEOUT: int = 10

20 

21 

async def __download_via_request(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    """Fetch a page with a plain HTTP GET and return its body text.

    Redirects are followed and the request uses ``REQUEST_TIMEOUT``.

    Args:
        page_url: Address of the page to fetch.
        user_agent: Value for the ``User-Agent`` header. When falsy, no
            header is set explicitly.

    Returns:
        The response text, or ``None`` when the server replies with
        429 Too Many Requests.

    Raises:
        Whatever ``httpx.Response.raise_for_status()`` raises for other
        unsuccessful status codes.
    """
    request_headers: dict[str, str] = {}
    if user_agent:
        request_headers["User-Agent"] = user_agent

    async with httpx.AsyncClient() as http_client:
        reply: httpx.Response = await http_client.get(
            page_url,
            headers=request_headers,
            timeout=REQUEST_TIMEOUT,
            follow_redirects=True,
        )

    if reply.status_code != HTTPStatus.TOO_MANY_REQUESTS:
        reply.raise_for_status()
        return reply.text

    # Rate limited: returning ``None`` is what lets the caller's backoff
    # decorator schedule another attempt.
    return None

47 

48 

async def __download_via_browser(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    """Load a page in headless Chromium (Playwright) and return its HTML.

    Args:
        page_url: Address of the page to open.
        user_agent: User agent for the browser context. ``None`` keeps
            the browser default.

    Returns:
        The page content as HTML, or ``None`` when navigation yields no
        response or the server replies with 429 Too Many Requests.
    """
    async with async_playwright() as playwright:
        browser: Browser = await playwright.chromium.launch()
        try:
            # The page must be opened from the configured context.
            # ``browser.new_page()`` creates its own fresh context, which
            # would silently drop the requested user agent.
            context = await browser.new_context(
                user_agent=user_agent,
            )
            page = await context.new_page()

            response = await page.goto(page_url)
            if not response:
                return None
            if response.status == HTTPStatus.TOO_MANY_REQUESTS:
                # It also triggers backoff if necessary
                return None

            return await page.content()
        finally:
            # Close the browser on every exit path, including the early
            # ``None`` returns and any navigation error.
            await browser.close()

73 

74 

@backoff.on_predicate(
    backoff.expo,
    max_tries=MAX_TRIES,
    max_time=MAX_TIMEOUT,
)
async def download_page_async(
    page_url: str,
    *,
    cleanup: bool = False,
    text_only: bool = False,
    user_agent: str | None = None,
    use_browser: bool = False,
) -> str | None:
    """Download a page and optionally clean it up or convert it to text.

    The call is wrapped in ``backoff.on_predicate`` with exponential
    waits, bounded by ``MAX_TRIES`` and ``MAX_TIMEOUT``. A ``None`` result
    from the downloader is the signal used to trigger a retry.

    Args:
        page_url: Address of the page to download.
        cleanup: When true, pass the HTML through ``cleanup_page``.
        text_only: When true, convert the HTML with ``HTML2Text``
            (links and images ignored) and return that text.
        user_agent: Optional user agent forwarded to the downloader.
        use_browser: When true, download through the browser-based
            helper instead of a plain HTTP request.

    Returns:
        The (optionally processed) page content, or ``None`` when the
        downloader returned ``None``.
    """
    if use_browser:
        fetch = __download_via_browser
    else:
        fetch = __download_via_request

    content: str | None = await fetch(page_url, user_agent=user_agent)
    if content is None:
        return None

    if cleanup:
        content = await cleanup_page(content)

    if not text_only:
        return content

    converter = HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    return converter.handle(content)

105 

106 

def download_page(
    image_url: str,
    **kwargs: Any,
) -> str | None:
    """Blocking wrapper around ``download_page_async``.

    Args:
        image_url: Despite its name, this is forwarded as the page URL
            (first positional argument) to ``download_page_async``.
        **kwargs: Keyword options forwarded unchanged to
            ``download_page_async``.

    Returns:
        Whatever ``download_page_async`` returns.
    """
    pending = download_page_async(image_url, **kwargs)
    # ``asyncio.run`` drives the coroutine to completion on a new loop.
    return asyncio.run(pending)