seleniumuser.seleniumuser
1import atexit 2import os 3import random 4import sys 5import time 6from pathlib import Path 7from types import LambdaType 8from typing import Any 9from warnings import warn 10 11from bs4 import BeautifulSoup 12from selenium import webdriver 13from selenium.webdriver.chrome.options import Options as ChromeOptions 14from selenium.webdriver.chrome.service import Service as ChromeService 15from selenium.webdriver.common.by import By 16from selenium.webdriver.common.keys import Keys 17from selenium.webdriver.firefox.firefox_profile import FirefoxProfile 18from selenium.webdriver.firefox.options import Options as FirefoxOptions 19from selenium.webdriver.firefox.service import Service as FirefoxService 20from selenium.webdriver.remote.webelement import WebElement 21from selenium.webdriver.support.ui import Select 22 23from noiftimer import Timer 24from voxscribe import get_text_from_url 25from whosyouragent import get_agent 26 27 28class User: 29 """Sits on top of selenium to streamline 30 automation and scraping tasks.""" 31 32 def __init__( 33 self, 34 headless: bool = False, 35 browser_type: str = "firefox", 36 implicit_wait: int = 10, 37 page_load_timeout: int = 60, 38 open_browser: bool = True, 39 locator_method: str = "xpath", 40 randomize_user_agent: bool = True, 41 user_agent_rotation_period: int = None, 42 move_window_by: tuple[int, int] = (0, -1000), 43 download_dir: str | Path = None, 44 driver_path: str | Path = None, 45 ): 46 """ 47 :param headless: If True, browser window will not be visible. 48 49 :param browser_type: Which browser to use. Can be 'firefox' or 'chrome'. 50 51 :param implicit_wait: Number of seconds to look for a specified element before 52 selenium considers it missing and throws an exception. 53 54 :param page_load_timeout: Time in seconds for selenium to wait for a page to load 55 before throwing an exception. 56 57 :param open_browser: If True, opens a browser window when a User object is created. 
58 If False, a manual call to self.open_browser() must be made. 59 60 :param locator_method: The locator type User should expect to be given. 61 Can be 'xpath', 'id', 'className', 'name', or 'cssSelector'. 62 Every member function with a 'locator' argument refers to a string matching 63 the current locator_method. 64 65 :param randomize_user_agent: If True, a random useragent will be used whenever 66 the browser is opened. If False, the native useragent will be used. 67 68 :param user_agent_rotation_period: If not None, the browser window will be closed 69 and reopened with a new useragent every user_agent_rotation_period number of minutes. 70 Rotation occurs on the first call to self.get() after the time period has elapsed. 71 Ignored if randomize_user_agent is False. 72 73 :param move_window_by: The x and y amount of pixels to move the browser window by after opening. 74 75 :param download_dir: The download folder to use. If None, the default folder will be used. 76 77 :param driver_path: The path to the webdriver executable selenium should use. 78 If None, the system PATH will be checked for the executable. 79 If the executable isn't found, the parent directories and the immediate child directories 80 of the current working directory will be searched. 
81 """ 82 self.headless = headless 83 browser_type = browser_type.lower() 84 if browser_type in ["firefox", "chrome"]: 85 self.browser_type = browser_type 86 else: 87 raise ValueError("'browser_type' parameter must be 'firefox' or 'chrome'") 88 self.browser_open = False 89 self.implicit_wait = implicit_wait 90 self.page_load_timeout = page_load_timeout 91 self.rotation_timer = Timer() 92 self.timer = Timer() 93 self.timer.start() 94 self.randomize_user_agent = randomize_user_agent 95 self.user_agent_rotation_period = user_agent_rotation_period 96 self.locator_method = locator_method 97 self.turbo() 98 self.keys = Keys 99 self.move_window_by = move_window_by 100 self.download_dir = download_dir 101 self.driver_path = driver_path 102 if not self.driver_path: 103 self.search_for_driver() 104 if open_browser: 105 self.open_browser() 106 else: 107 self.browser = None 108 atexit.register(self.close_browser) 109 110 def __enter__(self): 111 return self 112 113 def __exit__(self, *args): 114 self.close_browser() 115 116 def configure_firefox(self) -> FirefoxService: 117 """Configure options and profile for firefox.""" 118 self.options = FirefoxOptions() 119 self.options.headless = self.headless 120 self.options.set_preference( 121 "widget.windows.window_occlusion_tracking.enabled", False 122 ) 123 self.options.set_preference("dom.webaudio.enabled", False) 124 if self.randomize_user_agent: 125 self.options.set_preference("general.useragent.override", get_agent()) 126 if self.download_dir: 127 Path(self.download_dir).mkdir(parents=True, exist_ok=True) 128 self.profile = FirefoxProfile() 129 self.profile.set_preference("browser.download.dir", str(self.download_dir)) 130 self.profile.set_preference("browser.download.folderList", 2) 131 else: 132 self.profile = None 133 self.service = FirefoxService( 134 executable_path=str(self.driver_path), log_path=os.devnull 135 ) 136 137 def configure_chrome(self) -> ChromeService: 138 """Configure options and profile for chrome.""" 139 
self.options = ChromeOptions() 140 self.options.headless = self.headless 141 self.options.add_argument("--disable-blink-features=AutomationControlled") 142 self.options.add_argument("--mute-audio") 143 self.options.add_argument("--disable-infobars") 144 self.options.add_argument("--disable-notifications") 145 self.options.add_argument("--log-level=3") 146 if self.randomize_user_agent: 147 self.options.add_argument(f"--user-agent={get_agent()}") 148 self.options.add_experimental_option("useAutomationExtension", False) 149 if self.download_dir: 150 Path(self.download_dir).mkdir(parents=True, exist_ok=True) 151 self.options.add_experimental_option( 152 "prefs", {"download.default_directory": str(self.download_dir)} 153 ) 154 self.service = ChromeService( 155 executable_path=str(self.driver_path), log_path=os.devnull 156 ) 157 158 def search_for_driver(self): 159 """Searches for the webdriver executable.""" 160 cwd = Path.cwd() 161 found = False 162 match self.browser_type: 163 case "firefox": 164 driver = "geckodriver.exe" 165 case "chrome": 166 driver = "chromedriver.exe" 167 # search PATH 168 env_path = os.environ["PATH"] 169 if sys.platform == "win32": 170 env_paths = env_path.split(";") 171 else: 172 env_paths = env_path.split(":") 173 driver = driver[: driver.find(".")] 174 for path in env_paths: 175 if (Path(path) / driver).exists(): 176 self.driver_path = Path(path) / driver 177 found = True 178 break 179 # check current working directory and parent folders 180 if not found: 181 while cwd != cwd.parent: 182 if (cwd / driver).exists(): 183 self.driver_path = cwd / driver 184 found = True 185 break 186 cwd = cwd.parent 187 # check top most level 188 if not found and (cwd / driver).exists(): 189 self.driver_path = cwd / driver 190 found = True 191 # check child folders (only 1 level down) 192 if not found: 193 for child in Path.cwd().iterdir(): 194 if child.is_dir() and (child / driver).exists(): 195 self.driver_path = child / driver 196 found = True 197 if not 
found: 198 warn(f"Could not find {driver}") 199 200 def set_implicit_wait(self, wait_time: int = None): 201 """Sets to default time if no arg given.""" 202 if not wait_time: 203 self.browser.implicitly_wait(self.implicit_wait) 204 else: 205 self.browser.implicitly_wait(wait_time) 206 207 def open_browser(self): 208 """Configures and opens selenium browser.""" 209 if not self.browser_open: 210 match self.browser_type: 211 case "firefox": 212 self.configure_firefox() 213 self.browser = webdriver.Firefox( 214 options=self.options, 215 service=self.service, 216 firefox_profile=self.profile, 217 ) 218 case "chrome": 219 self.configure_chrome() 220 self.browser = webdriver.Chrome( 221 options=self.options, service=self.service 222 ) 223 self.set_implicit_wait() 224 self.browser.maximize_window() 225 self.browser.set_window_position( 226 self.move_window_by[0], self.move_window_by[1] 227 ) 228 self.browser.maximize_window() 229 self.browser.set_page_load_timeout(self.page_load_timeout) 230 self.browser_open = True 231 self.tab_index = 0 232 self.rotation_timer.start() 233 else: 234 warn("Browser already open.") 235 236 def close_browser(self): 237 """Close browser window.""" 238 if self.browser_open: 239 self.browser_open = False 240 self.browser.quit() 241 242 def open_tab(self, url: str = "", switch_to_tab: bool = True): 243 """Opens new tab and, if provided, goes to url. 
244 245 New tab is inserted after currently active tab.""" 246 self.script("window.open(arguments[0]);", url) 247 if switch_to_tab: 248 self.switch_to_tab(self.tab_index + 1) 249 250 def switch_to_tab(self, tab_index: int): 251 """Switch to a tab in browser, zero indexed.""" 252 self.browser.switch_to.window(self.browser.window_handles[tab_index]) 253 self.tab_index = tab_index 254 255 def get_num_tabs(self) -> int: 256 """Returns number of tabs open.""" 257 return len(self.browser.window_handles) 258 259 def close_tab(self, tab_index: int = 1): 260 """Close specified tab and 261 switches to tab index 0.""" 262 self.switch_to_tab(tab_index) 263 self.browser.close() 264 self.switch_to_tab(0) 265 266 def get(self, url: str): 267 """Requests webpage at given url and rotates userAgent if necessary.""" 268 if not self.browser_open: 269 self.open_browser() 270 if ( 271 self.randomize_user_agent 272 and self.user_agent_rotation_period is not None 273 and self.rotation_timer.check(format=False) 274 > (60 * self.user_agent_rotation_period) 275 ): 276 self.rotation_timer.stop() 277 self.close_browser() 278 self.open_browser() 279 self.browser.get(url) 280 self.script("Object.defineProperty(navigator, 'webdriver', {get: () => false})") 281 self.chill(self.arrival_wait) 282 283 def get_soup(self) -> BeautifulSoup: 284 """Returns a BeautifulSoup object 285 of the current page source.""" 286 return BeautifulSoup(self.browser.page_source, "html.parser") 287 288 def current_url(self) -> str: 289 """Returns current url of active tab.""" 290 return self.browser.current_url 291 292 def delete_cookies(self): 293 """Delete all cookies for 294 this browser instance.""" 295 self.browser.delete_all_cookies() 296 297 def turbo(self, engage: bool = True): 298 """When engaged, strings will be sent 299 to elements all at once and there will be 300 no waiting after actions. 
301 302 When disengaged, strings will be sent to elements 303 'one key at a time' with randomized amounts of 304 time between successive keys and after actions.""" 305 if engage: 306 self.after_key_wait = (0, 0) 307 self.after_field_wait = (0, 0) 308 self.after_click_wait = (0, 0) 309 self.arrival_wait = (1, 1) 310 self.one_key_at_a_time = False 311 self.turbo_engaged = True 312 else: 313 self.after_key_wait = (0.1, 0.5) 314 self.after_field_wait = (1, 2) 315 self.after_click_wait = (0.25, 1.5) 316 self.arrival_wait = (4, 10) 317 self.one_key_at_a_time = True 318 self.turbo_engaged = False 319 320 def chill(self, min_max: tuple[float, float]): 321 """Sleeps a random amount 322 between min_max[0] and min_max[1].""" 323 time.sleep(random.uniform(min_max[0], min_max[1])) 324 325 def script(self, script: str, args: Any = None) -> Any: 326 """Execute javascript code and returns result.""" 327 return self.browser.execute_script(script, args) 328 329 def remove(self, locator: str): 330 """Removes element from DOM.""" 331 self.script("arguments[0].remove();", self.find(locator)) 332 333 def get_length(self, locator: str) -> int: 334 """Returns number of child elements for a given element.""" 335 return int(self.script("return arguments[0].length;", self.find(locator))) 336 337 def find(self, locator: str) -> WebElement: 338 """Finds and returns a WebElement.""" 339 match self.locator_method: 340 case "xpath": 341 return self.browser.find_element(By.XPATH, locator) 342 case "id": 343 return self.browser.find_element(By.ID, locator) 344 case "className": 345 return self.browser.find_element(By.CLASS_NAME, locator) 346 case "name": 347 return self.browser.find_element(By.NAME, locator) 348 case "cssSelector": 349 return self.browser.find_element(By.CSS_SELECTOR, locator) 350 351 def find_children(self, locator: str) -> list[WebElement]: 352 """Returns a list of child WebElements 353 for given locator arg.""" 354 element = self.find(locator) 355 return 
element.find_elements("xpath", "./*") 356 357 def scroll(self, amount: int = None, fraction: float = None): 358 """Scroll web page. 359 :param amount: The number of lines to scroll if not None. 360 361 :param fraction: The amount between 0.0 and 1.0 362 of the page height to scroll. 363 364 If values are provided for both arguments, 365 amount will be used. 366 367 If values are provided for neither argument, 368 the entire page length will be scrolled. 369 370 Scrolls one line at a time if self.turbo is False.""" 371 if amount: 372 amount_to_scroll = amount 373 elif fraction: 374 amount_to_scroll = int( 375 fraction 376 * ( 377 int(self.script("return document.body.scrollHeight;")) 378 - int(self.script("return window.pageYOffset;")) 379 ) 380 ) 381 else: 382 amount_to_scroll = int(self.script("return document.body.scrollHeight;")) 383 if self.turbo_engaged: 384 self.script("window.scrollBy(0,arguments[0]);", amount_to_scroll) 385 else: 386 for _ in range(abs(amount_to_scroll)): 387 if amount_to_scroll >= 0: 388 self.script("window.scrollBy(0,1);") 389 else: 390 self.script("window.scrollBy(0,-1);") 391 self.chill(self.after_click_wait) 392 393 def scroll_into_view(self, locator: str) -> WebElement: 394 """Scrolls to a given element and returns the element.""" 395 element = self.find(locator) 396 self.script("arguments[0].scroll_into_view();", element) 397 self.chill(self.after_click_wait) 398 return element 399 400 def text(self, locator: str) -> str: 401 """Returns text of WebElement.""" 402 return self.find(locator).text 403 404 def click(self, locator: str) -> WebElement: 405 """Clicks on and returns WebElement.""" 406 element = self.find(locator) 407 element.click() 408 self.chill(self.after_click_wait) 409 return element 410 411 def clear(self, locator: str) -> WebElement: 412 """Clears content of WebElement if able 413 and then returns WebElement.""" 414 element = self.find(locator) 415 element.clear() 416 self.chill(self.after_click_wait) 417 return 
element 418 419 def switch_to_iframe(self, locator: str): 420 """Switch to an iframe from given locator.""" 421 self.browser.switch_to.frame(self.find(locator)) 422 423 def switch_to_parent_frame(self): 424 """Move up a frame level from current frame.""" 425 self.browser.switch_to.parent_frame() 426 427 def select( 428 self, locator: str, method: str, choice: str | int | tuple 429 ) -> WebElement: 430 """Select a choice from Select element. 431 Returns the Select element from the locator string, 432 not the option element that is selected. 433 434 :param method: Can be 'value' or 'index' 435 436 :param choice: The option to select. 437 438 If method is 'value', then choice should be 439 the html 'value' attribute of the desired option. 440 441 If method is 'index', choice can either be a single 442 int for the desired option or it can be a two-tuple. 443 If the tuple is provided, a random option between the 444 two indicies (inclusive) will be selected.""" 445 element = self.click(locator) 446 match method: 447 case "value": 448 Select(element).select_by_value(choice) 449 case "index": 450 if type(choice) == tuple: 451 choice = random.randint(choice[0], choice[1]) 452 Select(element).select_by_index(choice) 453 self.chill(self.after_field_wait) 454 return element 455 456 def click_elements( 457 self, locators: list[str], max_selections: int = None, min_selections: int = 1 458 ) -> WebElement: 459 """Click a random number of WebElements 460 and return the last WebElement clicked. 461 462 :param locators: A list of element locators to choose from. 463 464 :param max_selections: The maximum number of elements to click. 465 If None, the maximum will be the length of the locators list. 466 467 :param min_selections: The minimum number of elements to click. 468 469 e.g. self.click_elements([xpath1, xpath2, xpath3, xpath4], max_selections=3) 470 will click between 1 and 3 random elements from the list. 
471 """ 472 if not max_selections: 473 max_selections = len(locators) 474 for option in random.sample( 475 locators, k=random.randint(min_selections, max_selections) 476 ): 477 element = self.click(option) 478 return element 479 480 def get_click_list( 481 self, num_options: int, max_choices: int = 1, min_choices: int = 1 482 ) -> list[str]: 483 """Similar to self.click_elements(), but for use with the self.fill_next() method. 484 485 Creates a list of length 'num_options' where every element is 'skip'. 486 487 A random number of elements in the list between 'min_choices' and 'max_choices' are 488 replaced with 'keys.SPACE' (interpreted as a click by almost all web forms).""" 489 click_list = ["skip"] * num_options 490 selected_indexes = [] 491 for i in range(random.randint(min_choices, max_choices)): 492 index = random.randint(0, num_options - 1) 493 while index in selected_indexes: 494 index = random.randint(0, num_options - 1) 495 selected_indexes.append(index) 496 click_list[index] = self.keys.SPACE 497 return click_list 498 499 def send_keys( 500 self, 501 locator: str, 502 data: str, 503 click_first: bool = True, 504 clear_first: bool = False, 505 ) -> WebElement: 506 """Types data into element and returns the element. 507 508 :param data: The string to send to the element. 509 510 :param click_first: If True, the element is clicked on 511 before the data is sent. 
512 513 :param clear_first: If True, the current text of the element 514 is cleared before the data is sent.""" 515 element = self.click(locator) if click_first else self.find(locator) 516 if clear_first: 517 element.clear() 518 self.chill(self.after_click_wait) 519 if self.one_key_at_a_time: 520 for ch in str(data): 521 element.send_keys(ch) 522 self.chill(self.after_key_wait) 523 else: 524 element.send_keys(str(data)) 525 self.chill(self.after_field_wait) 526 return element 527 528 def fill_next( 529 self, data: list[str | tuple], start_element: WebElement = None 530 ) -> WebElement: 531 """Fills a form by tabbing from the current WebElement 532 to the next one and using the corresponding item in data. 533 Returns the last WebElement. 534 535 :param data: A list of form data. If an item is a string (except for 'skip') 536 it will be typed into the current WebElement. 537 538 An item in data can be a two-tuple of the form 539 ('downArrow', numberOfPresses:int|tuple[int, int]). 540 541 If numberOfPresses is a single int, Keys.ARROW_DOWN will be sent 542 that many times to the WebElement. 543 544 If numberOfPresses is a tuple, Keys.ARROW_DOWN will be sent a random 545 number of times between numberOfPresses[0] and numberOfPresses[1] inclusive. 546 This is typically for use with Select elements. 547 548 An item in data can also be 'skip', which will perform no action on the current 549 WebElement and will continue to the next one. 550 551 :param start_element: The WebElement to start tabbing from. 552 The currently active element will be used if start_element is None. 553 554 Note: The function tabs to the next element before sending data, 555 so the start_element should the WebElement before the one 556 that should receive data[0]. 
557 """ 558 element = ( 559 self.browser.switch_to.active_element 560 if not start_element 561 else start_element 562 ) 563 for datum in data: 564 element.send_keys(Keys.TAB) 565 element = self.browser.switch_to.active_element 566 self.chill(self.after_key_wait) 567 if datum[0] == "downArrow": 568 if type(datum[1]) == tuple: 569 times = random.randint(datum[1][0], datum[1][1]) 570 else: 571 times = datum[1] 572 for _ in range(times): 573 element.send_keys(Keys.ARROW_DOWN) 574 self.chill(self.after_key_wait) 575 elif datum == "skip": 576 self.chill(self.after_key_wait) 577 else: 578 if self.turbo_engaged: 579 element.send_keys(str(datum)) 580 else: 581 for ch in str(datum): 582 element.send_keys(ch) 583 self.chill(self.after_key_wait) 584 self.chill(self.after_field_wait) 585 return element 586 587 def wait_until( 588 self, condition: LambdaType, max_wait: float = 10, polling_interval: float = 0.1 589 ): 590 """Checks condition repeatedly until either it is true, 591 or the max_wait is exceeded. 592 593 Raises a TimeoutError if the condition doesn't success within max_wait. 594 595 Useful for determing whether a form has been successfully submitted. 596 597 :param condition: The condition function to check. 598 599 :param max_wait: Number of seconds to continue checking condition 600 before throwing a TimeoutError. 601 602 :param polling_interval: The number of seconds to sleep before 603 checking the condition function again after it fails. 604 605 e.g. 
self.wait_until(lambda: 'Successfully Submitted' in self.text('//p[@id="form-output"]))""" 606 start_time = time.time() 607 while True: 608 try: 609 if condition(): 610 time.sleep(1) 611 break 612 elif (time.time() - start_time) > max_wait: 613 raise TimeoutError(f"max_wait exceeded in wait_until({condition})") 614 else: 615 time.sleep(polling_interval) 616 except: 617 if (time.time() - start_time) > max_wait: 618 raise TimeoutError(f"max_wait exceeded in wait_until({condition})") 619 else: 620 time.sleep(polling_interval) 621 622 def dismiss_alert(self): 623 """Dismiss alert dialog.""" 624 self.browser.switch_to.alert.dismiss() 625 626 def solve_recaptcha_v3( 627 self, 628 outer_iframe_xpath: str = '//iframe[@title="reCAPTCHA"]', 629 inner_iframe_xpath: str = '//iframe[@title="recaptcha challenge expires in two minutes"]', 630 ): 631 """Pass google recaptcha v3 by solving an audio puzzle. 632 633 :param outer_iframe_xpath: Xpath to the iframe containing the recaptcha checkbox. 634 If it's the recaptcha without the initial checkbox that just shows the image puzzle, 635 pass None to this argument. 636 637 """ 638 locator_method = self.locator_method 639 self.locator_method = "xpath" 640 try: 641 if outer_iframe_xpath: 642 self.switch_to_iframe(outer_iframe_xpath) 643 self.click('//*[@id="recaptcha-anchor"]') 644 self.switch_to_parent_frame() 645 self.switch_to_iframe(inner_iframe_xpath) 646 self.click('//*[@id="recaptcha-audio-button"]') 647 mp3_url = self.find( 648 '//a[@class="rc-audiochallenge-tdownload-link"]' 649 ).get_attribute("href") 650 text = get_text_from_url(mp3_url, ".mp3") 651 self.send_keys('//*[@id="audio-response"]', text) 652 self.click('//*[@id="recaptcha-verify-button"]') 653 except Exception as e: 654 print(e) 655 raise Exception("Could not solve captcha") 656 finally: 657 self.switch_to_parent_frame() 658 self.locator_method = locator_method
29class User: 30 """Sits on top of selenium to streamline 31 automation and scraping tasks.""" 32 33 def __init__( 34 self, 35 headless: bool = False, 36 browser_type: str = "firefox", 37 implicit_wait: int = 10, 38 page_load_timeout: int = 60, 39 open_browser: bool = True, 40 locator_method: str = "xpath", 41 randomize_user_agent: bool = True, 42 user_agent_rotation_period: int = None, 43 move_window_by: tuple[int, int] = (0, -1000), 44 download_dir: str | Path = None, 45 driver_path: str | Path = None, 46 ): 47 """ 48 :param headless: If True, browser window will not be visible. 49 50 :param browser_type: Which browser to use. Can be 'firefox' or 'chrome'. 51 52 :param implicit_wait: Number of seconds to look for a specified element before 53 selenium considers it missing and throws an exception. 54 55 :param page_load_timeout: Time in seconds for selenium to wait for a page to load 56 before throwing an exception. 57 58 :param open_browser: If True, opens a browser window when a User object is created. 59 If False, a manual call to self.open_browser() must be made. 60 61 :param locator_method: The locator type User should expect to be given. 62 Can be 'xpath', 'id', 'className', 'name', or 'cssSelector'. 63 Every member function with a 'locator' argument refers to a string matching 64 the current locator_method. 65 66 :param randomize_user_agent: If True, a random useragent will be used whenever 67 the browser is opened. If False, the native useragent will be used. 68 69 :param user_agent_rotation_period: If not None, the browser window will be closed 70 and reopened with a new useragent every user_agent_rotation_period number of minutes. 71 Rotation occurs on the first call to self.get() after the time period has elapsed. 72 Ignored if randomize_user_agent is False. 73 74 :param move_window_by: The x and y amount of pixels to move the browser window by after opening. 75 76 :param download_dir: The download folder to use. 
If None, the default folder will be used. 77 78 :param driver_path: The path to the webdriver executable selenium should use. 79 If None, the system PATH will be checked for the executable. 80 If the executable isn't found, the parent directories and the immediate child directories 81 of the current working directory will be searched. 82 """ 83 self.headless = headless 84 browser_type = browser_type.lower() 85 if browser_type in ["firefox", "chrome"]: 86 self.browser_type = browser_type 87 else: 88 raise ValueError("'browser_type' parameter must be 'firefox' or 'chrome'") 89 self.browser_open = False 90 self.implicit_wait = implicit_wait 91 self.page_load_timeout = page_load_timeout 92 self.rotation_timer = Timer() 93 self.timer = Timer() 94 self.timer.start() 95 self.randomize_user_agent = randomize_user_agent 96 self.user_agent_rotation_period = user_agent_rotation_period 97 self.locator_method = locator_method 98 self.turbo() 99 self.keys = Keys 100 self.move_window_by = move_window_by 101 self.download_dir = download_dir 102 self.driver_path = driver_path 103 if not self.driver_path: 104 self.search_for_driver() 105 if open_browser: 106 self.open_browser() 107 else: 108 self.browser = None 109 atexit.register(self.close_browser) 110 111 def __enter__(self): 112 return self 113 114 def __exit__(self, *args): 115 self.close_browser() 116 117 def configure_firefox(self) -> FirefoxService: 118 """Configure options and profile for firefox.""" 119 self.options = FirefoxOptions() 120 self.options.headless = self.headless 121 self.options.set_preference( 122 "widget.windows.window_occlusion_tracking.enabled", False 123 ) 124 self.options.set_preference("dom.webaudio.enabled", False) 125 if self.randomize_user_agent: 126 self.options.set_preference("general.useragent.override", get_agent()) 127 if self.download_dir: 128 Path(self.download_dir).mkdir(parents=True, exist_ok=True) 129 self.profile = FirefoxProfile() 130 self.profile.set_preference("browser.download.dir", 
str(self.download_dir)) 131 self.profile.set_preference("browser.download.folderList", 2) 132 else: 133 self.profile = None 134 self.service = FirefoxService( 135 executable_path=str(self.driver_path), log_path=os.devnull 136 ) 137 138 def configure_chrome(self) -> ChromeService: 139 """Configure options and profile for chrome.""" 140 self.options = ChromeOptions() 141 self.options.headless = self.headless 142 self.options.add_argument("--disable-blink-features=AutomationControlled") 143 self.options.add_argument("--mute-audio") 144 self.options.add_argument("--disable-infobars") 145 self.options.add_argument("--disable-notifications") 146 self.options.add_argument("--log-level=3") 147 if self.randomize_user_agent: 148 self.options.add_argument(f"--user-agent={get_agent()}") 149 self.options.add_experimental_option("useAutomationExtension", False) 150 if self.download_dir: 151 Path(self.download_dir).mkdir(parents=True, exist_ok=True) 152 self.options.add_experimental_option( 153 "prefs", {"download.default_directory": str(self.download_dir)} 154 ) 155 self.service = ChromeService( 156 executable_path=str(self.driver_path), log_path=os.devnull 157 ) 158 159 def search_for_driver(self): 160 """Searches for the webdriver executable.""" 161 cwd = Path.cwd() 162 found = False 163 match self.browser_type: 164 case "firefox": 165 driver = "geckodriver.exe" 166 case "chrome": 167 driver = "chromedriver.exe" 168 # search PATH 169 env_path = os.environ["PATH"] 170 if sys.platform == "win32": 171 env_paths = env_path.split(";") 172 else: 173 env_paths = env_path.split(":") 174 driver = driver[: driver.find(".")] 175 for path in env_paths: 176 if (Path(path) / driver).exists(): 177 self.driver_path = Path(path) / driver 178 found = True 179 break 180 # check current working directory and parent folders 181 if not found: 182 while cwd != cwd.parent: 183 if (cwd / driver).exists(): 184 self.driver_path = cwd / driver 185 found = True 186 break 187 cwd = cwd.parent 188 # check 
top most level 189 if not found and (cwd / driver).exists(): 190 self.driver_path = cwd / driver 191 found = True 192 # check child folders (only 1 level down) 193 if not found: 194 for child in Path.cwd().iterdir(): 195 if child.is_dir() and (child / driver).exists(): 196 self.driver_path = child / driver 197 found = True 198 if not found: 199 warn(f"Could not find {driver}") 200 201 def set_implicit_wait(self, wait_time: int = None): 202 """Sets to default time if no arg given.""" 203 if not wait_time: 204 self.browser.implicitly_wait(self.implicit_wait) 205 else: 206 self.browser.implicitly_wait(wait_time) 207 208 def open_browser(self): 209 """Configures and opens selenium browser.""" 210 if not self.browser_open: 211 match self.browser_type: 212 case "firefox": 213 self.configure_firefox() 214 self.browser = webdriver.Firefox( 215 options=self.options, 216 service=self.service, 217 firefox_profile=self.profile, 218 ) 219 case "chrome": 220 self.configure_chrome() 221 self.browser = webdriver.Chrome( 222 options=self.options, service=self.service 223 ) 224 self.set_implicit_wait() 225 self.browser.maximize_window() 226 self.browser.set_window_position( 227 self.move_window_by[0], self.move_window_by[1] 228 ) 229 self.browser.maximize_window() 230 self.browser.set_page_load_timeout(self.page_load_timeout) 231 self.browser_open = True 232 self.tab_index = 0 233 self.rotation_timer.start() 234 else: 235 warn("Browser already open.") 236 237 def close_browser(self): 238 """Close browser window.""" 239 if self.browser_open: 240 self.browser_open = False 241 self.browser.quit() 242 243 def open_tab(self, url: str = "", switch_to_tab: bool = True): 244 """Opens new tab and, if provided, goes to url. 
245 246 New tab is inserted after currently active tab.""" 247 self.script("window.open(arguments[0]);", url) 248 if switch_to_tab: 249 self.switch_to_tab(self.tab_index + 1) 250 251 def switch_to_tab(self, tab_index: int): 252 """Switch to a tab in browser, zero indexed.""" 253 self.browser.switch_to.window(self.browser.window_handles[tab_index]) 254 self.tab_index = tab_index 255 256 def get_num_tabs(self) -> int: 257 """Returns number of tabs open.""" 258 return len(self.browser.window_handles) 259 260 def close_tab(self, tab_index: int = 1): 261 """Close specified tab and 262 switches to tab index 0.""" 263 self.switch_to_tab(tab_index) 264 self.browser.close() 265 self.switch_to_tab(0) 266 267 def get(self, url: str): 268 """Requests webpage at given url and rotates userAgent if necessary.""" 269 if not self.browser_open: 270 self.open_browser() 271 if ( 272 self.randomize_user_agent 273 and self.user_agent_rotation_period is not None 274 and self.rotation_timer.check(format=False) 275 > (60 * self.user_agent_rotation_period) 276 ): 277 self.rotation_timer.stop() 278 self.close_browser() 279 self.open_browser() 280 self.browser.get(url) 281 self.script("Object.defineProperty(navigator, 'webdriver', {get: () => false})") 282 self.chill(self.arrival_wait) 283 284 def get_soup(self) -> BeautifulSoup: 285 """Returns a BeautifulSoup object 286 of the current page source.""" 287 return BeautifulSoup(self.browser.page_source, "html.parser") 288 289 def current_url(self) -> str: 290 """Returns current url of active tab.""" 291 return self.browser.current_url 292 293 def delete_cookies(self): 294 """Delete all cookies for 295 this browser instance.""" 296 self.browser.delete_all_cookies() 297 298 def turbo(self, engage: bool = True): 299 """When engaged, strings will be sent 300 to elements all at once and there will be 301 no waiting after actions. 
302 303 When disengaged, strings will be sent to elements 304 'one key at a time' with randomized amounts of 305 time between successive keys and after actions.""" 306 if engage: 307 self.after_key_wait = (0, 0) 308 self.after_field_wait = (0, 0) 309 self.after_click_wait = (0, 0) 310 self.arrival_wait = (1, 1) 311 self.one_key_at_a_time = False 312 self.turbo_engaged = True 313 else: 314 self.after_key_wait = (0.1, 0.5) 315 self.after_field_wait = (1, 2) 316 self.after_click_wait = (0.25, 1.5) 317 self.arrival_wait = (4, 10) 318 self.one_key_at_a_time = True 319 self.turbo_engaged = False 320 321 def chill(self, min_max: tuple[float, float]): 322 """Sleeps a random amount 323 between min_max[0] and min_max[1].""" 324 time.sleep(random.uniform(min_max[0], min_max[1])) 325 326 def script(self, script: str, args: Any = None) -> Any: 327 """Execute javascript code and returns result.""" 328 return self.browser.execute_script(script, args) 329 330 def remove(self, locator: str): 331 """Removes element from DOM.""" 332 self.script("arguments[0].remove();", self.find(locator)) 333 334 def get_length(self, locator: str) -> int: 335 """Returns number of child elements for a given element.""" 336 return int(self.script("return arguments[0].length;", self.find(locator))) 337 338 def find(self, locator: str) -> WebElement: 339 """Finds and returns a WebElement.""" 340 match self.locator_method: 341 case "xpath": 342 return self.browser.find_element(By.XPATH, locator) 343 case "id": 344 return self.browser.find_element(By.ID, locator) 345 case "className": 346 return self.browser.find_element(By.CLASS_NAME, locator) 347 case "name": 348 return self.browser.find_element(By.NAME, locator) 349 case "cssSelector": 350 return self.browser.find_element(By.CSS_SELECTOR, locator) 351 352 def find_children(self, locator: str) -> list[WebElement]: 353 """Returns a list of child WebElements 354 for given locator arg.""" 355 element = self.find(locator) 356 return 
element.find_elements("xpath", "./*") 357 358 def scroll(self, amount: int = None, fraction: float = None): 359 """Scroll web page. 360 :param amount: The number of lines to scroll if not None. 361 362 :param fraction: The amount between 0.0 and 1.0 363 of the page height to scroll. 364 365 If values are provided for both arguments, 366 amount will be used. 367 368 If values are provided for neither argument, 369 the entire page length will be scrolled. 370 371 Scrolls one line at a time if self.turbo is False.""" 372 if amount: 373 amount_to_scroll = amount 374 elif fraction: 375 amount_to_scroll = int( 376 fraction 377 * ( 378 int(self.script("return document.body.scrollHeight;")) 379 - int(self.script("return window.pageYOffset;")) 380 ) 381 ) 382 else: 383 amount_to_scroll = int(self.script("return document.body.scrollHeight;")) 384 if self.turbo_engaged: 385 self.script("window.scrollBy(0,arguments[0]);", amount_to_scroll) 386 else: 387 for _ in range(abs(amount_to_scroll)): 388 if amount_to_scroll >= 0: 389 self.script("window.scrollBy(0,1);") 390 else: 391 self.script("window.scrollBy(0,-1);") 392 self.chill(self.after_click_wait) 393 394 def scroll_into_view(self, locator: str) -> WebElement: 395 """Scrolls to a given element and returns the element.""" 396 element = self.find(locator) 397 self.script("arguments[0].scroll_into_view();", element) 398 self.chill(self.after_click_wait) 399 return element 400 401 def text(self, locator: str) -> str: 402 """Returns text of WebElement.""" 403 return self.find(locator).text 404 405 def click(self, locator: str) -> WebElement: 406 """Clicks on and returns WebElement.""" 407 element = self.find(locator) 408 element.click() 409 self.chill(self.after_click_wait) 410 return element 411 412 def clear(self, locator: str) -> WebElement: 413 """Clears content of WebElement if able 414 and then returns WebElement.""" 415 element = self.find(locator) 416 element.clear() 417 self.chill(self.after_click_wait) 418 return 
element 419 420 def switch_to_iframe(self, locator: str): 421 """Switch to an iframe from given locator.""" 422 self.browser.switch_to.frame(self.find(locator)) 423 424 def switch_to_parent_frame(self): 425 """Move up a frame level from current frame.""" 426 self.browser.switch_to.parent_frame() 427 428 def select( 429 self, locator: str, method: str, choice: str | int | tuple 430 ) -> WebElement: 431 """Select a choice from Select element. 432 Returns the Select element from the locator string, 433 not the option element that is selected. 434 435 :param method: Can be 'value' or 'index' 436 437 :param choice: The option to select. 438 439 If method is 'value', then choice should be 440 the html 'value' attribute of the desired option. 441 442 If method is 'index', choice can either be a single 443 int for the desired option or it can be a two-tuple. 444 If the tuple is provided, a random option between the 445 two indicies (inclusive) will be selected.""" 446 element = self.click(locator) 447 match method: 448 case "value": 449 Select(element).select_by_value(choice) 450 case "index": 451 if type(choice) == tuple: 452 choice = random.randint(choice[0], choice[1]) 453 Select(element).select_by_index(choice) 454 self.chill(self.after_field_wait) 455 return element 456 457 def click_elements( 458 self, locators: list[str], max_selections: int = None, min_selections: int = 1 459 ) -> WebElement: 460 """Click a random number of WebElements 461 and return the last WebElement clicked. 462 463 :param locators: A list of element locators to choose from. 464 465 :param max_selections: The maximum number of elements to click. 466 If None, the maximum will be the length of the locators list. 467 468 :param min_selections: The minimum number of elements to click. 469 470 e.g. self.click_elements([xpath1, xpath2, xpath3, xpath4], max_selections=3) 471 will click between 1 and 3 random elements from the list. 
472 """ 473 if not max_selections: 474 max_selections = len(locators) 475 for option in random.sample( 476 locators, k=random.randint(min_selections, max_selections) 477 ): 478 element = self.click(option) 479 return element 480 481 def get_click_list( 482 self, num_options: int, max_choices: int = 1, min_choices: int = 1 483 ) -> list[str]: 484 """Similar to self.click_elements(), but for use with the self.fill_next() method. 485 486 Creates a list of length 'num_options' where every element is 'skip'. 487 488 A random number of elements in the list between 'min_choices' and 'max_choices' are 489 replaced with 'keys.SPACE' (interpreted as a click by almost all web forms).""" 490 click_list = ["skip"] * num_options 491 selected_indexes = [] 492 for i in range(random.randint(min_choices, max_choices)): 493 index = random.randint(0, num_options - 1) 494 while index in selected_indexes: 495 index = random.randint(0, num_options - 1) 496 selected_indexes.append(index) 497 click_list[index] = self.keys.SPACE 498 return click_list 499 500 def send_keys( 501 self, 502 locator: str, 503 data: str, 504 click_first: bool = True, 505 clear_first: bool = False, 506 ) -> WebElement: 507 """Types data into element and returns the element. 508 509 :param data: The string to send to the element. 510 511 :param click_first: If True, the element is clicked on 512 before the data is sent. 
513 514 :param clear_first: If True, the current text of the element 515 is cleared before the data is sent.""" 516 element = self.click(locator) if click_first else self.find(locator) 517 if clear_first: 518 element.clear() 519 self.chill(self.after_click_wait) 520 if self.one_key_at_a_time: 521 for ch in str(data): 522 element.send_keys(ch) 523 self.chill(self.after_key_wait) 524 else: 525 element.send_keys(str(data)) 526 self.chill(self.after_field_wait) 527 return element 528 529 def fill_next( 530 self, data: list[str | tuple], start_element: WebElement = None 531 ) -> WebElement: 532 """Fills a form by tabbing from the current WebElement 533 to the next one and using the corresponding item in data. 534 Returns the last WebElement. 535 536 :param data: A list of form data. If an item is a string (except for 'skip') 537 it will be typed into the current WebElement. 538 539 An item in data can be a two-tuple of the form 540 ('downArrow', numberOfPresses:int|tuple[int, int]). 541 542 If numberOfPresses is a single int, Keys.ARROW_DOWN will be sent 543 that many times to the WebElement. 544 545 If numberOfPresses is a tuple, Keys.ARROW_DOWN will be sent a random 546 number of times between numberOfPresses[0] and numberOfPresses[1] inclusive. 547 This is typically for use with Select elements. 548 549 An item in data can also be 'skip', which will perform no action on the current 550 WebElement and will continue to the next one. 551 552 :param start_element: The WebElement to start tabbing from. 553 The currently active element will be used if start_element is None. 554 555 Note: The function tabs to the next element before sending data, 556 so the start_element should the WebElement before the one 557 that should receive data[0]. 
558 """ 559 element = ( 560 self.browser.switch_to.active_element 561 if not start_element 562 else start_element 563 ) 564 for datum in data: 565 element.send_keys(Keys.TAB) 566 element = self.browser.switch_to.active_element 567 self.chill(self.after_key_wait) 568 if datum[0] == "downArrow": 569 if type(datum[1]) == tuple: 570 times = random.randint(datum[1][0], datum[1][1]) 571 else: 572 times = datum[1] 573 for _ in range(times): 574 element.send_keys(Keys.ARROW_DOWN) 575 self.chill(self.after_key_wait) 576 elif datum == "skip": 577 self.chill(self.after_key_wait) 578 else: 579 if self.turbo_engaged: 580 element.send_keys(str(datum)) 581 else: 582 for ch in str(datum): 583 element.send_keys(ch) 584 self.chill(self.after_key_wait) 585 self.chill(self.after_field_wait) 586 return element 587 588 def wait_until( 589 self, condition: LambdaType, max_wait: float = 10, polling_interval: float = 0.1 590 ): 591 """Checks condition repeatedly until either it is true, 592 or the max_wait is exceeded. 593 594 Raises a TimeoutError if the condition doesn't success within max_wait. 595 596 Useful for determing whether a form has been successfully submitted. 597 598 :param condition: The condition function to check. 599 600 :param max_wait: Number of seconds to continue checking condition 601 before throwing a TimeoutError. 602 603 :param polling_interval: The number of seconds to sleep before 604 checking the condition function again after it fails. 605 606 e.g. 
self.wait_until(lambda: 'Successfully Submitted' in self.text('//p[@id="form-output"]))""" 607 start_time = time.time() 608 while True: 609 try: 610 if condition(): 611 time.sleep(1) 612 break 613 elif (time.time() - start_time) > max_wait: 614 raise TimeoutError(f"max_wait exceeded in wait_until({condition})") 615 else: 616 time.sleep(polling_interval) 617 except: 618 if (time.time() - start_time) > max_wait: 619 raise TimeoutError(f"max_wait exceeded in wait_until({condition})") 620 else: 621 time.sleep(polling_interval) 622 623 def dismiss_alert(self): 624 """Dismiss alert dialog.""" 625 self.browser.switch_to.alert.dismiss() 626 627 def solve_recaptcha_v3( 628 self, 629 outer_iframe_xpath: str = '//iframe[@title="reCAPTCHA"]', 630 inner_iframe_xpath: str = '//iframe[@title="recaptcha challenge expires in two minutes"]', 631 ): 632 """Pass google recaptcha v3 by solving an audio puzzle. 633 634 :param outer_iframe_xpath: Xpath to the iframe containing the recaptcha checkbox. 635 If it's the recaptcha without the initial checkbox that just shows the image puzzle, 636 pass None to this argument. 637 638 """ 639 locator_method = self.locator_method 640 self.locator_method = "xpath" 641 try: 642 if outer_iframe_xpath: 643 self.switch_to_iframe(outer_iframe_xpath) 644 self.click('//*[@id="recaptcha-anchor"]') 645 self.switch_to_parent_frame() 646 self.switch_to_iframe(inner_iframe_xpath) 647 self.click('//*[@id="recaptcha-audio-button"]') 648 mp3_url = self.find( 649 '//a[@class="rc-audiochallenge-tdownload-link"]' 650 ).get_attribute("href") 651 text = get_text_from_url(mp3_url, ".mp3") 652 self.send_keys('//*[@id="audio-response"]', text) 653 self.click('//*[@id="recaptcha-verify-button"]') 654 except Exception as e: 655 print(e) 656 raise Exception("Could not solve captcha") 657 finally: 658 self.switch_to_parent_frame() 659 self.locator_method = locator_method
Sits on top of selenium to streamline automation and scraping tasks.
def __init__(
    self,
    headless: bool = False,
    browser_type: str = "firefox",
    implicit_wait: int = 10,
    page_load_timeout: int = 60,
    open_browser: bool = True,
    locator_method: str = "xpath",
    randomize_user_agent: bool = True,
    user_agent_rotation_period: int = None,
    move_window_by: tuple[int, int] = (0, -1000),
    download_dir: str | Path = None,
    driver_path: str | Path = None,
):
    """Store the configuration and, optionally, open the browser.

    :param headless: If True, browser window will not be visible.

    :param browser_type: Which browser to use. Can be 'firefox' or 'chrome'
    (case-insensitive).

    :param implicit_wait: Number of seconds to look for a specified element before
    selenium considers it missing and throws an exception.

    :param page_load_timeout: Time in seconds for selenium to wait for a page to load
    before throwing an exception.

    :param open_browser: If True, opens a browser window when a User object is created.
    If False, a manual call to self.open_browser() must be made.

    :param locator_method: The locator type User should expect to be given.
    Can be 'xpath', 'id', 'className', 'name', or 'cssSelector'.
    Every member function with a 'locator' argument refers to a string matching
    the current locator_method.

    :param randomize_user_agent: If True, a random useragent will be used whenever
    the browser is opened. If False, the native useragent will be used.

    :param user_agent_rotation_period: If not None, the browser window will be closed
    and reopened with a new useragent every user_agent_rotation_period number of minutes.
    Rotation occurs on the first call to self.get() after the time period has elapsed.
    Ignored if randomize_user_agent is False.

    :param move_window_by: The x and y amount of pixels to move the browser window by after opening.

    :param download_dir: The download folder to use. If None, the default folder will be used.

    :param driver_path: The path to the webdriver executable selenium should use.
    If falsy, self.search_for_driver() is called to look for it.

    :raises ValueError: If browser_type is neither 'firefox' nor 'chrome'.
    """
    self.headless = headless
    normalized_type = browser_type.lower()
    if normalized_type not in ["firefox", "chrome"]:
        raise ValueError("'browser_type' parameter must be 'firefox' or 'chrome'")
    self.browser_type = normalized_type
    self.browser_open = False

    # Plain configuration values.
    self.implicit_wait = implicit_wait
    self.page_load_timeout = page_load_timeout

    # rotation_timer is started by open_browser(); timer starts right away.
    self.rotation_timer = Timer()
    self.timer = Timer()
    self.timer.start()

    self.randomize_user_agent = randomize_user_agent
    self.user_agent_rotation_period = user_agent_rotation_period
    self.locator_method = locator_method
    # Start with turbo engaged (the default of self.turbo()).
    self.turbo()
    self.keys = Keys
    self.move_window_by = move_window_by
    self.download_dir = download_dir

    self.driver_path = driver_path
    if not driver_path:
        self.search_for_driver()

    if not open_browser:
        self.browser = None
    else:
        self.open_browser()
    # Make sure the browser is shut down when the interpreter exits.
    atexit.register(self.close_browser)
Parameters
headless: If True, browser window will not be visible.
browser_type: Which browser to use. Can be 'firefox' or 'chrome'.
implicit_wait: Number of seconds to look for a specified element before selenium considers it missing and throws an exception.
page_load_timeout: Time in seconds for selenium to wait for a page to load before throwing an exception.
open_browser: If True, opens a browser window when a User object is created. If False, a manual call to self.open_browser() must be made.
locator_method: The locator type User should expect to be given. Can be 'xpath', 'id', 'className', 'name', or 'cssSelector'. Every member function with a 'locator' argument refers to a string matching the current locator_method.
randomize_user_agent: If True, a random useragent will be used whenever the browser is opened. If False, the native useragent will be used.
user_agent_rotation_period: If not None, the browser window will be closed and reopened with a new useragent every user_agent_rotation_period number of minutes. Rotation occurs on the first call to self.get() after the time period has elapsed. Ignored if randomize_user_agent is False.
move_window_by: The x and y amount of pixels to move the browser window by after opening.
download_dir: The download folder to use. If None, the default folder will be used.
driver_path: The path to the webdriver executable selenium should use. If None, the system PATH will be checked for the executable. If the executable isn't found, the parent directories and the immediate child directories of the current working directory will be searched.
def configure_firefox(self) -> FirefoxService:
    """Configure options, profile and service for firefox.

    Sets self.options, self.profile and self.service.
    self.profile is None unless self.download_dir is set, in which case
    the download folder is created (if needed) and a FirefoxProfile
    pointing at it is built.

    :return: The FirefoxService that was stored in self.service.
    """
    self.options = FirefoxOptions()
    self.options.headless = self.headless
    self.options.set_preference(
        "widget.windows.window_occlusion_tracking.enabled", False
    )
    self.options.set_preference("dom.webaudio.enabled", False)
    if self.randomize_user_agent:
        self.options.set_preference("general.useragent.override", get_agent())
    if self.download_dir:
        Path(self.download_dir).mkdir(parents=True, exist_ok=True)
        self.profile = FirefoxProfile()
        self.profile.set_preference("browser.download.dir", str(self.download_dir))
        # folderList == 2 makes firefox use the custom browser.download.dir.
        self.profile.set_preference("browser.download.folderList", 2)
    else:
        self.profile = None
    self.service = FirefoxService(
        executable_path=str(self.driver_path), log_path=os.devnull
    )
    # The signature promises a FirefoxService; previously None was returned.
    return self.service
Configure options and profile for firefox.
def configure_chrome(self) -> ChromeService:
    """Configure options and service for chrome.

    Sets self.options and self.service. If self.download_dir is set,
    the folder is created (if needed) and used as chrome's default
    download directory.

    :return: The ChromeService that was stored in self.service.
    """
    self.options = ChromeOptions()
    self.options.headless = self.headless
    for argument in (
        "--disable-blink-features=AutomationControlled",
        "--mute-audio",
        "--disable-infobars",
        "--disable-notifications",
        "--log-level=3",
    ):
        self.options.add_argument(argument)
    if self.randomize_user_agent:
        self.options.add_argument(f"--user-agent={get_agent()}")
    self.options.add_experimental_option("useAutomationExtension", False)
    if self.download_dir:
        Path(self.download_dir).mkdir(parents=True, exist_ok=True)
        self.options.add_experimental_option(
            "prefs", {"download.default_directory": str(self.download_dir)}
        )
    self.service = ChromeService(
        executable_path=str(self.driver_path), log_path=os.devnull
    )
    # The signature promises a ChromeService; previously None was returned.
    return self.service
Configure options and profile for chrome.
159 def search_for_driver(self): 160 """Searches for the webdriver executable.""" 161 cwd = Path.cwd() 162 found = False 163 match self.browser_type: 164 case "firefox": 165 driver = "geckodriver.exe" 166 case "chrome": 167 driver = "chromedriver.exe" 168 # search PATH 169 env_path = os.environ["PATH"] 170 if sys.platform == "win32": 171 env_paths = env_path.split(";") 172 else: 173 env_paths = env_path.split(":") 174 driver = driver[: driver.find(".")] 175 for path in env_paths: 176 if (Path(path) / driver).exists(): 177 self.driver_path = Path(path) / driver 178 found = True 179 break 180 # check current working directory and parent folders 181 if not found: 182 while cwd != cwd.parent: 183 if (cwd / driver).exists(): 184 self.driver_path = cwd / driver 185 found = True 186 break 187 cwd = cwd.parent 188 # check top most level 189 if not found and (cwd / driver).exists(): 190 self.driver_path = cwd / driver 191 found = True 192 # check child folders (only 1 level down) 193 if not found: 194 for child in Path.cwd().iterdir(): 195 if child.is_dir() and (child / driver).exists(): 196 self.driver_path = child / driver 197 found = True 198 if not found: 199 warn(f"Could not find {driver}")
Searches for the webdriver executable.
def set_implicit_wait(self, wait_time: int = None):
    """Sets the implicit wait of the browser.

    :param wait_time: Number of seconds to wait. If None, the default
    self.implicit_wait is used. An explicit 0 is respected.
    """
    # Compare against None so that 0 is not mistaken for "no argument".
    if wait_time is None:
        wait_time = self.implicit_wait
    self.browser.implicitly_wait(wait_time)
Sets to default time if no arg given.
def open_browser(self):
    """Configures and opens selenium browser.

    Issues a warning and does nothing if the browser is already open.
    Otherwise the driver for self.browser_type is configured and started,
    the implicit wait and page load timeout are applied, the window is
    maximized and moved by self.move_window_by, self.tab_index is reset
    to 0 and self.rotation_timer is started.
    """
    if self.browser_open:
        warn("Browser already open.")
        return

    if self.browser_type == "firefox":
        self.configure_firefox()
        self.browser = webdriver.Firefox(
            options=self.options,
            service=self.service,
            firefox_profile=self.profile,
        )
    elif self.browser_type == "chrome":
        self.configure_chrome()
        self.browser = webdriver.Chrome(options=self.options, service=self.service)

    self.set_implicit_wait()
    # The window is maximized both before and after being moved.
    self.browser.maximize_window()
    x_offset = self.move_window_by[0]
    y_offset = self.move_window_by[1]
    self.browser.set_window_position(x_offset, y_offset)
    self.browser.maximize_window()
    self.browser.set_page_load_timeout(self.page_load_timeout)

    self.browser_open = True
    self.tab_index = 0
    self.rotation_timer.start()
Configures and opens selenium browser.
def close_browser(self):
    """Quit the browser if it is currently open.

    Does nothing when self.browser_open is already False.
    """
    if not self.browser_open:
        return
    # Flip the flag first, then shut the driver down.
    self.browser_open = False
    self.browser.quit()
Close browser window.
def open_tab(self, url: str = "", switch_to_tab: bool = True):
    """Opens new tab and, if provided, goes to url.

    :param url: The url to open in the new tab.

    :param switch_to_tab: If True, the new tab becomes the active tab
    and self.tab_index is updated to its index in the window handle list.
    """
    handles_before = set(self.browser.window_handles)
    self.script("window.open(arguments[0]);", url)
    if not switch_to_tab:
        return
    # Locate the handle that did not exist before the call instead of
    # assuming the new tab sits right after the current one; that guess
    # picks the wrong tab when other tabs follow the active tab.
    new_indexes = [
        index
        for index, handle in enumerate(self.browser.window_handles)
        if handle not in handles_before
    ]
    # Fall back to the previous guess if no new handle showed up.
    self.switch_to_tab(new_indexes[-1] if new_indexes else self.tab_index + 1)
Opens new tab and, if provided, goes to url.
New tab is inserted after currently active tab.
def switch_to_tab(self, tab_index: int):
    """Make the tab at tab_index (zero indexed) the active one.

    :param tab_index: Index into the browser's list of window handles.
    The value is also stored in self.tab_index.
    """
    target_handle = self.browser.window_handles[tab_index]
    self.browser.switch_to.window(target_handle)
    self.tab_index = tab_index
Switch to a tab in browser, zero indexed.
def get_num_tabs(self) -> int:
    """Count the window handles the browser currently reports."""
    open_handles = self.browser.window_handles
    return len(open_handles)
Returns number of tabs open.
def close_tab(self, tab_index: int = 1):
    """Close the tab at tab_index, then make tab 0 the active tab.

    :param tab_index: Zero based index of the tab to close.
    """
    # The tab has to be active before the driver can close it.
    self.switch_to_tab(tab_index)
    self.browser.close()
    # Always land on the first tab afterwards.
    self.switch_to_tab(0)
Close specified tab and switches to tab index 0.
def get(self, url: str):
    """Load url in the browser, rotating the useragent first if it is due.

    The browser is opened if it is not open yet. When useragent
    randomization is on, a rotation period is set and more than that many
    minutes have elapsed on self.rotation_timer, the browser is closed and
    reopened before the page is requested.

    After loading, navigator.webdriver is overridden to report false
    and the call sleeps according to self.arrival_wait.

    :param url: The address to request.
    """
    if not self.browser_open:
        self.open_browser()

    rotation_due = (
        self.randomize_user_agent
        and self.user_agent_rotation_period is not None
        and self.rotation_timer.check(format=False)
        > (60 * self.user_agent_rotation_period)
    )
    if rotation_due:
        self.rotation_timer.stop()
        self.close_browser()
        self.open_browser()

    self.browser.get(url)
    self.script("Object.defineProperty(navigator, 'webdriver', {get: () => false})")
    self.chill(self.arrival_wait)
Requests webpage at given url and rotates userAgent if necessary.
def get_soup(self) -> BeautifulSoup:
    """Parse the current page source with BeautifulSoup's 'html.parser'
    and return the resulting BeautifulSoup object."""
    page_source = self.browser.page_source
    return BeautifulSoup(page_source, "html.parser")
Returns a BeautifulSoup object of the current page source.
def current_url(self) -> str:
    """Return the url the browser reports for the active tab."""
    active_url = self.browser.current_url
    return active_url
Returns current url of active tab.
def turbo(self, engage: bool = True):
    """Switch between fast and slowed-down interaction timing.

    When engaged, strings will be sent
    to elements all at once and there will be
    no waiting after key presses, fields or clicks
    (only a one second wait after arriving at a page).

    When disengaged, strings will be sent to elements
    'one key at a time' with randomized amounts of
    time between successive keys and after actions.

    :param engage: True to engage turbo, False to disengage it.
    """
    if engage:
        key_wait, field_wait = (0, 0), (0, 0)
        click_wait, arrival = (0, 0), (1, 1)
    else:
        key_wait, field_wait = (0.1, 0.5), (1, 2)
        click_wait, arrival = (0.25, 1.5), (4, 10)
    # Each wait is a (min, max) range in seconds, consumed by self.chill().
    self.after_key_wait = key_wait
    self.after_field_wait = field_wait
    self.after_click_wait = click_wait
    self.arrival_wait = arrival
    self.one_key_at_a_time = not engage
    self.turbo_engaged = bool(engage)
When engaged, strings will be sent to elements all at once and there will be no waiting after actions.
When disengaged, strings will be sent to elements 'one key at a time' with randomized amounts of time between successive keys and after actions.
def chill(self, min_max: tuple[float, float]):
    """Pause for a random duration.

    :param min_max: The bounds, in seconds, of the uniformly
    distributed sleep time: (shortest, longest).
    """
    shortest, longest = min_max[0], min_max[1]
    duration = random.uniform(shortest, longest)
    time.sleep(duration)
Sleeps a random amount between min_max[0] and min_max[1].
def script(self, script: str, args: Any = None) -> Any:
    """Run javascript in the browser and hand back whatever it returns.

    :param script: The javascript source to execute.

    :param args: A single value passed to the script; it is always
    forwarded, so the script sees None when nothing is given.
    """
    result = self.browser.execute_script(script, args)
    return result
Execute javascript code and returns result.
def remove(self, locator: str):
    """Delete the element matching locator from the DOM.

    :param locator: Locator string for the element to remove.
    """
    doomed = self.find(locator)
    self.script("arguments[0].remove();", doomed)
Removes element from DOM.
def get_length(self, locator: str) -> int:
    """Returns number of child elements for a given element.

    The element's own 'length' property is used when it has one;
    otherwise its 'childElementCount' is returned.

    :param locator: Locator string for the element to measure.
    """
    # Most elements have no 'length' property, which previously made
    # int() fail on None. Fall back to the element's child count.
    length_script = (
        "var n = arguments[0].length;"
        " return (n === undefined || n === null)"
        " ? arguments[0].childElementCount : n;"
    )
    return int(self.script(length_script, self.find(locator)))
Returns number of child elements for a given element.
338 def find(self, locator: str) -> WebElement: 339 """Finds and returns a WebElement.""" 340 match self.locator_method: 341 case "xpath": 342 return self.browser.find_element(By.XPATH, locator) 343 case "id": 344 return self.browser.find_element(By.ID, locator) 345 case "className": 346 return self.browser.find_element(By.CLASS_NAME, locator) 347 case "name": 348 return self.browser.find_element(By.NAME, locator) 349 case "cssSelector": 350 return self.browser.find_element(By.CSS_SELECTOR, locator)
Finds and returns a WebElement.
def find_children(self, locator: str) -> list[WebElement]:
    """Collect the direct children of the element matching locator.

    :param locator: Locator string for the parent element.

    :return: The elements matched by the relative xpath './*'.
    """
    parent = self.find(locator)
    children = parent.find_elements("xpath", "./*")
    return children
Returns a list of child WebElements for given locator arg.
def scroll(self, amount: int = None, fraction: float = None):
    """Scroll web page.

    :param amount: The number of pixels to scroll if not None.
    Negative values scroll up.

    :param fraction: The amount between 0.0 and 1.0
    of the remaining page height (scroll height minus the
    current vertical offset) to scroll.

    If values are provided for both arguments,
    amount will be used.

    If values are provided for neither argument,
    the entire page length will be scrolled.

    Scrolls one pixel at a time, sleeping after each step,
    if turbo is not engaged."""
    # Compare against None so an explicit 0 / 0.0 means "do not scroll"
    # rather than falling through to "scroll the whole page".
    if amount is not None:
        amount_to_scroll = amount
    elif fraction is not None:
        page_height = int(self.script("return document.body.scrollHeight;"))
        current_offset = int(self.script("return window.pageYOffset;"))
        amount_to_scroll = int(fraction * (page_height - current_offset))
    else:
        amount_to_scroll = int(self.script("return document.body.scrollHeight;"))

    if self.turbo_engaged:
        self.script("window.scrollBy(0,arguments[0]);", amount_to_scroll)
        return
    if amount_to_scroll >= 0:
        step_script = "window.scrollBy(0,1);"
    else:
        step_script = "window.scrollBy(0,-1);"
    for _ in range(abs(amount_to_scroll)):
        self.script(step_script)
        self.chill(self.after_click_wait)
Scroll web page.
Parameters
amount: The number of pixels to scroll if not None.
fraction: The amount between 0.0 and 1.0 of the page height to scroll.
If values are provided for both arguments, amount will be used.
If values are provided for neither argument, the entire page length will be scrolled.
Scrolls one pixel at a time if turbo is disengaged (self.turbo_engaged is False).
def scroll_into_view(self, locator: str) -> WebElement:
    """Scrolls to a given element and returns the element.

    :param locator: Locator string for the element to scroll to.
    """
    element = self.find(locator)
    # The DOM method is camelCase 'scrollIntoView'; the snake_case
    # spelling does not exist in javascript and raised a script error.
    self.script("arguments[0].scrollIntoView();", element)
    self.chill(self.after_click_wait)
    return element
Scrolls to a given element and returns the element.
def text(self, locator: str) -> str:
    """Look up the element matching locator and return its text.

    :param locator: Locator string for the element to read.
    """
    element = self.find(locator)
    return element.text
Returns text of WebElement.
def click(self, locator: str) -> WebElement:
    """Click the element matching locator, wait, and return the element.

    The wait is drawn from self.after_click_wait.

    :param locator: Locator string for the element to click.
    """
    target = self.find(locator)
    target.click()
    self.chill(self.after_click_wait)
    return target
Clicks on and returns WebElement.
def clear(self, locator: str) -> WebElement:
    """Clear the content of the element matching locator,
    wait according to self.after_click_wait, and return the element.

    :param locator: Locator string for the element to clear.
    """
    target = self.find(locator)
    target.clear()
    self.chill(self.after_click_wait)
    return target
Clears content of WebElement if able and then returns WebElement.
def switch_to_iframe(self, locator: str):
    """Move the browser's focus into the iframe matching locator.

    :param locator: Locator string for the iframe element.
    """
    frame_element = self.find(locator)
    self.browser.switch_to.frame(frame_element)
Switch to an iframe from given locator.
def switch_to_parent_frame(self):
    """Leave the current frame and focus the frame one level above it."""
    frame_switcher = self.browser.switch_to
    frame_switcher.parent_frame()
Move up a frame level from current frame.
def select(
    self, locator: str, method: str, choice: str | int | tuple
) -> WebElement:
    """Select a choice from Select element.
    Returns the Select element from the locator string,
    not the option element that is selected.

    :param locator: Locator string for the Select element.
    The element is clicked before the option is chosen.

    :param method: Can be 'value' or 'index'

    :param choice: The option to select.

    If method is 'value', then choice should be
    the html 'value' attribute of the desired option.

    If method is 'index', choice can either be a single
    int for the desired option or it can be a two-tuple.
    If the tuple is provided, a random option between the
    two indices (inclusive) will be selected.

    :raises ValueError: If method is neither 'value' nor 'index'."""
    # Reject an unknown method up front; it used to click the element
    # and then silently select nothing.
    if method not in ("value", "index"):
        raise ValueError(
            f"'method' parameter must be 'value' or 'index', not {method!r}"
        )
    element = self.click(locator)
    dropdown = Select(element)
    if method == "value":
        dropdown.select_by_value(choice)
    else:
        if isinstance(choice, tuple):
            choice = random.randint(choice[0], choice[1])
        dropdown.select_by_index(choice)
    self.chill(self.after_field_wait)
    return element
Select a choice from Select element. Returns the Select element from the locator string, not the option element that is selected.
Parameters
method: Can be 'value' or 'index'
choice: The option to select.
If method is 'value', then choice should be the html 'value' attribute of the desired option.
If method is 'index', choice can either be a single int for the desired option or it can be a two-tuple. If the tuple is provided, a random option between the two indices (inclusive) will be selected.
def click_elements(
    self, locators: list[str], max_selections: int = None, min_selections: int = 1
) -> WebElement:
    """Click a random number of WebElements
    and return the last WebElement clicked.

    Returns None if no element was clicked
    (e.g. locators is empty or zero elements were drawn).

    :param locators: A list of element locators to choose from.

    :param max_selections: The maximum number of elements to click.
    If None (or 0), the maximum will be the length of the locators list.
    Values larger than the list are capped to its length.

    :param min_selections: The minimum number of elements to click.
    Capped to the effective maximum and never below 0.

    e.g. self.click_elements([xpath1, xpath2, xpath3, xpath4], max_selections=3)
    will click between 1 and 3 random elements from the list.
    """
    if not max_selections:
        max_selections = len(locators)
    # random.sample cannot draw more items than the list holds and
    # random.randint needs lower <= upper, so clamp both bounds.
    upper = max(0, min(max_selections, len(locators)))
    lower = max(0, min(min_selections, upper))
    # Stays None when nothing is clicked instead of being unbound.
    element = None
    for option in random.sample(locators, k=random.randint(lower, upper)):
        element = self.click(option)
    return element
Click a random number of WebElements and return the last WebElement clicked.
Parameters
locators: A list of element locators to choose from.
max_selections: The maximum number of elements to click. If None, the maximum will be the length of the locators list.
min_selections: The minimum number of elements to click.
e.g. self.click_elements([xpath1, xpath2, xpath3, xpath4], max_selections=3) will click between 1 and 3 random elements from the list.
def get_click_list(
    self, num_options: int, max_choices: int = 1, min_choices: int = 1
) -> list[str]:
    """Similar to self.click_elements(), but for use with the self.fill_next() method.

    Creates a list of length 'num_options' where every element is 'skip'.

    A random number of elements in the list between 'min_choices' and 'max_choices' are
    replaced with 'keys.SPACE' (interpreted as a click by almost all web forms).

    :param num_options: The length of the returned list.

    :param max_choices: The maximum number of elements to replace.

    :param min_choices: The minimum number of elements to replace.

    If the number of choices drawn exceeds 'num_options',
    every element is replaced."""
    click_list = ["skip"] * num_options
    num_choices = random.randint(min_choices, max_choices)
    # Only num_options distinct positions exist. Re-rolling for an unused
    # index would never terminate once they are all taken, so cap the count.
    num_choices = max(0, min(num_choices, num_options))
    for index in random.sample(range(num_options), k=num_choices):
        click_list[index] = self.keys.SPACE
    return click_list
Similar to self.click_elements(), but for use with the self.fill_next() method.
Creates a list of length 'num_options' where every element is 'skip'.
A random number of elements in the list between 'min_choices' and 'max_choices' are replaced with 'keys.SPACE' (interpreted as a click by almost all web forms).
def send_keys(
    self,
    locator: str,
    data: str,
    click_first: bool = True,
    clear_first: bool = False,
) -> WebElement:
    """Type data into an element and return the element.

    :param locator: Locator string for the element to type into.

    :param data: The string to send to the element.
    It is converted with str() before being sent.

    :param click_first: If True, the element is clicked on
    before the data is sent. Otherwise it is only looked up with self.find().

    :param clear_first: If True, the current text of the element
    is cleared before the data is sent."""
    if click_first:
        element = self.click(locator)
    else:
        element = self.find(locator)
    if clear_first:
        element.clear()
        self.chill(self.after_click_wait)
    text = str(data)
    if not self.one_key_at_a_time:
        # Send the whole string in a single call.
        element.send_keys(text)
    else:
        # Send one character per call, pausing after each one.
        for character in text:
            element.send_keys(character)
            self.chill(self.after_key_wait)
    self.chill(self.after_field_wait)
    return element
Types data into element and returns the element.
Parameters
data: The string to send to the element.
click_first: If True, the element is clicked on before the data is sent.
clear_first: If True, the current text of the element is cleared before the data is sent.
def fill_next(
    self, data: list[str | tuple], start_element: WebElement = None
) -> WebElement:
    """Fills a form by tabbing from the current WebElement
    to the next one and using the corresponding item in data.
    Returns the last WebElement.

    :param data: A list of form data. If an item is a string (except for 'skip')
    it will be typed into the current WebElement.
    Items that are not strings or 'downArrow' tuples are converted
    with str() and typed the same way.

    An item in data can be a two-tuple of the form
    ('downArrow', numberOfPresses:int|tuple[int, int]).

    If numberOfPresses is a single int, Keys.ARROW_DOWN will be sent
    that many times to the WebElement.

    If numberOfPresses is a tuple, Keys.ARROW_DOWN will be sent a random
    number of times between numberOfPresses[0] and numberOfPresses[1] inclusive.
    This is typically for use with Select elements.

    An item in data can also be 'skip', which will perform no action on the current
    WebElement and will continue to the next one.

    An empty string types nothing into the current WebElement.

    :param start_element: The WebElement to start tabbing from.
    The currently active element will be used if start_element is None.

    Note: The function tabs to the next element before sending data,
    so the start_element should be the WebElement before the one
    that should receive data[0].
    """
    element = start_element or self.browser.switch_to.active_element
    for datum in data:
        element.send_keys(Keys.TAB)
        element = self.browser.switch_to.active_element
        self.chill(self.after_key_wait)
        # Check the type before indexing: an empty string or a
        # non-subscriptable item (e.g. an int) must not raise on datum[0].
        is_down_arrow = (
            isinstance(datum, (tuple, list))
            and len(datum) > 1
            and datum[0] == "downArrow"
        )
        if is_down_arrow:
            presses = datum[1]
            if isinstance(presses, (tuple, list)):
                presses = random.randint(presses[0], presses[1])
            for _ in range(presses):
                element.send_keys(Keys.ARROW_DOWN)
                self.chill(self.after_key_wait)
        elif datum == "skip":
            self.chill(self.after_key_wait)
        elif self.turbo_engaged:
            # Send the whole value in a single call.
            element.send_keys(str(datum))
        else:
            # Send one character per call, pausing after each one.
            for ch in str(datum):
                element.send_keys(ch)
                self.chill(self.after_key_wait)
        self.chill(self.after_field_wait)
    return element
Fills a form by tabbing from the current WebElement to the next one and using the corresponding item in data. Returns the last WebElement.
Parameters
- data: A list of form data. If an item is a string (except for 'skip') it will be typed into the current WebElement.
An item in data can be a two-tuple of the form ('downArrow', numberOfPresses:int|tuple[int, int]).
If numberOfPresses is a single int, Keys.ARROW_DOWN will be sent that many times to the WebElement.
If numberOfPresses is a tuple, Keys.ARROW_DOWN will be sent a random number of times between numberOfPresses[0] and numberOfPresses[1] inclusive. This is typically for use with Select elements.
An item in data can also be 'skip', which will perform no action on the current WebElement and will continue to the next one.
- start_element: The WebElement to start tabbing from. The currently active element will be used if start_element is None.
Note: The function tabs to the next element before sending data, so the start_element should be the WebElement before the one that should receive data[0].
def wait_until(
    self, condition: LambdaType, max_wait: float = 10, polling_interval: float = 0.1
):
    """Checks condition repeatedly until either it is true,
    or the max_wait is exceeded.

    Raises a TimeoutError if the condition doesn't succeed within max_wait.

    Useful for determining whether a form has been successfully submitted.

    :param condition: The condition function to check.
    An exception raised by condition counts as a failed check.

    :param max_wait: Number of seconds to continue checking condition
    before throwing a TimeoutError.

    :param polling_interval: The number of seconds to sleep before
    checking the condition function again after it fails.

    :raises TimeoutError: If max_wait is exceeded. If the final check raised
    an exception, that exception is attached as the cause.

    e.g. self.wait_until(lambda: 'Successfully Submitted' in self.text('//p[@id="form-output"]'))"""
    start_time = time.time()
    while True:
        last_error = None
        try:
            if condition():
                # Fixed one second pause after the condition succeeds.
                time.sleep(1)
                return
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed by the polling loop.
        except Exception as error:
            last_error = error
        if (time.time() - start_time) > max_wait:
            raise TimeoutError(
                f"max_wait exceeded in wait_until({condition})"
            ) from last_error
        time.sleep(polling_interval)
Checks condition repeatedly until either it is true, or the max_wait is exceeded.
Raises a TimeoutError if the condition doesn't succeed within max_wait.
Useful for determining whether a form has been successfully submitted.
Parameters
condition: The condition function to check.
max_wait: Number of seconds to continue checking condition before throwing a TimeoutError.
polling_interval: The number of seconds to sleep before checking the condition function again after it fails.
e.g. self.wait_until(lambda: 'Successfully Submitted' in self.text('//p[@id="form-output"]'))
def dismiss_alert(self):
    """Switch to the browser's alert dialog and dismiss it."""
    alert = self.browser.switch_to.alert
    alert.dismiss()
Dismiss alert dialog.
def solve_recaptcha_v3(
    self,
    outer_iframe_xpath: str = '//iframe[@title="reCAPTCHA"]',
    inner_iframe_xpath: str = '//iframe[@title="recaptcha challenge expires in two minutes"]',
):
    """Pass google recaptcha v3 by solving an audio puzzle.

    The current locator_method is temporarily switched to 'xpath'
    and restored before returning or raising.

    :param outer_iframe_xpath: Xpath to the iframe containing the recaptcha checkbox.
    If it's the recaptcha without the initial checkbox that just shows the image puzzle,
    pass None to this argument.

    :param inner_iframe_xpath: Xpath to the iframe that is switched to
    before the audio challenge button is clicked.

    :raises Exception: If any step fails. The original error is printed
    and attached as the cause of the raised exception.
    """
    locator_method = self.locator_method
    self.locator_method = "xpath"
    try:
        if outer_iframe_xpath:
            self.switch_to_iframe(outer_iframe_xpath)
            self.click('//*[@id="recaptcha-anchor"]')
            self.switch_to_parent_frame()
        self.switch_to_iframe(inner_iframe_xpath)
        self.click('//*[@id="recaptcha-audio-button"]')
        download_link = self.find('//a[@class="rc-audiochallenge-tdownload-link"]')
        mp3_url = download_link.get_attribute("href")
        text = get_text_from_url(mp3_url, ".mp3")
        self.send_keys('//*[@id="audio-response"]', text)
        self.click('//*[@id="recaptcha-verify-button"]')
    except Exception as e:
        print(e)
        # Chain the original error so the real failure stays in the traceback.
        raise Exception("Could not solve captcha") from e
    finally:
        # Restore the caller's locator method first so it is reset
        # even if leaving the iframe fails.
        self.locator_method = locator_method
        self.switch_to_parent_frame()
Pass google recaptcha v3 by solving an audio puzzle.
Parameters
- outer_iframe_xpath: Xpath to the iframe containing the recaptcha checkbox. If it's the recaptcha without the initial checkbox that just shows the image puzzle, pass None to this argument.