fb_scrapper
#!/usr/local/bin/python3
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

import fb_helper
import process_bs

def set_chrome_options() -> Options:
    """
    Sets Chrome options for Selenium.
    Chrome options for a headless browser are enabled,
    and image loading is disabled to speed up page loads.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    # 2 = block images; saves bandwidth and render time in headless mode.
    chrome_prefs = {"profile.default_content_settings": {"images": 2}}
    chrome_options.add_experimental_option("prefs", chrome_prefs)
    return chrome_options

def main(email, password, urls, out_dir):
    """
    Main scraper function.

    Logs in with the given email and password, then visits each URL in
    `urls` with a headless Chrome instance, expands the comments, and
    passes the page source to process_bs, which writes to `out_dir`.
    """
    chrome_opts = set_chrome_options()
    driver = webdriver.Chrome(options=chrome_opts)
    fb_helper._login(driver, email, password)

    # Scrape each URL in turn.
    for url in urls:
        print("Scraping URL: " + url)
        driver.get(url)
        time.sleep(2)  # give the page time to render
        fb_helper._get_comments(driver)
        # Hand the rendered page source to the BeautifulSoup processor.
        bs_data = driver.page_source
        process_bs.main(bs_data, out_dir)
    print("Done")
    driver.quit()  # quit() ends the whole session; close() only closes the window

if __name__ == '__main__':
    print("This module cannot be run directly. Please run main.py instead.")
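fb_helper and process_bs are separate modules in this repo whose source is not shown here. For orientation, here is a minimal sketch of what a login helper in the shape of fb_helper._login could look like, assuming Facebook's historical "email" / "pass" field IDs and "login" button name (all three are assumptions; the repo's actual helper may differ):

# Hypothetical sketch of a login helper like fb_helper._login; the real
# module is not shown above, and Facebook's form element IDs may change.
import time

from selenium.webdriver.common.by import By

def _login(driver, email, password):
    driver.get("https://www.facebook.com/login")
    time.sleep(2)  # let the login page render
    driver.find_element(By.ID, "email").send_keys(email)    # assumed field ID
    driver.find_element(By.ID, "pass").send_keys(password)  # assumed field ID
    driver.find_element(By.NAME, "login").click()           # assumed button name
    time.sleep(3)  # wait for the post-login redirect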
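The run guard above directs users to main.py. A minimal sketch of such an entry point, assuming a urls.txt file with one post URL per line and argparse-style flags (the file name, flag names, and defaults are illustrative assumptions, not taken from this repo):

#!/usr/local/bin/python3
# Hypothetical main.py entry point -- a sketch, not the repo's actual file.
import argparse

import fb_scrapper

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Scrape comments from Facebook posts.")
    parser.add_argument("--email", required=True, help="Facebook login email")
    parser.add_argument("--password", required=True, help="Facebook login password")
    parser.add_argument("--urls", default="urls.txt", help="file with one post URL per line (assumed name)")
    parser.add_argument("--out-dir", default="output", help="directory for scraped output")
    args = parser.parse_args()

    # Read the URL list, skipping blank lines.
    with open(args.urls) as f:
        urls = [line.strip() for line in f if line.strip()]

    fb_scrapper.main(args.email, args.password, urls, args.out_dir)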