fb_scrapper
#!/usr/local/bin/python3
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

import fb_helper
import process_bs

def set_chrome_options() -> Options:
    """
    Sets Chrome options for Selenium.
    Chrome options for a headless browser are enabled,
    and image loading is disabled to speed up page loads.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    # 2 = block images; saves bandwidth and render time in headless mode.
    chrome_prefs = {"profile.default_content_settings": {"images": 2}}
    chrome_options.add_experimental_option("prefs", chrome_prefs)
    return chrome_options

def main(email, password, urls, out_dir):
    """
    Main scraper function.

    Logs in with the given email and password, then visits each URL in
    `urls` with a headless Chrome instance, expands the comments, and
    passes the page source to process_bs, which writes to `out_dir`.
    """
    chrome_opts = set_chrome_options()
    driver = webdriver.Chrome(options=chrome_opts)
    fb_helper._login(driver, email, password)

    # Scrape each URL in turn.
    for url in urls:
        print("Scraping URL: " + url)
        driver.get(url)
        time.sleep(2)  # give the page time to render
        fb_helper._get_comments(driver)
        # Hand the rendered page source to the BeautifulSoup processor.
        bs_data = driver.page_source
        process_bs.main(bs_data, out_dir)
    print("Done")
    driver.quit()  # quit() ends the whole session; close() only closes the window

if __name__ == '__main__':
    print("This module cannot be run directly. Please run main.py instead.")
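fb_helper and process_bs are separate modules in this repo whose source is not shown here. For orientation, here is a minimal sketch of what a login helper in the shape of fb_helper._login could look like, assuming Facebook's historical "email" / "pass" field IDs and "login" button name (all three are assumptions; the repo's actual helper may differ):

# Hypothetical sketch of a login helper like fb_helper._login; the real
# module is not shown above, and Facebook's form element IDs may change.
import time

from selenium.webdriver.common.by import By

def _login(driver, email, password):
    driver.get("https://www.facebook.com/login")
    time.sleep(2)  # let the login page render
    driver.find_element(By.ID, "email").send_keys(email)    # assumed field ID
    driver.find_element(By.ID, "pass").send_keys(password)  # assumed field ID
    driver.find_element(By.NAME, "login").click()           # assumed button name
    time.sleep(3)  # wait for the post-login redirect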
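The run guard above directs users to main.py. A minimal sketch of such an entry point, assuming a urls.txt file with one post URL per line and argparse-style flags (the file name, flag names, and defaults are illustrative assumptions, not taken from this repo):

#!/usr/local/bin/python3
# Hypothetical main.py entry point -- a sketch, not the repo's actual file.
import argparse

import fb_scrapper

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Scrape comments from Facebook posts.")
    parser.add_argument("--email", required=True, help="Facebook login email")
    parser.add_argument("--password", required=True, help="Facebook login password")
    parser.add_argument("--urls", default="urls.txt", help="file with one post URL per line (assumed name)")
    parser.add_argument("--out-dir", default="output", help="directory for scraped output")
    args = parser.parse_args()

    # Read the URL list, skipping blank lines.
    with open(args.urls) as f:
        urls = [line.strip() for line in f if line.strip()]

    fb_scrapper.main(args.email, args.password, urls, args.out_dir)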