process_bs

View Source

 1#!/usr/local/bin/python3
 2from bs4 import BeautifulSoup
 3import re
 4import time
 5
 6
 7def main(html, outdir):
 8    """
 9    BS4 function to extract the comment-body divs
10    """
11    # Parse the HTML content of the file
12    soup = BeautifulSoup(html, 'html.parser')
13
14    # Find the element(s) in the HTML file you want to extract data from
15    # x11i5rnm xat24cr x1mh8g0r x1vvkbs xdj266r
16    data = soup.find_all(
17        'div', {'class': 'x11i5rnm xat24cr x1mh8g0r x1vvkbs xdj266r'})
18
19    # write the extracted data to a file
20    # filename should have the timestamp
21    filename = outdir + 'output-' + str(time.time()) + '.csv'
22    with open(filename, 'w') as file:
23        for item in data:
24            file.write("\n\"")
25            # remvoe tabs and new lines from the string
26            text = item.text.replace('\t', '').replace('\n', '')
27            text2 = re.sub(r'\s+', ' ', text)
28            file.write(text2)
29            file.write("\"")
30        file.close()
31
32
if __name__ == "__main__":
    # Direct execution does no work; it only points the user to main.py.
    print("This module cannot be run directly. Please run main.py instead.")

def main(html, outdir): View Source

 8def main(html, outdir):
 9    """
10    BS4 function to extract the comment-body divs
11    """
12    # Parse the HTML content of the file
13    soup = BeautifulSoup(html, 'html.parser')
14
15    # Find the element(s) in the HTML file you want to extract data from
16    # x11i5rnm xat24cr x1mh8g0r x1vvkbs xdj266r
17    data = soup.find_all(
18        'div', {'class': 'x11i5rnm xat24cr x1mh8g0r x1vvkbs xdj266r'})
19
20    # write the extracted data to a file
21    # filename should have the timestamp
22    filename = outdir + 'output-' + str(time.time()) + '.csv'
23    with open(filename, 'w') as file:
24        for item in data:
25            file.write("\n\"")
26            # remvoe tabs and new lines from the string
27            text = item.text.replace('\t', '').replace('\n', '')
28            text2 = re.sub(r'\s+', ' ', text)
29            file.write(text2)
30            file.write("\"")
31        file.close()

Uses BeautifulSoup (bs4) to extract the text of the comment-body divs from the given HTML and write it to a timestamped CSV file.