process_bs
#!/usr/local/bin/python3
"""Extract comment-body divs from an HTML document into a CSV file."""
from bs4 import BeautifulSoup
import re
import time


def main(html, outdir):
    """
    BS4 function to extract the comment-body divs.

    Parses ``html``, collects the text of every matching ``div`` and writes
    one double-quoted row per div to ``<outdir>output-<timestamp>.csv``.

    Args:
        html: Markup handed straight to ``BeautifulSoup`` (anything its
            constructor accepts).
        outdir: String prefix for the output file. It is concatenated as-is,
            so it must already end with a path separator if it names a
            directory.

    Returns:
        None. The result is the file written to disk.
    """
    # Parse the HTML content of the file
    soup = BeautifulSoup(html, 'html.parser')

    # Find the element(s) in the HTML file you want to extract data from.
    # NOTE(review): a multi-class string is expected to match only divs whose
    # class attribute is exactly this string -- confirm against the bs4 docs.
    data = soup.find_all(
        'div', {'class': 'x11i5rnm xat24cr x1mh8g0r x1vvkbs xdj266r'})

    # Write the extracted data to a file whose name carries the timestamp.
    filename = outdir + 'output-' + str(time.time()) + '.csv'

    # Explicit UTF-8 so non-ASCII text (emoji, accents) cannot raise
    # UnicodeEncodeError under a platform default such as cp1252.
    with open(filename, 'w', encoding='utf-8') as file:
        for item in data:
            # Remove tabs and newlines, then collapse remaining whitespace
            # runs into a single space.
            text = item.text.replace('\t', '').replace('\n', '')
            text = re.sub(r'\s+', ' ', text)
            # Double any embedded quotes so the quoted field stays valid CSV.
            text = text.replace('"', '""')
            # Each row is preceded by a newline (the file therefore starts
            # with an empty line); kept for compatibility with consumers.
            file.write('\n"' + text + '"')
    # The ``with`` block closes the file; no explicit close() is needed.


if __name__ == '__main__':
    print("This module cannot be run directly. Please run main.py instead.")
def
main(html, outdir):
def main(html, outdir):
    """
    BS4 function to extract the comment-body divs.

    Parses ``html``, collects the text of every matching ``div`` and writes
    one double-quoted row per div to ``<outdir>output-<timestamp>.csv``.

    Args:
        html: Markup handed straight to ``BeautifulSoup`` (anything its
            constructor accepts).
        outdir: String prefix for the output file. It is concatenated as-is,
            so it must already end with a path separator if it names a
            directory.

    Returns:
        None. The result is the file written to disk.
    """
    # Parse the HTML content of the file
    soup = BeautifulSoup(html, 'html.parser')

    # Find the element(s) in the HTML file you want to extract data from.
    # NOTE(review): a multi-class string is expected to match only divs whose
    # class attribute is exactly this string -- confirm against the bs4 docs.
    data = soup.find_all(
        'div', {'class': 'x11i5rnm xat24cr x1mh8g0r x1vvkbs xdj266r'})

    # Write the extracted data to a file whose name carries the timestamp.
    filename = outdir + 'output-' + str(time.time()) + '.csv'

    # Explicit UTF-8 so non-ASCII text (emoji, accents) cannot raise
    # UnicodeEncodeError under a platform default such as cp1252.
    with open(filename, 'w', encoding='utf-8') as file:
        for item in data:
            # Remove tabs and newlines, then collapse remaining whitespace
            # runs into a single space.
            text = item.text.replace('\t', '').replace('\n', '')
            text = re.sub(r'\s+', ' ', text)
            # Double any embedded quotes so the quoted field stays valid CSV.
            text = text.replace('"', '""')
            # Each row is preceded by a newline (the file therefore starts
            # with an empty line); kept for compatibility with consumers.
            file.write('\n"' + text + '"')
    # The ``with`` block closes the file; no explicit close() is needed.
BS4 function to extract the comment-body divs