parse_url
#!/usr/local/bin/python3
"""Read a text file of Facebook links and return the ones that validate."""
import re
import sys

# Compiled once at import time instead of being rebuilt on every call.
_URL_PATTERN = re.compile(
    r'^(?:http|ftp)s?://'  # http:// or https://
    # domain...
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$',
    re.IGNORECASE,
)


def _validate_url(url):
    """Return True when *url* is an http(s) link that mentions facebook.com.

    Three checks are applied in order:

    1. the string starts with ``http://`` or ``https://``;
    2. the string contains ``facebook.com``;
    3. the whole string matches the generic URL pattern above.

    NOTE: check 2 is a plain substring test, not a host-name comparison, so
    ``facebook.com`` appearing in the path or query also satisfies it.
    """
    # The previous test required both startswith("http") AND
    # startswith("https"), which silently rejected plain http:// links.
    if not url.startswith(("http://", "https://")):
        return False
    if "facebook.com" not in url:
        return False
    return _URL_PATTERN.match(url) is not None


def _parse_urls():
    """Collect the valid Facebook URLs from the module-level ``lines`` list.

    ``lines`` is populated by ``main`` before this function is called.

    * Lines starting with ``#`` are skipped.
    * A line containing ``m.facebook.com`` aborts the run.
    * A line containing ``facebook.com`` is kept when it validates and
      aborts the run otherwise.
    * Any other line (blank lines included) is ignored.

    Returns:
        The accepted URLs, in file order.

    Raises:
        SystemExit: with status 1 on an unsupported or invalid link.
    """
    urls = []
    for line in lines:
        if line.startswith('#'):
            print("Skipping line with #")
            continue
        if 'm.facebook.com' in line:
            print("Unsupported Link: " + line)
            sys.exit(1)
        if 'facebook.com' in line:
            if not _validate_url(line):
                print("Invalid url: " + line)
                sys.exit(1)
            urls.append(line)
    return urls


def main(input_file):
    """Load *input_file*, parse its URLs and report how many were accepted.

    The file content is stored, one entry per line without the trailing
    newline, in the module-level ``lines`` list that ``_parse_urls`` reads.
    An empty file is not an error: it produces an empty result list.

    Args:
        input_file: path of the text file holding one link per line.

    Returns:
        The list of URLs accepted by ``_parse_urls``.

    Raises:
        SystemExit: with status 1 when the file does not exist, or when
            ``_parse_urls`` rejects a line.
    """
    global lines
    try:
        with open(input_file, 'r') as f:
            lines = [line.rstrip('\n') for line in f]
    except FileNotFoundError:
        print("File not found")
        # Non-zero status so a calling process can detect the failure.
        sys.exit(1)
    # The former ``except IndexError`` handler was removed: nothing in the
    # read above can raise IndexError, so it was unreachable.
    print("Parsing input urls")
    urls = _parse_urls()
    print("Successfully parsed url: " + str(len(urls)))
    return urls


if __name__ == '__main__':
    print("This script is not meant to be run directly")
def
main(input_file):
def main(input_file):
    """Load *input_file*, parse its URLs and report how many were accepted.

    The file content is stored, one entry per line without the trailing
    newline, in the module-level ``lines`` list, and ``_parse_urls`` is then
    called to build the result. An empty file is not an error.

    Args:
        input_file: path of the text file holding one link per line.

    Returns:
        The list returned by ``_parse_urls``.

    Raises:
        SystemExit: with status 1 when the file does not exist.
    """
    import sys  # local import keeps this block self-contained

    global lines
    try:
        with open(input_file, 'r') as f:
            lines = [line.rstrip('\n') for line in f]
    except FileNotFoundError:
        print("File not found")
        # Non-zero status so a calling process can detect the failure.
        sys.exit(1)
    # The former ``except IndexError`` handler was removed: nothing in the
    # read above can raise IndexError, so it was unreachable.
    print("Parsing input urls")
    urls = _parse_urls()
    print("Successfully parsed url: " + str(len(urls)))
    return urls
Call the `_parse_urls` function and print the number of URLs parsed.