parse_url

View Source

 1#!/usr/local/bin/python3
 2import re
 3
 4
 5def _validate_url(url):
 6    """
 7    validate the url, check if it begins with http or https
 8    check for proper domain name
 9    regex to check for valid url
10    """
11
12    return url.startswith("http") and url.startswith("https") and "facebook.com" in url and re.match(
13        r'^(?:http|ftp)s?://'  # http:// or https://
14        # domain...
15        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
16        r'localhost|'  # localhost...
17        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
18        r'(?::\d+)?'  # optional port
19        r'(?:/?|[/?]\S+)$', url, re.IGNORECASE)
20
21
22def _parse_urls():
23    """
24    check each line in the file
25    if it has m.facebook in it,
26    or if it is a valid facebook url
27    then add it to the array
28    """
29    urls = []
30    for line in lines:
31        if line.startswith('#'):
32            print("Skipping line with #")
33            continue
34        elif 'm.facebook.com' in line:
35            print("Unsupported Link: " + line)
36            exit()
37        elif 'facebook.com' in line:
38            if _validate_url(line):
39                urls.append(line)
40            else:
41                print("Invalid url: " + line)
42                exit()
43    return urls
44
45
def main(input_file):
    """
    Read input_file, parse its urls and print how many were accepted.

    Every line of the file has its trailing newline stripped and the result
    is stored in the module-level ``lines`` list, which _parse_urls() reads.

    :param input_file: path of the text file to read, one url per line
    :return: the list of urls returned by _parse_urls()
    :raises SystemExit: with status 1 if the file does not exist or is empty
    """
    global lines
    try:
        with open(input_file, 'r') as f:
            lines = [line.rstrip('\n') for line in f]
    except FileNotFoundError:
        print("File not found")
        # exit() is a site helper and reports success (status 0);
        # raise SystemExit directly with a failure status instead.
        raise SystemExit(1) from None

    # Reading a file never raises IndexError, so the old handler for it was
    # unreachable; detect an empty file explicitly instead.
    if not lines:
        print("File is empty")
        raise SystemExit(1)

    print("Parsing input urls")
    urls = _parse_urls()
    print("Successfully parsed url: " + str(len(urls)))
    return urls
66
67
if __name__ == "__main__":
    # This module is meant to be imported; running it only prints a notice.
    print("This script is not meant to be run directly")

def main(input_file): View Source

def main(input_file):
    """
    Read input_file, parse its urls and print how many were accepted.

    Every line of the file has its trailing newline stripped and the result
    is stored in the module-level ``lines`` list, which _parse_urls() reads.

    :param input_file: path of the text file to read, one url per line
    :return: the list of urls returned by _parse_urls()
    :raises SystemExit: with status 1 if the file does not exist or is empty
    """
    global lines
    try:
        with open(input_file, 'r') as f:
            lines = [line.rstrip('\n') for line in f]
    except FileNotFoundError:
        print("File not found")
        # exit() is a site helper and reports success (status 0);
        # raise SystemExit directly with a failure status instead.
        raise SystemExit(1) from None

    # Reading a file never raises IndexError, so the old handler for it was
    # unreachable; detect an empty file explicitly instead.
    if not lines:
        print("File is empty")
        raise SystemExit(1)

    print("Parsing input urls")
    urls = _parse_urls()
    print("Successfully parsed url: " + str(len(urls)))
    return urls

Call the _parse_urls function and print the number of urls parsed.