|
| 1 | +#!/usr/bin/env python |
| 2 | + |
from argparse import ArgumentParser
from collections import deque
import sys
import urllib
import urllib.error
import urllib.request

from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import networkx as nx
| 10 | + |
def show_links(soup, out=sys.stderr):
    """Report the page title and every hyperlink target found in *soup*.

    soup -- a parsed page exposing ``.title.string`` and ``.find_all("a")``
            (e.g. a BeautifulSoup document)
    out  -- writable text stream for the report (defaults to stderr)

    Anchors without an ``href`` attribute are silently skipped.
    """
    print("Opened start page '{0}'".format(soup.title.string), file=out)
    hrefs = (anchor.get('href') for anchor in soup.find_all("a"))
    for href in hrefs:
        if href:
            print('\t{0}'.format(href), file=out)
| 18 | + |
def process_page(pages_to_do, pages_done, max_level, graph, verbose=False):
    """Pop one URL off the work queue, scrape its links, and grow the graph.

    Parameters
    ----------
    pages_to_do : collections.deque
        Queue of ``(url, level)`` pairs still to visit; consumed from the left,
        extended on the right with newly discovered absolute links.
    pages_done : set
        URLs already visited; updated in place.
    max_level : int
        Maximum link depth to follow.
    graph : object with an ``add_edge(u, v)`` method (e.g. networkx.Graph)
        Receives one edge per hyperlink found.
    verbose : bool
        When true, print each processed URL and its depth to stdout.

    Fetch failures are reported as a warning on stderr and otherwise ignored.
    """
    if not pages_to_do:
        return
    page_url, level = pages_to_do.popleft()
    # Skip pages beyond the requested depth, and pages already visited:
    # the same URL can be queued several times before it is first processed,
    # so the dedup check must happen here as well as at enqueue time.
    if level > max_level or page_url in pages_done:
        return
    if verbose:
        print('{0}: {1}'.format(page_url, level))
    pages_done.add(page_url)
    try:
        page = urllib.request.urlopen(page_url)
        soup = BeautifulSoup(page, 'html5lib')
        for link in soup.find_all("a"):
            href = link.get('href')
            # Follow only absolute http(s) links; relative links are ignored.
            if href and href.startswith('http'):
                if href not in pages_done:
                    pages_to_do.append((href, level + 1))
                graph.add_edge(page_url, href)
    except urllib.error.URLError:
        # URLError also covers HTTPError (its subclass), so one handler
        # suffices; this is a best-effort crawl, so warn and move on.
        print('# warning: can not handle {0}'.format(page_url),
              file=sys.stderr)
| 42 | + |
if __name__ == '__main__':
    # Command-line interface: a starting URL plus depth/output/verbosity flags.
    parser = ArgumentParser(description='create graph of hyperlinks')
    parser.add_argument('url', help='URL to start link analysis at')
    parser.add_argument('--max-level', type=int, default=3,
                        help='maximum link depth')
    parser.add_argument('--out', help='file name for GraphML output')
    parser.add_argument('--verbose', action='store_true',
                        help='give verbose output')
    opts = parser.parse_args()
    # Breadth-first crawl: the deque holds (url, depth) pairs still to visit.
    visited = set()
    work_queue = deque([(opts.url, 0)])
    link_graph = nx.Graph()
    link_graph.add_node(opts.url)
    while work_queue:
        process_page(work_queue, visited, opts.max_level,
                     link_graph, opts.verbose)
    print('total pages scraped: {0}'.format(len(visited)))
    # Either dump GraphML to a file or display the graph interactively.
    if opts.out:
        nx.write_graphml(link_graph, opts.out)
    else:
        nx.draw(link_graph)
        plt.show()
0 commit comments