
Commit 86e50df

Add example code for Beautiful Soup
1 parent 8bfc213 commit 86e50df

3 files changed: 75 additions & 0 deletions


source-code/web-scraping/.gitignore

Lines changed: 1 addition & 0 deletions
*.graphml

source-code/web-scraping/README.md

Lines changed: 9 additions & 0 deletions
# WebScraping

Web scraping can be quite useful to gather data that is not available
through an API. Here, some sample code is provided for Beautiful Soup,
a web scraping library that is easy to use.

## What is it?

1. `link_web.py`: script that uses Beautiful Soup and NetworkX to
   create a graph representing the links between web pages, starting
   from a given page.
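
To give a feel for the library, here is a minimal sketch (not part of
this commit) of how Beautiful Soup is typically used; it parses an HTML
snippet with the standard library's `html.parser` backend rather than
the `html5lib` parser the script below relies on:

# Parse an HTML snippet and list the link targets it contains.
from bs4 import BeautifulSoup

html = '<a href="https://example.com">example</a> <a>no target</a>'
soup = BeautifulSoup(html, 'html.parser')
for link in soup.find_all('a'):
    href = link.get('href')  # None when the <a> tag has no href attribute
    if href:
        print(href)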

source-code/web-scraping/link_web.py

Lines changed: 65 additions & 0 deletions
#!/usr/bin/env python

from argparse import ArgumentParser
from collections import deque
import sys
import urllib.error
import urllib.request

from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import networkx as nx


def show_links(soup, out=sys.stderr):
    print("Opened start page '{0}'".format(soup.title.string), file=out)
    links = soup.find_all("a")
    for link in links:
        href = link.get('href')
        if href:
            print('\t{0}'.format(href), file=out)


def process_page(pages_to_do, pages_done, max_level, graph, verbose=False):
    if pages_to_do:
        page_url, level = pages_to_do.popleft()
        # A URL can be queued more than once before it is processed, so
        # skip pages that were already scraped, as well as pages deeper
        # than the maximum link depth.
        if page_url in pages_done or level > max_level:
            return
        if verbose:
            print('{0}: {1}'.format(page_url, level))
        pages_done.add(page_url)
        try:
            page = urllib.request.urlopen(page_url)
            soup = BeautifulSoup(page, 'html5lib')
            for link in soup.find_all("a"):
                href = link.get('href')
                # Only follow absolute links.
                if href and href.startswith('http'):
                    if href not in pages_done:
                        pages_to_do.append((href, level + 1))
                    graph.add_edge(page_url, href)
        except (urllib.error.HTTPError, urllib.error.URLError):
            print('# warning: can not handle {0}'.format(page_url),
                  file=sys.stderr)


if __name__ == '__main__':
    arg_parser = ArgumentParser(description='create graph of hyperlinks')
    arg_parser.add_argument('url', help='URL to start link analysis at')
    arg_parser.add_argument('--max-level', type=int, default=3,
                            help='maximum link depth')
    arg_parser.add_argument('--out', help='file name for GraphML output')
    arg_parser.add_argument('--verbose', action='store_true',
                            help='give verbose output')
    options = arg_parser.parse_args()
    pages_done = set()
    pages_to_do = deque()
    pages_to_do.append((options.url, 0))
    graph = nx.Graph()
    graph.add_node(options.url)
    while pages_to_do:
        process_page(pages_to_do, pages_done, options.max_level,
                     graph, options.verbose)
    print('total pages scraped: {0}'.format(len(pages_done)))
    if options.out:
        nx.write_graphml(graph, options.out)
    else:
        nx.draw(graph)
        plt.show()
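
A possible invocation and follow-up analysis, sketched below; the URL
and the output file name are placeholders, not taken from this commit:

# Run, e.g.:
#   python link_web.py 'https://www.example.com' --max-level 2 --out links.graphml
# The GraphML output (which the .gitignore above keeps out of version
# control) can be loaded back for analysis with NetworkX:
import networkx as nx

graph = nx.read_graphml('links.graphml')
print('nodes: {0}, edges: {1}'.format(graph.number_of_nodes(),
                                      graph.number_of_edges()))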
