
Commit 66ab44e

add initial link scraping functionality and documentation
1 parent 12754d3 commit 66ab44e

File tree

7 files changed: 504 additions & 0 deletions


ci-linkscraping/downloadpages.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
# Download Pages
#
# Purpose:
# Scrape pages from a list of URLs.
# Takes a list of URLs, reads each site, then saves it as an HTML file.
#
# Created by Richie Atkinson, Callaghan Innovation, 2024
# v0.1
#
# Free to a good home.

# Import libraries
import os
import requests
import time
from bs4 import BeautifulSoup

## Variables to set
# Pick a folder to write the files to
data_dir = "html"

# List of URLs to download for indexing. We're manually pasting here to make sure there's some human oversight.
# For a production environment we probably wouldn't use this method, or we'd have a more robust way of managing the list.
# URLs should be complete, and each line must start with " and end with ",
urls = [
    "https://www.callaghaninnovation.govt.nz",
    "https://www.callaghaninnovation.govt.nz/products",
]

# Check the folder to store stuff exists, and if not, make it
os.makedirs(data_dir, exist_ok=True)

# Loop through each URL in the list
for i in range(len(urls)):
    response = requests.get(urls[i])

    # Check if the request was successful, then parse the HTML content
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        # Clean it up in case of messy, confusing, or inconsistent formatting
        text = soup.prettify()
        # Create a file path for saving the HTML content, then write the content out to a file
        text_file = os.path.join(data_dir, urls[i].split("/")[-1] + ".html")
        with open(text_file, "w") as file:
            file.write(text)
        # Sleep for 30 seconds to avoid overwhelming the server
        time.sleep(30)
    else:
        # Print an error message if the request failed
        print(f"Failed to retrieve {urls[i]}. Status code: {response.status_code}")

ci-linkscraping/findlinks.py

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
# Find Online Links
#
# Purpose:
# Pull links from a webpage. Best used on the main page of a site.
#
# Outputs to screen and to a text file.
#
# Created by Richie Atkinson, Callaghan Innovation, 2024
# v0.1
#
# Free to a good home.

## Import libraries
import requests
from bs4 import BeautifulSoup
import os
import re

## Variables to set
# URL we want to find links from - we don't need the trailing /
url_to_scrape = "https://www.callaghaninnovation.govt.nz"
# Pick the data directory to store the file in
data_dir = "/home/user/listsoflinks"


# Function to extract the HTML document from a given URL
def getHTMLdocument(url):
    # Request the HTML document of the given URL
    response = requests.get(url)
    # The response is returned as text
    return response.text


def sanitize_filename(url):
    # Remove 'http://' or 'https://'
    filename = re.sub(r"^https?://", "", url)
    # Remove any characters not allowed in filenames
    filename = re.sub(r"[^\w\-_\.]", "_", filename)
    return filename


# Check the folder to store stuff exists, and if not, make it
os.makedirs(data_dir, exist_ok=True)

# Create an HTML document to parse through
html_document = getHTMLdocument(url_to_scrape)

# Create soup object with the HTML doc
soup = BeautifulSoup(html_document, "html.parser")

# Find all the anchor (a) tags with "href"
for link in soup.find_all(
    "a",
    # Uncomment below to add the base URL to the search - this is useful if the page is using anchor (a) tags for stuff that you won't want to capture
    # attrs={'href': re.compile("^https://www.callaghaninnovation.govt.nz")}
):
    ## Compile the output
    output = link.get("href")
    # Skip anchor tags that don't actually carry an href attribute
    if output is None:
        continue
    # Strip the trailing / -- the other script doesn't like trailing slashes
    outputstrip = output.rstrip("/")
    # Print the actual URL encapsulated with "",
    output = f'"{url_to_scrape}{outputstrip}",'
    print(output)

    # Use the URL to create a filename and save it to the desired directory
    filename = sanitize_filename(url_to_scrape)
    file_path = os.path.join(data_dir, f"{filename}.txt")

    # Write to file
    with open(file_path, "a") as file:
        file.write(output + "\n")
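The lines this script prints (and appends to the .txt file) are already in the "url", form that the urls list in downloadpages.py expects, so they can be pasted straight in. If you did want to load a saved list programmatically rather than pasting by hand (which downloadpages.py deliberately avoids in favour of human oversight), a minimal sketch might look like this; load_urls is a hypothetical helper and not part of the scripts in this commit.

# Hypothetical helper - not part of the scripts in this commit.
# Reads a list file produced by findlinks.py (one "url", entry per line)
# back into a plain Python list of URL strings.
import ast

def load_urls(path):
    with open(path) as f:
        # Strip whitespace and the trailing comma, then evaluate the quoted string literal
        return [ast.literal_eval(line.strip().rstrip(",")) for line in f if line.strip()]

# Example (hypothetical path):
# urls = load_urls("/home/user/listsoflinks/www.callaghaninnovation.govt.nz.txt")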

ci-linkscraping/findlinkslocal.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
# Find Local Links
#
# Purpose:
# Pull links from a locally saved HTML file.
# Specific use-case: sites with complex JS that can't be scraped by usual methods.
# Best used on the main page of a site.
#
# You can parse multiple HTML files at once by saving them all to the input directory.
#
# Outputs to screen and to a text file.
#
# Created by Richie Atkinson, Callaghan Innovation, 2024
# v0.1
#
# Free to a good home.

## Import libraries
from bs4 import BeautifulSoup
import os
import re  # only needed if you uncomment the href filter below

## Variables to set
# Set the input directory you've saved your file(s) in
input_directory = "/home/user/unparsed"
# Base URL for links - this is usually required for sites which use relative links and a <link rel="canonical" href="https://www.callaghaninnovation.govt.nz/"/>-type meta tag
# This can be blank if the site uses absolute URLs. You won't need the trailing slash.
base_url = "https://www.callaghaninnovation.govt.nz"
# Set a folder to store the output to
data_dir = "/home/user/listsoflinks"

# Check the folder to store outputs exists, and if not, make it
os.makedirs(data_dir, exist_ok=True)

# Collect all the files present in the input directory:
for filename in os.listdir(input_directory):

    # Check files have an HTML extension. Ignore if not.
    if filename.endswith(".html"):

        # Join filename and path to get an explicit path
        fname = os.path.join(input_directory, filename)
        print("Current file name ..", os.path.abspath(fname))

        # Open the file to begin operations
        with open(fname, "r") as file:
            # Create soup object
            soup = BeautifulSoup(file.read(), "html.parser")

        # Parse the HTML as you wish
        for link in soup.find_all(
            "a",
            # Uncomment below to add the base URL to the search - this is useful if the page is using anchor (a) tags for stuff that you won't want to capture
            # attrs={'href': re.compile("^https://www.callaghaninnovation.govt.nz")}
        ):
            ## Compile the output
            output = link.get("href")
            # Skip anchor tags that don't actually carry an href attribute
            if output is None:
                continue
            # Strip the trailing / -- the other script doesn't like trailing slashes
            outputstrip = output.rstrip("/")
            # Print the actual URL encapsulated with "",
            output = f'"{base_url}{outputstrip}",'
            print(output)
            # Create a filename for saving the output
            out_filename = filename.split(".")[0]
            # Create a file path for saving the output
            file_path = os.path.join(data_dir, f"{out_filename}.txt")
            # Write to file
            with open(file_path, "a") as file:
                file.write(output + "\n")
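Because both link-finding scripts open their output files in append mode, re-running them (or pages that repeat the same link) will produce duplicate lines. If that becomes a nuisance, an order-preserving de-duplication pass over a list file might look like the sketch below; dedupe_list_file is a hypothetical helper and not part of this commit.

# Hypothetical helper - not part of the scripts in this commit.
# Removes duplicate lines from a link list file while preserving order.
def dedupe_list_file(path):
    with open(path) as f:
        lines = f.readlines()
    seen = set()
    unique = []
    for line in lines:
        if line not in seen:
            seen.add(line)
            unique.append(line)
    with open(path, "w") as f:
        f.writelines(unique)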
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
# Below is a high-level list of pages we've indexed.

You can find detailed lists above - note that some pages were not indexed programmatically and will not be included here.

| Page | Description |
| --- | --- |
| [Ministry of Primary Industries – Funding and rural support](https://mpi.govt.nz) | A selection of MPI pages on support for agricultural businesses |
| [Work and Income – business support](https://www.workandincome.govt.nz) | A selection of Work and Income NZ pages focussing on support for small businesses |
| [Te Puni Kokiri – Māori Enterprise](https://tpk.govt.nz) | Te Puni Kokiri pages with information for Māori businesses |
| [New Zealand Trade and Enterprise](https://www.nzte.govt.nz) | A selection of NZTE pages targeting small businesses in New Zealand |
| [Business.govt.nz](https://www.business.govt.nz) | Small business information from business.govt.nz |
| [Companies Register](https://companiesoffice.govt.nz) | Information on registering your business |
| [Research & Development Tax Incentive](https://rdti.govt.nz) | Information on the Research & Development Tax Incentive |
| [Callaghan Innovation](https://callaghaninnovation.govt.nz) | All content from the Callaghan Innovation website |
| [HealthTech Activator](https://callaghaninnovation.govt.nz) | Information for businesses in HealthTech |
| [Hon Judith Collins KC - Beehive.govt.nz](https://www.beehive.govt.nz) | The Beehive profile page for our Minister of Technology Hon Judith Collins KC |
| [Public Service Commission - Central Government Organisations](https://publicservice.govt.nz) | A list of all central government organisations in New Zealand |
| [Web3NZ](https://web3nz.com) | Information on the Web3NZ community |
| [Kitmap](https://kitmap.govt.nz) | Information on scientific infrastructure and resources available for R&D in New Zealand |
| [MSD Connected site](https://connected.govt.nz) | Support for mahi and training for small businesses |
| [Employment NZ](https://employment.govt.nz) | Information for employers and employees |

0 commit comments
