
Commit 66ab44e

add initial link scraping functionality and documentation
1 parent 12754d3 commit 66ab44e

File tree

7 files changed: 504 additions & 0 deletions


ci-linkscraping/downloadpages.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
# Download Pages
#
# Purpose:
# Scrape pages from a list of URLs.
# Takes a list of URLs, reads each site, then saves it as an HTML file.
#
# Created by Richie Atkinson, Callaghan Innovation, 2024
# v0.1
#
# Free to a good home.

# Import libraries
import os
import requests
import time
from bs4 import BeautifulSoup

## Variables to set
# Pick a folder to write the files to
data_dir = "html"

# List of URLs to download for indexing. We're manually pasting here to make sure there's some human oversight.
# For a production environment we probably wouldn't use this method, or we'd have a more robust way of managing the list.
# URLs should be complete, and each line must start with " and end with ",
urls = [
    "https://www.callaghaninnovation.govt.nz",
    "https://www.callaghaninnovation.govt.nz/products",
]

# Check the folder to store stuff exists, and if not, make it
os.makedirs(data_dir, exist_ok=True)

# Loop through each URL in the list
for i in range(len(urls)):
    response = requests.get(urls[i])

    # Check if the request was successful, then parse the HTML content
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        # Clean it up in case of messy, confusing, or inconsistent formatting
        text = soup.prettify()
        # Create a file path for saving the HTML content, then write the content out to a file
        text_file = os.path.join(data_dir, urls[i].split("/")[-1] + ".html")
        with open(text_file, "w") as file:
            file.write(text)
        # Sleep for 30 seconds to avoid overwhelming the server
        time.sleep(30)
    else:
        # Print an error message if the request failed
        print(f"Failed to retrieve {urls[i]}. Status code: {response.status_code}")

ci-linkscraping/findlinks.py

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
# Find Online Links
#
# Purpose:
# Pull links from a webpage. Best used on the main page of a site.
#
# Outputs to screen and to a text file.
#
# Created by Richie Atkinson, Callaghan Innovation, 2024
# v0.1
#
# Free to a good home.

## Import libraries
import requests
from bs4 import BeautifulSoup
import os
import re

## Variables to set
# URL we want to find links from - we don't need the trailing /
url_to_scrape = "https://www.callaghaninnovation.govt.nz"
# Pick the data directory to store the file in
data_dir = "/home/user/listsoflinks"


# Function to extract the HTML document from a given URL
def getHTMLdocument(url):
    # Request the HTML document of the given URL
    response = requests.get(url)
    # The response is returned as text
    return response.text


def sanitize_filename(url):
    # Remove 'http://' or 'https://'
    filename = re.sub(r"^https?://", "", url)
    # Remove any characters not allowed in filenames
    filename = re.sub(r"[^\w\-_\.]", "_", filename)
    return filename


# Check the folder to store stuff exists, and if not, make it
os.makedirs(data_dir, exist_ok=True)

# Create an HTML document to parse through
html_document = getHTMLdocument(url_to_scrape)

# Create soup object with the HTML doc
soup = BeautifulSoup(html_document, "html.parser")

# Find all the anchor (a) tags with "href"
for link in soup.find_all(
    "a",
    # Uncomment below to add the base URL to the search - this is useful if the page is using anchor (a) tags for stuff that you won't want to capture
    # attrs={'href': re.compile("^https://www.callaghaninnovation.govt.nz")}
):
    ## Compile the output
    output = link.get("href")
    # Skip anchor tags that don't actually carry an href attribute
    if output is None:
        continue
    # Strip the trailing / -- the other script doesn't like trailing slashes
    outputstrip = output.rstrip("/")
    # Print the actual URL encapsulated with "",
    output = f'"{url_to_scrape}{outputstrip}",'
    print(output)

    # Use the URL to create a filename and save it to the desired directory
    filename = sanitize_filename(url_to_scrape)
    file_path = os.path.join(data_dir, f"{filename}.txt")

    # Write to file
    with open(file_path, "a") as file:
        file.write(output + "\n")
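The lines this script prints (and appends to the .txt file) are already in the "url", form that the urls list in downloadpages.py expects, so they can be pasted straight in. If you did want to load a saved list programmatically rather than pasting by hand (which downloadpages.py deliberately avoids in favour of human oversight), a minimal sketch might look like this; load_urls is a hypothetical helper and not part of the scripts in this commit.

# Hypothetical helper - not part of the scripts in this commit.
# Reads a list file produced by findlinks.py (one "url", entry per line)
# back into a plain Python list of URL strings.
import ast

def load_urls(path):
    with open(path) as f:
        # Strip whitespace and the trailing comma, then evaluate the quoted string literal
        return [ast.literal_eval(line.strip().rstrip(",")) for line in f if line.strip()]

# Example (hypothetical path):
# urls = load_urls("/home/user/listsoflinks/www.callaghaninnovation.govt.nz.txt")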

ci-linkscraping/findlinkslocal.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
# Find Local Links
#
# Purpose:
# Pull links from a locally saved HTML file.
# Specific use-case: sites with complex JS that can't be scraped by usual methods.
# Best used on the main page of a site.
#
# You can parse multiple HTML files at once by saving them all to the input directory.
#
# Outputs to screen and to a text file.
#
# Created by Richie Atkinson, Callaghan Innovation, 2024
# v0.1
#
# Free to a good home.

## Import libraries
from bs4 import BeautifulSoup
import os
import re  # only needed if you uncomment the href filter below

## Variables to set
# Set the input directory you've saved your file(s) in
input_directory = "/home/user/unparsed"
# Base URL for links - this is usually required for sites which use relative links and a <link rel="canonical" href="https://www.callaghaninnovation.govt.nz/"/>-type meta tag
# This can be blank if the site uses absolute URLs. You won't need the trailing slash.
base_url = "https://www.callaghaninnovation.govt.nz"
# Set a folder to store the output to
data_dir = "/home/user/listsoflinks"

# Check the folder to store outputs exists, and if not, make it
os.makedirs(data_dir, exist_ok=True)

# Collect all the files present in the input directory:
for filename in os.listdir(input_directory):

    # Check files have an HTML extension. Ignore if not.
    if filename.endswith(".html"):

        # Join filename and path to get an explicit path
        fname = os.path.join(input_directory, filename)
        print("Current file name ..", os.path.abspath(fname))

        # Open the file to begin operations
        with open(fname, "r") as file:
            # Create soup object
            soup = BeautifulSoup(file.read(), "html.parser")

        # Parse the HTML as you wish
        for link in soup.find_all(
            "a",
            # Uncomment below to add the base URL to the search - this is useful if the page is using anchor (a) tags for stuff that you won't want to capture
            # attrs={'href': re.compile("^https://www.callaghaninnovation.govt.nz")}
        ):
            ## Compile the output
            output = link.get("href")
            # Skip anchor tags that don't actually carry an href attribute
            if output is None:
                continue
            # Strip the trailing / -- the other script doesn't like trailing slashes
            outputstrip = output.rstrip("/")
            # Print the actual URL encapsulated with "",
            output = f'"{base_url}{outputstrip}",'
            print(output)
            # Create a filename for saving the output
            out_filename = filename.split(".")[0]
            # Create a file path for saving the output
            file_path = os.path.join(data_dir, f"{out_filename}.txt")
            # Write to file
            with open(file_path, "a") as file:
                file.write(output + "\n")
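Because both link-finding scripts open their output files in append mode, re-running them (or pages that repeat the same link) will produce duplicate lines. If that becomes a nuisance, an order-preserving de-duplication pass over a list file might look like the sketch below; dedupe_list_file is a hypothetical helper and not part of this commit.

# Hypothetical helper - not part of the scripts in this commit.
# Removes duplicate lines from a link list file while preserving order.
def dedupe_list_file(path):
    with open(path) as f:
        lines = f.readlines()
    seen = set()
    unique = []
    for line in lines:
        if line not in seen:
            seen.add(line)
            unique.append(line)
    with open(path, "w") as f:
        f.writelines(unique)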
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
# Below is a high-level list of pages we've indexed.

You can find detailed lists above - note that some pages were not indexed programmatically and will not be included here.

| Page | Description |
| --- | --- |
| [Ministry of Primary Industries – Funding and rural support](https://mpi.govt.nz) | A selection of MPI pages on support for agricultural businesses |
| [Work and Income – business support](https://www.workandincome.govt.nz) | A selection of Work and Income NZ pages focussing on support for small businesses |
| [Te Puni Kokiri – Māori Enterprise](https://tpk.govt.nz) | Te Puni Kokiri pages with information for Māori businesses |
| [New Zealand Trade and Enterprise](https://www.nzte.govt.nz) | A selection of NZTE pages targeting small businesses in New Zealand |
| [Business.govt.nz](https://www.business.govt.nz) | Small business information from business.govt.nz |
| [Companies Register](https://companiesoffice.govt.nz) | Information on registering your business |
| [Research & Development Tax Incentive](https://rdti.govt.nz) | Information on the Research & Development Tax Incentive |
| [Callaghan Innovation](https://callaghaninnovation.govt.nz) | All content from the Callaghan Innovation website |
| [HealthTech Activator](https://callaghaninnovation.govt.nz) | Information for businesses in HealthTech |
| [Hon Judith Collins KC - Beehive.govt.nz](https://www.beehive.govt.nz) | The Beehive profile page for our Minister of Technology Hon Judith Collins KC |
| [Public Service Commission - Central Government Organisations](https://publicservice.govt.nz) | A list of all central government organisations in New Zealand |
| [Web3NZ](https://web3nz.com) | Information on the Web3NZ community |
| [Kitmap](https://kitmap.govt.nz) | Information on scientific infrastructure and resources available for R&D in New Zealand |
| [MSD Connected site](https://connected.govt.nz) | Support for mahi and training for small businesses |
| [Employment NZ](https://employment.govt.nz) | Information for employers and employees |

0 commit comments
