2 changes: 2 additions & 0 deletions search_engine_results/.gitignore
@@ -0,0 +1,2 @@
__pycache__/
config.py
@@ -0,0 +1,242 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#!/usr/bin/env python\n",
"# coding: utf-8\n",
"\n",
"import pathlib\n",
"import pprint\n",
"import json\n",
"import re\n",
"import requests\n",
"import time\n",
"import config\n",
"import argparse\n",
"import logging\n",
"import urllib.parse\n",
"import csv\n",
"from datetime import datetime\n",
"\n",
"\n",
"ENDPT = 'https://web.archive.org/save/'\n",
"UA_STRING = config.UA_STRING\n",
"ACCESS_KEY = config.ACCESS_KEY\n",
"SECRET_KEY = config.SECRET_KEY\n",
"HEADERS = {'Accept':'application/json',\n",
" 'User-Agent': UA_STRING,\n",
" 'Authorization': f'LOW {ACCESS_KEY}:{SECRET_KEY}'}\n",
"IF_NOT_ARCHIVED_WITHIN = '20h' # If an archive has been made in this long, don't make another one"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def main():\n",
" parser = argparse.ArgumentParser(description='Creates job ')\n",
" parser.add_argument('-i', help='Input directory with metadata files')\n",
" parser.add_argument('-o', help='Location to save job id file')\n",
" ## TODO: Maybe switch this so default is to ignore?\n",
" parser.add_argument('--ignore_self_links', help='Whether to ignore links from the same domain as the query',\n",
" action='store_true')\n",
"\n",
" args = parser.parse_args()\n",
"\n",
" \n",
" # Make a list of the files that we are going to be editing (skip those already edited)\n",
" files = pathlib.Path(args.i).glob('**/*.json')\n",
" ## FOR TESTING ONLY!!!\n",
" #files = list(files)[10:11]\n",
" archive_files(files, args.o, args.ignore_self_links)\n",
" \n",
"def archive_files(files, output_file, ignore_self_links):\n",
" \n",
" \n",
" def get_urls_to_archive(fn):\n",
" '''Takes a file, gets the urls to archive, and passes them to the archive_url function'''\n",
" with open(filename, 'r') as f:\n",
" j_obj = json.load(f)\n",
" # Get the URLs from the file\n",
" query_url, link_urls = get_urls_from_json(j_obj)\n",
" # Filter out the self links and search engine cache urls\n",
" link_urls = filter_link_urls(query_url, link_urls)\n",
" \n",
" with open(output_file, 'w') as out_file:\n",
" f = csv.writer(out_file)\n",
" # Get outlinks for the query URL. This gets these jobs started early, so some will\n",
" # hopefully be done by the time we make the calls\n",
" query_job = archive_url(query_url, capture_outlinks=1)\n",
" store_job_id(f, query_url, query_job)\n",
" for url in link_urls:\n",
" job_id = archive_url(url)\n",
" store_job_id(f, url, job_id)\n",
" \n",
" def store_job_id(f, url, job_id):\n",
" '''Writes the result of an archive operation to a csv file (f) and the complete_urls dict'''\n",
" time = datetime.now()\n",
" f.writerow([time, url, job_id])\n",
" completed_urls[url] = job_id\n",
" \n",
" def filter_link_urls(query_url,\n",
" urls,\n",
" remove_cache=True):\n",
" '''\n",
" Takes link urls and filters them in three ways:\n",
" 1. (Optionally) Ignores urls from the two caches:\n",
" webcache.googleusercontent.com\n",
" https://cc.bingj\n",
" 2. Filters out those which are in the completed_urls dictionary\n",
" 3. (Optionally) Identifies URLs which have the same domain as the query URL.\n",
" Checks the skipped_urls list to see if the URL already appears there. If so, we assume\n",
" that we want it archived and move it from skipped to the to_archive list\n",
" '''\n",
" to_archive = []\n",
" if ignore_self_links:\n",
" domain = get_domain(query_url)\n",
" else:\n",
" domain = None\n",
" cache_regex = r'https://webcache.googleusercontent.com|https://cc.bingj.com'\n",
" for url in urls:\n",
" if url in completed_urls:\n",
" continue\n",
"\n",
" if remove_cache == True:\n",
" if re.match(cache_regex, url):\n",
" continue\n",
"\n",
" if ignore_self_links and re.match(f'https?://\\w*\\.?{domain}', url):\n",
" # If it matches, check if it's in skipped URLs\n",
" # If so, remove it from there, and add it to the to_archive list\n",
" if url in skipped_urls:\n",
" to_archive.append(url)\n",
" skipped_urls.remove(url)\n",
" # Else, add it to the skipped urls (and skip it)\n",
" else:\n",
" skipped_urls.append(url)\n",
" else:\n",
" to_archive.append(url)\n",
" return to_archive\n",
" \n",
" \n",
" completed_urls = dict_from_csv(output_file)\n",
" skipped_urls = []\n",
" attempts = 0\n",
" incomplete_files = list(files)\n",
" while len(incomplete_files) > 0:\n",
" if attempts == 3:\n",
" break\n",
" for fn in incomplete_files:\n",
" try:\n",
" archive_urls(fn)\n",
" incomplete_files.pop(incomplete_files.index(fn)) # if it works, remove it from the list\n",
" except ConnectionError:\n",
" failed_files.append(fn)\n",
" attempts += 1\n",
" logging.warn('Files that failed: {}'.format(incomplete_files))\n",
" time.sleep(30) # If something goes wrong, wait to see if it gets better :)\n",
"\n",
"\n",
"def dict_from_csv(csv_file):\n",
" result = {}\n",
" if pathlib.Path(csv_file).exists():\n",
" with open(csv_file, 'r') as fn:\n",
" f = csv.reader(fn)\n",
" for row in f:\n",
" result[row[0]] = row[1]\n",
" return result \n",
"\n",
" \n",
"def get_domain(url):\n",
" domain = re.search('^https://www.(\\w+\\.\\w+)', url).groups()[0]\n",
" if not domain:\n",
" raise ValueError(\"Can't find URL in {url}\")\n",
" return domain\n",
"\n",
"\n",
"\n",
"def get_urls_from_json(j_obj):\n",
" '''Takes a JSON object and extracts the correct URLs; returns them in a list.'''\n",
" query_url = urlencode_url(j_obj['link'])\n",
" link_urls = []\n",
" \n",
" for x in j_obj['linkElements']:\n",
" url = x['href']\n",
" if re.match('javascript', url) or url == '':\n",
" continue\n",
" link_urls.append(urlencode_url(url))\n",
" return (query_url, link_urls)\n",
" \n",
"def urlencode_url(url):\n",
" return requests.utils.requote_uri(urllib.parse.unquote_plus(url))\n",
"\n",
"def archive_url(url, \n",
" wait = 2, \n",
" capture_outlinks = 0 # Whether to capture outlinks (default is no)\n",
" ):\n",
"\n",
"\n",
"\n",
" payload = {'url': url,\n",
" 'if_not_archived_within' : IF_NOT_ARCHIVED_WITHIN,\n",
" #'capture_screenshot': capture_screenshot,\n",
" 'capture_outlinks': capture_outlinks\n",
" }\n",
" r = requests.post(ENDPT, headers=HEADERS, data=payload)\n",
" logging.debug(r.content)\n",
" print(f'Should have a valid job id for {url}. Instead, this was returned:\\n {r.content}')\n",
"\n",
" if r.status_code == 429:\n",
" logging.info(f'Hit rate limit, now waiting for {wait:.2f} seconds')\n",
" time.sleep(wait)\n",
" return archive_url(url = url,\n",
" wait = wait * 1.2, \n",
" capture_outlinks = capture_outlinks)\n",
" if r.status_code in [104,502,503,504,443,401]:\n",
" logging.warning(url)\n",
" logging.warning(r.text)\n",
" if r.status_code in [104, 401, 443]:\n",
" logging.warning(f'104, 401, or 443 received when archiving {url}. Giving up.')\n",
" return None\n",
" logging.warning('502 or 503 or 504 status received; waiting 30 seconds')\n",
" time.sleep(30)\n",
" return archive_url(url = url,\n",
" capture_outlinks = capture_outlinks)\n",
" \n",
" r.raise_for_status()\n",
" try:\n",
" return r.json()['job_id']\n",
" except KeyError:\n",
" logging.warning(f'Should have a valid job id for {url}. Instead, this was returned:\\n {r.content}')\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "py3",
"language": "python",
"name": "py3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
86 changes: 86 additions & 0 deletions search_engine_results/wayback_urls/add_wayback_urls.py
@@ -0,0 +1,86 @@
#!/usr/bin/env python
# coding: utf-8

import pathlib
import json
import re
import time
import argparse
import logging
import csv
import requests
import urllib.parse



def main():
parser = argparse.ArgumentParser(description='Add wayback URLs to SERP metadata.')
parser.add_argument('-i', help='Input directory with metadata files')
parser.add_argument('-w', help = 'Location of file with wayback URLS')
parser.add_argument('-o', help='Directory to save modified files (if blank or same as input directory, will overwrite)')

args = parser.parse_args()


# Make a list of the files that we are going to be editing (skip those already edited)
files = pathlib.Path(args.i).glob('**/*.json')
wayback_dict = load_wayback_dict(args.w)
for fn in files:
write_wayback_to_file(fn, args.o, wayback_dict)


def write_wayback_to_file(filename, out_dir, wayback_dict):
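    '''Adds a wayback_url field to the query and to each link element of a metadata file, then writes the result to out_dir.'''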
with open(filename, 'r') as f:
j_obj = json.load(f)
query_url = urlencode_url(j_obj['link'])
try:
wayback_url = wayback_dict[query_url]
j_obj['wayback_url'] = wayback_url
except KeyError:
logging.error(f"Should have an entry for {query_url}")
logging.error(wayback_dict.keys())
j_obj['wayback_url'] = ''
    for link_obj in j_obj['linkElements']:
        link_url = urlencode_url(link_obj['href'])
        if link_url == '':
            link_obj['wayback_url'] = ''
            continue
        try:
            link_obj['wayback_url'] = wayback_dict[link_url]
        except KeyError:
            logging.info(f'No WB URL for {link_url}')
            link_obj['wayback_url'] = ''
outfile = get_out_path(filename, out_dir)
with open(outfile, 'w') as f:
json.dump(j_obj, f)


def get_out_path(fp, out_dir):
    '''Keeps the last directory and the file name, rooted at out_dir (or returns fp unchanged if no out_dir is given)'''
    if out_dir is None:
        return fp
    else:
        new_path = pathlib.Path(out_dir).joinpath(*fp.parts[-2:])
        if not new_path.parent.exists():
            logging.warning(f"Creating new directory at {new_path.parent}")
            new_path.parent.mkdir(parents=True)
        return new_path

def load_wayback_dict(fn):
    '''Loads the wayback URL file as a dictionary of {orig_url: wb_url}. Currently ignores
    the timestamp, overwriting older WB URLs with newer ones'''
result = {}
if pathlib.Path(fn).exists():
with open(fn, 'r') as f_obj:
f = csv.reader(f_obj)
for row in f:
result[row[1]] = row[2]
return result


def urlencode_url(url):
return requests.utils.requote_uri(urllib.parse.unquote_plus(url))

if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
main()
