2 changes: 2 additions & 0 deletions search_engine_results/.gitignore
@@ -0,0 +1,2 @@
__pycache__/
config.py
@@ -0,0 +1,242 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#!/usr/bin/env python\n",
"# coding: utf-8\n",
"\n",
"import pathlib\n",
"import pprint\n",
"import json\n",
"import re\n",
"import requests\n",
"import time\n",
"import config\n",
"import argparse\n",
"import logging\n",
"import urllib.parse\n",
"import csv\n",
"from datetime import datetime\n",
"\n",
"\n",
"ENDPT = 'https://web.archive.org/save/'\n",
"UA_STRING = config.UA_STRING\n",
"ACCESS_KEY = config.ACCESS_KEY\n",
"SECRET_KEY = config.SECRET_KEY\n",
"HEADERS = {'Accept':'application/json',\n",
" 'User-Agent': UA_STRING,\n",
" 'Authorization': f'LOW {ACCESS_KEY}:{SECRET_KEY}'}\n",
"IF_NOT_ARCHIVED_WITHIN = '20h' # If an archive has been made in this long, don't make another one"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def main():\n",
" parser = argparse.ArgumentParser(description='Creates job ')\n",
" parser.add_argument('-i', help='Input directory with metadata files')\n",
" parser.add_argument('-o', help='Location to save job id file')\n",
" ## TODO: Maybe switch this so default is to ignore?\n",
" parser.add_argument('--ignore_self_links', help='Whether to ignore links from the same domain as the query',\n",
" action='store_true')\n",
"\n",
" args = parser.parse_args()\n",
"\n",
" \n",
" # Make a list of the files that we are going to be editing (skip those already edited)\n",
" files = pathlib.Path(args.i).glob('**/*.json')\n",
" ## FOR TESTING ONLY!!!\n",
" #files = list(files)[10:11]\n",
" archive_files(files, args.o, args.ignore_self_links)\n",
" \n",
"def archive_files(files, output_file, ignore_self_links):\n",
" \n",
" \n",
" def get_urls_to_archive(fn):\n",
" '''Takes a file, gets the urls to archive, and passes them to the archive_url function'''\n",
" with open(filename, 'r') as f:\n",
" j_obj = json.load(f)\n",
" # Get the URLs from the file\n",
" query_url, link_urls = get_urls_from_json(j_obj)\n",
" # Filter out the self links and search engine cache urls\n",
" link_urls = filter_link_urls(query_url, link_urls)\n",
" \n",
" with open(output_file, 'w') as out_file:\n",
" f = csv.writer(out_file)\n",
" # Get outlinks for the query URL. This gets these jobs started early, so some will\n",
" # hopefully be done by the time we make the calls\n",
" query_job = archive_url(query_url, capture_outlinks=1)\n",
" store_job_id(f, query_url, query_job)\n",
" for url in link_urls:\n",
" job_id = archive_url(url)\n",
" store_job_id(f, url, job_id)\n",
" \n",
" def store_job_id(f, url, job_id):\n",
" '''Writes the result of an archive operation to a csv file (f) and the complete_urls dict'''\n",
" time = datetime.now()\n",
" f.writerow([time, url, job_id])\n",
" completed_urls[url] = job_id\n",
" \n",
" def filter_link_urls(query_url,\n",
" urls,\n",
" remove_cache=True):\n",
" '''\n",
" Takes link urls and filters them in three ways:\n",
" 1. (Optionally) Ignores urls from the two caches:\n",
" webcache.googleusercontent.com\n",
" https://cc.bingj\n",
" 2. Filters out those which are in the completed_urls dictionary\n",
" 3. (Optionally) Identifies URLs which have the same domain as the query URL.\n",
" Checks the skipped_urls list to see if the URL already appears there. If so, we assume\n",
" that we want it archived and move it from skipped to the to_archive list\n",
" '''\n",
" to_archive = []\n",
" if ignore_self_links:\n",
" domain = get_domain(query_url)\n",
" else:\n",
" domain = None\n",
" cache_regex = r'https://webcache.googleusercontent.com|https://cc.bingj.com'\n",
" for url in urls:\n",
" if url in completed_urls:\n",
" continue\n",
"\n",
" if remove_cache == True:\n",
" if re.match(cache_regex, url):\n",
" continue\n",
"\n",
" if ignore_self_links and re.match(f'https?://\\w*\\.?{domain}', url):\n",
" # If it matches, check if it's in skipped URLs\n",
" # If so, remove it from there, and add it to the to_archive list\n",
" if url in skipped_urls:\n",
" to_archive.append(url)\n",
" skipped_urls.remove(url)\n",
" # Else, add it to the skipped urls (and skip it)\n",
" else:\n",
" skipped_urls.append(url)\n",
" else:\n",
" to_archive.append(url)\n",
" return to_archive\n",
" \n",
" \n",
" completed_urls = dict_from_csv(output_file)\n",
" skipped_urls = []\n",
" attempts = 0\n",
" incomplete_files = list(files)\n",
" while len(incomplete_files) > 0:\n",
" if attempts == 3:\n",
" break\n",
" for fn in incomplete_files:\n",
" try:\n",
" archive_urls(fn)\n",
" incomplete_files.pop(incomplete_files.index(fn)) # if it works, remove it from the list\n",
" except ConnectionError:\n",
" failed_files.append(fn)\n",
" attempts += 1\n",
" logging.warn('Files that failed: {}'.format(incomplete_files))\n",
" time.sleep(30) # If something goes wrong, wait to see if it gets better :)\n",
"\n",
"\n",
"def dict_from_csv(csv_file):\n",
" result = {}\n",
" if pathlib.Path(csv_file).exists():\n",
" with open(csv_file, 'r') as fn:\n",
" f = csv.reader(fn)\n",
" for row in f:\n",
" result[row[0]] = row[1]\n",
" return result \n",
"\n",
" \n",
"def get_domain(url):\n",
" domain = re.search('^https://www.(\\w+\\.\\w+)', url).groups()[0]\n",
" if not domain:\n",
" raise ValueError(\"Can't find URL in {url}\")\n",
" return domain\n",
"\n",
"\n",
"\n",
"def get_urls_from_json(j_obj):\n",
" '''Takes a JSON object and extracts the correct URLs; returns them in a list.'''\n",
" query_url = urlencode_url(j_obj['link'])\n",
" link_urls = []\n",
" \n",
" for x in j_obj['linkElements']:\n",
" url = x['href']\n",
" if re.match('javascript', url) or url == '':\n",
" continue\n",
" link_urls.append(urlencode_url(url))\n",
" return (query_url, link_urls)\n",
" \n",
"def urlencode_url(url):\n",
" return requests.utils.requote_uri(urllib.parse.unquote_plus(url))\n",
"\n",
"def archive_url(url, \n",
" wait = 2, \n",
" capture_outlinks = 0 # Whether to capture outlinks (default is no)\n",
" ):\n",
"\n",
"\n",
"\n",
" payload = {'url': url,\n",
" 'if_not_archived_within' : IF_NOT_ARCHIVED_WITHIN,\n",
" #'capture_screenshot': capture_screenshot,\n",
" 'capture_outlinks': capture_outlinks\n",
" }\n",
" r = requests.post(ENDPT, headers=HEADERS, data=payload)\n",
" logging.debug(r.content)\n",
" print(f'Should have a valid job id for {url}. Instead, this was returned:\\n {r.content}')\n",
"\n",
" if r.status_code == 429:\n",
" logging.info(f'Hit rate limit, now waiting for {wait:.2f} seconds')\n",
" time.sleep(wait)\n",
" return archive_url(url = url,\n",
" wait = wait * 1.2, \n",
" capture_outlinks = capture_outlinks)\n",
" if r.status_code in [104,502,503,504,443,401]:\n",
" logging.warning(url)\n",
" logging.warning(r.text)\n",
" if r.status_code in [104, 401, 443]:\n",
" logging.warning(f'104, 401, or 443 received when archiving {url}. Giving up.')\n",
" return None\n",
" logging.warning('502 or 503 or 504 status received; waiting 30 seconds')\n",
" time.sleep(30)\n",
" return archive_url(url = url,\n",
" capture_outlinks = capture_outlinks)\n",
" \n",
" r.raise_for_status()\n",
" try:\n",
" return r.json()['job_id']\n",
" except KeyError:\n",
" logging.warning(f'Should have a valid job id for {url}. Instead, this was returned:\\n {r.content}')\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "py3",
"language": "python",
"name": "py3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
86 changes: 86 additions & 0 deletions search_engine_results/wayback_urls/add_wayback_urls.py
@@ -0,0 +1,86 @@
#!/usr/bin/env python
# coding: utf-8

import pathlib
import json
import re
import time
import argparse
import logging
import csv
import requests
import urllib.parse



def main():
parser = argparse.ArgumentParser(description='Add wayback URLs to SERP metadata.')
parser.add_argument('-i', help='Input directory with metadata files')
parser.add_argument('-w', help = 'Location of file with wayback URLS')
parser.add_argument('-o', help='Directory to save modified files (if blank or same as input directory, will overwrite)')

args = parser.parse_args()


# Make a list of the files that we are going to be editing (skip those already edited)
files = pathlib.Path(args.i).glob('**/*.json')
wayback_dict = load_wayback_dict(args.w)
for fn in files:
write_wayback_to_file(fn, args.o, wayback_dict)


def write_wayback_to_file(filename, out_dir, wayback_dict):
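    '''Adds a wayback_url field to the query and to each link element of a metadata file, then writes the result to out_dir.'''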
with open(filename, 'r') as f:
j_obj = json.load(f)
query_url = urlencode_url(j_obj['link'])
try:
wayback_url = wayback_dict[query_url]
j_obj['wayback_url'] = wayback_url
except KeyError:
logging.error(f"Should have an entry for {query_url}")
logging.error(wayback_dict.keys())
j_obj['wayback_url'] = ''
    for link_obj in j_obj['linkElements']:
        link_url = urlencode_url(link_obj['href'])
        if link_url == '':
            link_obj['wayback_url'] = ''
            continue
        try:
            link_obj['wayback_url'] = wayback_dict[link_url]
        except KeyError:
            logging.info(f'No WB URL for {link_url}')
            link_obj['wayback_url'] = ''
outfile = get_out_path(filename, out_dir)
with open(outfile, 'w') as f:
json.dump(j_obj, f)


def get_out_path(fp, out_dir):
    '''Keeps the last directory and the file name, rooted at out_dir (or returns fp unchanged if no out_dir is given)'''
    if out_dir is None:
        return fp
    else:
        new_path = pathlib.Path(out_dir).joinpath(*fp.parts[-2:])
        if not new_path.parent.exists():
            logging.warning(f"Creating new directory at {new_path.parent}")
            new_path.parent.mkdir(parents=True)
        return new_path

def load_wayback_dict(fn):
    '''Loads the wayback URL file as a dictionary of {orig_url: wb_url}. Currently ignores
    the timestamp, overwriting older WB URLs with newer ones'''
result = {}
if pathlib.Path(fn).exists():
with open(fn, 'r') as f_obj:
f = csv.reader(f_obj)
for row in f:
result[row[1]] = row[2]
return result


def urlencode_url(url):
return requests.utils.requote_uri(urllib.parse.unquote_plus(url))

if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
main()
