-
Notifications
You must be signed in to change notification settings - Fork 28
Expand file tree
/
Copy pathharvest.py
More file actions
152 lines (137 loc) · 6.34 KB
/
harvest.py
File metadata and controls
152 lines (137 loc) · 6.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
from datetime import datetime
from trove_python.trove_core import trove
from trove_python.trove_harvest.harvest import TroveHarvester
import credentials
import time
import csv
from urllib import urlretrieve
import json
import re
import os
# Column order for rows written to data/records.csv (and the dict keys used in save_json).
FIELDS = ['work_id', 'version_id', 'title', 'creator', 'description', 'trove_url', 'repository_url', 'image_url']
# Absolute path of the directory containing this script (used to locate the 'faces' directory).
CURRENT_DIR = os.path.dirname(os.path.realpath(__file__))
# We're creating our own version of the TroveHarvest included in the trove-python package
class UMAHarvester(TroveHarvester):
    """Custom harvester that flattens Trove work records into per-version CSV rows.

    Overrides TroveHarvester.process_results() to write one row per version to
    data/records.csv (column order matching FIELDS) and to download each
    version's thumbnail image into images/.
    """

    def process_results(self, results):
        '''
        Processes one page of results.

        Extracts versions from work-level records and saves them separately,
        one CSV row per version, downloading the thumbnail along the way.
        '''
        works = results[0]['records']['work']
        # We're processing one page of results at a time, so we have to
        # append these results to the CSV, not write a new file.
        with open('data/records.csv', 'ab') as csvfile:
            writer = csv.writer(csvfile)
            for work in works:
                # Unpack the 'versions' (multiple photos can be grouped together).
                for version in work['version']:
                    version_id = version['id'].split()[0]
                    # The actual record metadata is buried a few levels down...
                    # This depends on the record source, so if you're harvesting
                    # from somewhere else, you might need to adjust the path.
                    metadata = version['record'][0]['metadata']['qualifieddc']
                    # There can be multiple descriptions -- join them with '; '.
                    description = '; '.join([text.replace('\n', ' ') for text in metadata['description']])
                    # Collect the two link types BEFORE building the row, so that:
                    #  - columns always land in FIELDS order (the API does not
                    #    guarantee the order of identifiers), and
                    #  - a version without a thumbnail cannot silently reuse the
                    #    previous version's image_url (it was never reset before).
                    repository_url = None
                    image_url = None
                    for link in metadata['identifier']:
                        if link['linktype'] == 'fulltext':
                            # 'fulltext' is a link back to the repository.
                            repository_url = link['value']
                        elif link['linktype'] == 'thumbnail':
                            # 'thumbnail' is a link to the image itself.
                            image_url = link['value']
                    row = [
                        work['id'],
                        version_id,
                        metadata['title'],
                        metadata['creator'],
                        description,
                        # Permanent link back to Trove.
                        'http://trove.nla.gov.au/version/{}'.format(version_id),
                        repository_url,
                        image_url,
                    ]
                    writer.writerow(row)
                    # Save the image (skip versions that have no thumbnail).
                    if image_url:
                        urlretrieve(image_url, 'images/{}.jpg'.format(version_id))
        # This line updates the main harvester so it knows where it's up to.
        self.harvested += self.get_highest_n(results)
        # Parenthesized single-argument print works identically on Py2 and Py3.
        print('Harvested: {}'.format(self.harvested))
        # Be polite to the API between pages.
        time.sleep(0.5)
def save_json():
    '''Takes the records.csv file generated by the harvest and saves the data as JSON.

    Reads data/records.csv (skipping the header row), pairs each row's values
    with the column names in FIELDS, and writes the list of record dicts to
    data/records.json.
    '''
    # 'rb'/'wb' binary modes follow the Python 2 csv/json convention used
    # throughout this script.
    with open('data/records.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile)
        # Builtin next() works on both Py2 and Py3, unlike reader.next().
        next(reader)  # skip the header row
        # zip pairs each value with its FIELDS column name.
        records = [dict(zip(FIELDS, row)) for row in reader]
    with open('data/records.json', 'wb') as jsonfile:
        json.dump(records, jsonfile)
def save_text():
    '''Dump titles and summary descriptions from records.csv to a plain-text file.

    Writes each record's title followed by its description (truncated at the
    'Previous Control' marker, if present) to data/descriptions.txt.
    '''
    with open('data/records.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # skip the header row
        with open('data/descriptions.txt', 'wb') as descriptions:
            for row in reader:
                title = row[2].replace('"', "'")
                description = row[4].replace('"', "'")
                descriptions.write('{}\n'.format(title))
                # Keep only the summary description -- drop everything from
                # 'Previous Control' (number) onwards. partition() returns the
                # whole string unchanged when the marker is absent.
                description = description.partition('Previous Control')[0]
                descriptions.write('{}\n'.format(description))
def save_dates():
    '''Tally 19xx years found in record titles and save the counts to CSV.

    Scans the title column of data/records.csv for a four-digit year in the
    1900s, counts occurrences per year, writes one "year,count" row per year
    (sorted) to data/years.csv, and prints each tally plus the grand total.
    '''
    total = 0
    years = {}
    with open('data/records.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # skip the header row
        for row in reader:
            match = re.search(r'\b(19\d{2})\b', row[2])
            # Titles without a recognisable year are simply skipped.
            if match:
                year = int(match.group(1))
                years[year] = years.get(year, 0) + 1
                total += 1
    with open('data/years.csv', 'wb') as csvfile:
        # Create the writer once, not on every loop iteration as before.
        writer = csv.writer(csvfile)
        for year in sorted(years):
            print('{} - {}'.format(year, years[year]))
            writer.writerow([year, years[year]])
    print(total)
def save_faces():
    '''Build data/faces.json linking cropped face images to their source records.

    Scans the 'faces' directory next to this script for jpg files named
    '<version_id>-*', looks up each version in data/records.json, and writes a
    list of {image, title, link} dicts to data/faces.json. Raises StopIteration
    if a face image has no matching record (as the original did).
    '''
    faces = []
    with open('data/records.json', 'rb') as data:
        records = json.load(data)
    # 'filename' avoids shadowing the builtin 'file'.
    for filename in os.listdir(os.path.join(CURRENT_DIR, 'faces')):
        # Same suffix test as the original ('jpg', with or without a dot).
        if filename.endswith('jpg'):
            version_id = filename.split('-')[0]
            # Builtin next() works on both Py2 and Py3, unlike generator.next().
            record = next(item for item in records if item["version_id"] == version_id)
            faces.append({
                'image': filename,
                'title': record['title'],
                'link': record['repository_url'],
            })
    with open('data/faces.json', 'wb') as faces_json:
        json.dump(faces, faces_json)
def do_harvest():
    '''Harvest the John Ellis photos from the University of Melbourne Archives.

    Of course you could change this query to harvest other things, but the
    actual structure of the records might then be a bit different (see
    UMAHarvester.process_results).
    '''
    api_query = 'http://api.trove.nla.gov.au/result?q=nuc:"VUMA"+"Ellis,+John"&zone=picture&n=100&encoding=json&reclevel=full&include=workVersions&key={}'.format(credentials.TROVE_API_KEY)
    api_client = trove.Trove(credentials.TROVE_API_KEY)
    UMAHarvester(api_client, query=api_query, number=100).harvest()