# vivo-sample-data/utils.py at master · lawlesst/vivo-sample-data · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148

import csv
import hashlib
import os
import urllib

import logging
logger = logging.getLogger(__name__)

import requests

from rdflib import Graph, Namespace
from rdflib.namespace import NamespaceManager, ClosedNamespace

# Base URI for locally minted data.  Override it with the DATA_NAMESPACE
# environment variable; an unset or empty value falls back to the default.
DATA_NAMESPACE = os.environ.get('DATA_NAMESPACE') or 'http://vivo.school.edu'

# Ontology namespaces.
VIVO = Namespace('http://vivoweb.org/ontology/core#')
#FOAF = Namespace('http://xmlns.com/foaf/0.1/')
BIBO = Namespace('http://purl.org/ontology/bibo/')
OBO = Namespace('http://purl.obolibrary.org/obo/')
#SCHEMA = Namespace('http://schema.org/')
#SKOS = Namespace('http://www.w3.org/2004/02/skos/core#')
VCARD = Namespace('http://www.w3.org/2006/vcard/ns#')

# Collect every namespace object defined in this module so far, keyed by its
# variable name.  Iterate over a snapshot of the module variables: the loop
# variables are module globals themselves, so looping over the live mapping
# changes its size mid-iteration and raises RuntimeError on Python 3.
namespaces = {}
for k, o in list(vars().items()):
    if isinstance(o, (Namespace, ClosedNamespace)):
        namespaces[k] = o

# Namespace manager with each namespace bound to its lower-cased variable
# name as the prefix (e.g. VIVO -> "vivo").
ns_mgr = NamespaceManager(Graph())
for k, v in namespaces.items():
    ns_mgr.bind(k.lower(), v)


#Use when a named graph isn't specified for SPARQL update.
DEFAULT_GRAPH = 'http://vitro.mannlib.cornell.edu/default/vitro-kb-2'

def _env(name):
    """
    Look up the environment variable `name` and return its value.

    Raises an Exception when the variable is not set at all.  A variable
    that is set to an empty string is returned as is.
    """
    value = os.environ.get(name)
    if value is not None:
        return value
    raise Exception("Can't find {}.  Set environment variable.".format(name))

class VUpdate(object):
    """
    VIVO SPARQL Update class

    Builds INSERT DATA / DELETE DATA statements from the triples of a graph
    and posts them to the VIVO SPARQL update API.  The endpoint URL and the
    account credentials are read from the VIVO_UPDATE_ENDPOINT, VIVO_EMAIL
    and VIVO_PASSWORD environment variables when an instance is created.
    """
    def __init__(self):
        self.endpoint = _env('VIVO_UPDATE_ENDPOINT')
        self.email = _env('VIVO_EMAIL')
        self.password = _env('VIVO_PASSWORD')

    @staticmethod
    def _triples(graph):
        """
        Serialize every triple of `graph` as one N3 statement per line.

        `graph` is any iterable of (subject, predicate, object) tuples whose
        members provide an `n3()` method.  Returns an empty string for an
        empty graph.
        """
        # Join once instead of growing a string with += inside the loop.
        return "".join(
            "%s %s %s .\n" % (subject.n3(), predicate.n3(), obj.n3())
            for subject, predicate, obj in graph
        )

    def add(self, graph, name=None):
        """
        Insert the triples of `graph` into the named graph `name`.

        Falls back to DEFAULT_GRAPH when `name` is not given.

        See:
        https://github.com/RDFLib/rdflib/blob/master/rdflib/plugins/stores/sparqlstore.py#L451
        """
        nameg = name or DEFAULT_GRAPH
        data = self._triples(graph)
        sparql = "INSERT DATA \n { GRAPH <%s> {\n %s }\n}" % (nameg, data)
        self.do_update(sparql)

    def remove(self, graph, name=None):
        """
        Delete the triples of `graph` from the named graph `name`.

        Falls back to DEFAULT_GRAPH when `name` is not given.
        """
        nameg = name or DEFAULT_GRAPH
        data = self._triples(graph)
        sparql = "DELETE DATA \n { GRAPH <%s> { %s }\n}" % (nameg, data)
        self.do_update(sparql)

    def do_update(self, query):
        """
        Post the SPARQL update `query` to the configured endpoint.

        Returns True on success.  Raises an Exception when the response
        status is not 200 or when the final response URL does not contain
        'api/sparqlUpdate'.
        """
        # Lazy %-style logging: the query may contain non-ASCII text, which
        # an eager byte-string format would fail to encode on Python 2.
        logger.debug('Update query:\n %s', query)
        payload = {
            'email': self.email,
            'password': self.password,
            'update': query
        }
        # requests form-encodes the payload (UTF-8 for text values), works on
        # both Python 2 and 3, and releases the connection once the body has
        # been read.  urllib.urlencode/urlopen exist only on Python 2 and
        # fail on non-ASCII query text.
        response = requests.post(self.endpoint, data=payload)
        #This will raise an exception if something goes wrong
        if response.status_code != 200:
            raise Exception("SPARQL update failed.  Status code: {}".format(str(response.status_code)))
        #Verify that we actually hit the API endpoint.  This is hardcoded.  Should read
        #from properties or something.
        if 'api/sparqlUpdate' not in response.url:
            raise Exception("Response URL doesn't seem to be the VIVO API URL.  Verify settings.")
        logger.info("Update response code: %s", response.status_code)
        return True

def hash_uri(raw, prefix='n'):
    """
    Return the MD5 hex digest of the text `raw`, prefixed with `prefix`.

    Text input is encoded as UTF-8 before hashing; byte strings are hashed
    unchanged.  The digest serves only as a stable identifier, not as a
    security measure.
    """
    # hashlib only accepts bytes: passing text raises TypeError on Python 3
    # and fails for non-ASCII unicode on Python 2.
    if not isinstance(raw, bytes):
        raw = raw.encode('utf-8')
    return prefix + hashlib.md5(raw).hexdigest()

def scrub_row(row):
    """
    Return a cleaned copy of the dictionary `row`.

    Carriage returns and line breaks are removed from every cell.  Cells
    that are empty afterwards - "" - and cells that are already None are
    set to Python None.  Byte-string cells are decoded from UTF-8, dropping
    undecodable bytes; text cells are kept as text.
    """
    out_dictionary = {}
    for k, v in row.items():
        if v is None:
            # csv.DictReader fills in None for columns missing from a short row.
            out_dictionary[k] = None
            continue
        if isinstance(v, bytes):
            #Remove line breaks and carriage returns, then decode.
            v = v.replace(b'\n', b'').replace(b'\r', b'')
            out_dictionary[k] = v.decode('utf-8', 'ignore') if v else None
        else:
            # Already text (always the case on Python 3): nothing to decode.
            v = v.replace('\n', '').replace('\r', '')
            out_dictionary[k] = v if v else None
    return out_dictionary

def read_file(file_name, delimiter=','):
    """
    Parse the delimited file `file_name` into a list of dictionaries.

    The first line supplies the column names.  Every following row is
    passed through scrub_row before it is added to the result.
    """
    with open(file_name) as source:
        reader = csv.DictReader(source, delimiter=delimiter)
        return [scrub_row(record) for record in reader]


class CrossRefSearchException(Exception):
    """
    Raised when a CrossRef metadata search returns no results.
    """

def crossref_metadata_search(search_string):
    """
    Search the metadata API.

    Sends `search_string` as the `q` query parameter and returns the decoded
    JSON response.  Raises CrossRefSearchException when the response is
    empty.
    """
    base = "http://search.crossref.org/dois"
    # Let requests encode the query string.  Interpolating the raw text into
    # the URL corrupts searches that contain '&', '#' or non-ASCII characters.
    resp = requests.get(base, params={'q': search_string})
    data = resp.json()
    if not data:
        raise CrossRefSearchException("No CR metadata search results")
    return data