-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdump_to_csv.py
More file actions
155 lines (126 loc) · 5.33 KB
/
dump_to_csv.py
File metadata and controls
155 lines (126 loc) · 5.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
from SPARQLWrapper import SPARQLWrapper, JSON, TSV
import os
import glob
from tqdm import tqdm
from rdflib import Graph, RDF, URIRef
import destarify
# Flag that, in the visible part of this script, is only consulted by a
# commented-out line (the optional LIMIT on the smells query).
test_mode = True

# Locations of the input data read by the export loop further down.
vocabs = 'dump-flat/vocabularies'
gn_dump = 'dump/geonames'

# The 'dump-flat' folder is a prerequisite for everything below; when it is
# absent, destarify is asked to run first.
# NOTE(review): presumably destarify.run() is what creates 'dump-flat' - confirm.
if not os.path.isdir('dump-flat'):
    print('destarify in progress')
    destarify.run()
def _default_sparql(endpoint):
    """Return a function that runs SPARQL queries against *endpoint*.

    The returned callable takes the query text ``q`` and an optional
    ``format`` (default ``JSON``).  For ``JSON`` it returns the
    ``['results']['bindings']`` entry of the converted response; for any
    other format it returns the converted response decoded as UTF-8 text.
    """
    client = SPARQLWrapper(endpoint)

    def exec_query(q, format=JSON):
        # The same wrapper instance is reused for every call, so the return
        # format has to be set each time before the query is sent.
        client.setReturnFormat(format)
        client.setQuery(q)
        payload = client.query().convert()
        if format != JSON:
            return payload.decode('utf-8')
        return payload['results']['bindings']

    return exec_query
# Namespace prefix declarations; interpolated at the top of the per-smell
# SPARQL queries that are built further down in this script.
prefixes = '''
PREFIX schema: <https://schema.org/>
PREFIX od: <http://data.odeuropa.eu/ontology/>
PREFIX crm: <http://erlangen-crm.org/current/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX reo: <https://read-it.acc.hum.uu.nl/ontology#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX time: <http://www.w3.org/2006/time#>
PREFIX gn: <http://www.geonames.org/ontology#>
'''
# Query runner bound to the Odeuropa repository endpoint.
sparql = _default_sparql('https://data.odeuropa.eu/repositories/odeuropa')
# Selects every resource typed as dcmi:Dataset (bound to ?g).
# NOTE(review): the FROM <http://www.ontotext.com/disable-sameAs> clause is
# presumably the GraphDB switch that turns off owl:sameAs expansion - confirm.
graphs_query = '''
PREFIX dcmi: <http://purl.org/dc/dcmitype/>
SELECT DISTINCT ?g
FROM <http://www.ontotext.com/disable-sameAs>
WHERE {
?g a dcmi:Dataset
}'''
# Template selecting all od:L11_Smell instances; the trailing %s is a slot for
# an optional suffix such as 'LIMIT 100'.  In the visible part of this script
# it is only referenced from a commented-out line.
smells_query = '''
PREFIX od: <http://data.odeuropa.eu/ontology/>
SELECT DISTINCT *
WHERE {
?s a od:L11_Smell .
} %s
'''
# ---------------------------------------------------------------------------
# Export loop: for every dataset graph reported by the endpoint, load the
# matching local dump folders into an in-memory rdflib graph, query the
# details of each smell and append them to out/<graph id>.csv.
# ---------------------------------------------------------------------------

# Graph ids that are listed by the endpoint but are not exported.
SKIPPED_GRAPHS = ('image-annotation', 'odor', 'nuk', 'europeana', 'rijksmuseum')
SMELL_CLASS = URIRef('http://data.odeuropa.eu/ontology/L11_Smell')
OUT_DIR = 'out'


def _load_reference_graph():
    """Parse every file of the vocabulary and GeoNames folders into one graph."""
    reference = Graph()
    for folder in (vocabs, gn_dump):
        for name in os.listdir(folder):
            reference.parse(os.path.join(folder, name))
    return reference


def _language_query(s):
    """Build the query returning the language(s) of the book referring to smell *s*."""
    return f'''{prefixes}
SELECT DISTINCT ?lang WHERE {{
?book crm:P67_refers_to <{s}> ; schema:inLanguage ?lang }}'''


def _smell_details_query(s, lang, gr):
    """Build the query collecting all exported properties of smell *s*.

    *lang* restricts the labels of sources, carriers and evoked things to one
    language; *gr* is the URI of the dataset graph, emitted as the ?g column.
    """
    return f'''
{prefixes}
select distinct *
where {{
<{s}> rdfs:label ?smell_word .
?emission od:F1_generated <{s}>.
OPTIONAL {{
?emission od:F3_had_source ?smell_source .
?smell_source rdfs:label|skos:prefLabel ?smell_source_label .
FILTER(LANG(?smell_source_label) = "{lang}")
}}
OPTIONAL {{
?emission od:F4_had_carrier ?carrier .
?carrier rdfs:label|skos:prefLabel ?carrier_label .
FILTER(LANG(?carrier_label) = "{lang}")
}}
OPTIONAL {{
?emission crm:P7_took_place_at ?place .
?place gn:name | rdfs:label ?place_label .
# FILTER(LANG(?place_label) = "{lang}")
}}
OPTIONAL {{?emission time:hasTime / rdfs:label ?time}}
?experience od:F2_perceived <{s}> .
OPTIONAL {{
?experience crm:P14_carried_out_by ?perceiver.
?perceiver rdfs:label ?perceiver_label .
}}
OPTIONAL {{
?experience od:F6_evoked ?evoked.
?evoked rdfs:label|skos:prefLabel ?evoked_label .
FILTER(LANG(?evoked_label) = "{lang}")
}}
OPTIONAL {{[] crm:P141_assigned ?quality ;
crm:P140_assigned_attribute_to <{s}> ;
rdfs:label ?quality_label .
}}
OPTIONAL {{?emotion reo:readP27 ?experience ;
rdfs:label|skos:prefLabel ?emotion_label
}}
?frag crm:P67_refers_to <{s}> ; rdf:value ?sentence.
?book crm:P165_incorporates ?frag ; schema:inLanguage ?lang .
VALUES ?smell {{ <{s}> }}
VALUES ?g {{ <{gr}> }}
}}'''


graphs_list = sparql(graphs_query)
graph_uris = sorted(graph['g']['value'] for graph in graphs_list)

# The output folder may not exist on a fresh checkout.
os.makedirs(OUT_DIR, exist_ok=True)

# Vocabularies and GeoNames are identical for every dump folder, so they are
# parsed at most once (lazily, on the first folder that needs them) and their
# triples are copied into each working graph instead of being re-parsed.
reference_graph = None

for i, gr in enumerate(graph_uris, start=1):
    graph_id = gr.split('/')[-1]
    print(f'* Graph {i}/{len(graph_uris)}: {graph_id}')
    if graph_id in SKIPPED_GRAPHS:
        continue

    out_path = os.path.join(OUT_DIR, graph_id + '.csv')
    # newline='' keeps the line endings produced by the CSV serializer intact;
    # the explicit encoding matches the UTF-8 decoding of the serialized rows.
    with open(out_path, 'w', encoding='utf-8', newline='') as f:
        first = True  # True until the CSV header has been written once
        for dump_dir in glob.glob(f'./dump-flat/{graph_id}*'):
            if reference_graph is None:
                reference_graph = _load_reference_graph()
            g = Graph()
            g += reference_graph
            for name in os.listdir(dump_dir):
                if name.endswith('.ttl') and name != 'graph.ttl':
                    g.parse(os.path.join(dump_dir, name), format="n3")

            for s in tqdm(list(g.subjects(RDF.type, SMELL_CLASS))):
                lang = next(
                    (row['lang'] for row in g.query(_language_query(s))),
                    None,
                )
                if lang is None:
                    # Without a language the label filters cannot be built;
                    # report the smell and move on instead of crashing.
                    tqdm.write(f'no language found for {s}, skipped')
                    continue

                props = g.query(_smell_details_query(s, lang, gr))
                props = props.serialize(format='csv').decode("utf-8")
                if not first:
                    # Every result starts with its own header line; keep
                    # only the data rows after the first write.
                    props = props.partition('\n')[2]
                f.write(props)
                first = False