knowledge-graph/dump_to_csv.py at master · Odeuropa/knowledge-graph · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
from SPARQLWrapper import SPARQLWrapper, JSON, TSV
import os
import glob

from tqdm import tqdm
from rdflib import Graph, RDF, URIRef

import destarify

# Debug switch. Not read by any active statement visible in this file.
test_mode = True

# The export reads from the 'dump-flat' directory; when it is missing,
# destarify.run() is invoked first (presumably it produces that directory
# -- confirm against the destarify module).
if not os.path.isdir('dump-flat'):
    print('destarify in progress')
    destarify.run()


# Directory whose files are parsed into every graph built by the export loop.
vocabs = 'dump-flat/vocabularies'
# Directory of GeoNames files, likewise parsed into every graph built below.
gn_dump = 'dump/geonames'


def _default_sparql(endpoint):
    """Create a query function bound to the SPARQL endpoint at *endpoint*.

    The returned callable takes a query string and an optional return
    format (JSON by default) and sends the query through one shared
    SPARQLWrapper client.
    """
    client = SPARQLWrapper(endpoint)

    def exec_query(q, format=JSON):
        """Run *q* and return its result.

        With the JSON format the list found under ``results -> bindings``
        is returned; with any other format the converted payload is
        decoded as UTF-8 text.
        """
        client.setReturnFormat(format)
        client.setQuery(q)
        payload = client.query().convert()
        if format != JSON:
            return payload.decode('utf-8')
        return payload['results']['bindings']

    return exec_query


# Namespace declarations prepended to the per-smell SPARQL queries that are
# built in the export loop.
prefixes = '''
PREFIX schema: <https://schema.org/>
PREFIX od: <http://data.odeuropa.eu/ontology/>
PREFIX crm: <http://erlangen-crm.org/current/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX reo: <https://read-it.acc.hum.uu.nl/ontology#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX time: <http://www.w3.org/2006/time#>
PREFIX gn: <http://www.geonames.org/ontology#>
'''
# Query runner bound to the Odeuropa repository endpoint.
sparql = _default_sparql('https://data.odeuropa.eu/repositories/odeuropa')

# Selects every resource typed dcmi:Dataset; each one is treated as a graph to
# export. NOTE(review): the FROM <.../disable-sameAs> clause presumably turns
# off owl:sameAs expansion on the triple store -- confirm.
graphs_query = '''
PREFIX dcmi: <http://purl.org/dc/dcmitype/>
SELECT DISTINCT ?g
FROM <http://www.ontotext.com/disable-sameAs>
WHERE {
	?g a dcmi:Dataset
}'''

# Template selecting every od:L11_Smell; the %s slot takes a trailing clause
# appended after the WHERE block. Not used by any active statement visible in
# this file.
smells_query = '''
PREFIX od: <http://data.odeuropa.eu/ontology/>
SELECT DISTINCT *
WHERE {
        ?s a od:L11_Smell .
} %s
'''

# Export loop: for every dataset graph reported by the endpoint, load the
# matching local dump (plus vocabularies and GeoNames) into an in-memory rdflib
# graph, query the description of each smell and append the rows to
# out/<graph id>.csv.

# Graph ids for which no CSV is produced.
skipped_graphs = {'image-annotation', 'odor', 'nuk', 'europeana', 'rijksmuseum'}

graphs_list = sparql(graphs_query)
graph_uris = sorted(graph['g']['value'] for graph in graphs_list)

# The CSV files are written below 'out'; create it so open() cannot fail on a
# fresh checkout.
os.makedirs('out', exist_ok=True)

# Counting from 1 so the progress line reads "1/N" ... "N/N".
for i, gr in enumerate(graph_uris, start=1):
    graph_id = gr.split('/')[-1]
    print(f'* Graph {i}/{len(graph_uris)}: {graph_id}')
    if graph_id in skipped_graphs:
        continue

    # if graph_id + '.tsv' in os.listdir('out'):
    #     continue

    # Explicit utf-8 keeps non-ASCII labels independent of the platform
    # default; newline='' writes the serializer's own line endings untouched
    # (the csv module's convention) instead of translating them again.
    out_path = os.path.join('out', graph_id + '.csv')
    with open(out_path, 'w', encoding='utf-8', newline='') as f:
        # True until the first result has been written; only that one keeps
        # its CSV header row.
        first = True

        for dump_dir in glob.glob(f'./dump-flat/{graph_id}*'):
            g = Graph()
            for v in os.listdir(vocabs):
                g.parse(os.path.join(vocabs, v))
            for v in os.listdir(gn_dump):
                g.parse(os.path.join(gn_dump, v))

            # Load every Turtle file of the dump except graph.ttl.
            for ttl_name in os.listdir(dump_dir):
                if ttl_name.endswith('.ttl') and ttl_name != 'graph.ttl':
                    g.parse(os.path.join(dump_dir, ttl_name), format="n3")

            smells_list = list(g.subjects(RDF.type, URIRef('http://data.odeuropa.eu/ontology/L11_Smell')))
            # smells_list = g.query(smells_query %  'LIMIT 100' if test_mode else '')

            for s in tqdm(smells_list):
                q_lang = f'''{prefixes}
                    SELECT DISTINCT ?lang WHERE {{
                    ?book crm:P67_refers_to <{s}> ; schema:inLanguage ?lang }}'''
                langs = [row['lang'] for row in g.query(q_lang)]
                if not langs:
                    # Without a language the label filters below cannot be
                    # built; skip this smell instead of aborting the export.
                    tqdm.write(f'  no language found for <{s}>, skipped')
                    continue
                lang = langs[0]

                q = f'''
                {prefixes}
                select distinct *
                where {{
                    <{s}> rdfs:label ?smell_word .
                    ?emission od:F1_generated <{s}>.
                    OPTIONAL {{
                        ?emission od:F3_had_source ?smell_source .
                        ?smell_source rdfs:label|skos:prefLabel ?smell_source_label .
                        FILTER(LANG(?smell_source_label) = "{lang}")
                    }}
                    OPTIONAL {{
                        ?emission od:F4_had_carrier ?carrier .
                        ?carrier rdfs:label|skos:prefLabel ?carrier_label .
                        FILTER(LANG(?carrier_label) = "{lang}")

                    }}
                    OPTIONAL {{
                        ?emission crm:P7_took_place_at ?place .
                        ?place gn:name | rdfs:label ?place_label .
                        # FILTER(LANG(?place_label) = "{lang}")
                    }}
                    OPTIONAL {{?emission time:hasTime / rdfs:label ?time}}
                    ?experience od:F2_perceived <{s}> .
                    OPTIONAL {{
                        ?experience crm:P14_carried_out_by ?perceiver.
                        ?perceiver rdfs:label ?perceiver_label .
                    }}
                    OPTIONAL {{
                        ?experience od:F6_evoked ?evoked.
                        ?evoked rdfs:label|skos:prefLabel ?evoked_label .
                        FILTER(LANG(?evoked_label) = "{lang}")
                    }}
                    OPTIONAL {{[] crm:P141_assigned ?quality ;
                                 crm:P140_assigned_attribute_to <{s}> ;
                                  rdfs:label ?quality_label .
                    }}
                    OPTIONAL {{?emotion reo:readP27 ?experience ;
                            rdfs:label|skos:prefLabel ?emotion_label
                    }}

                    ?frag crm:P67_refers_to <{s}> ; rdf:value ?sentence.
                    ?book crm:P165_incorporates ?frag ; schema:inLanguage ?lang .

                    VALUES ?smell {{ <{s}> }}
                    VALUES ?g {{ <{gr}> }}
                }}'''
                props = g.query(q).serialize(format='csv').decode("utf-8")

                if not first:
                    # Each serialized result starts with the header row; drop
                    # it for everything after the first result.
                    props = props.split('\n', maxsplit=1)[-1]

                f.write(props)

                first = False