1+ {
2+ "nbformat" : 4 ,
3+ "nbformat_minor" : 0 ,
4+ "metadata" : {
5+ "colab" : {
6+ "provenance" : []
7+ },
8+ "kernelspec" : {
9+ "name" : " python3" ,
10+ "display_name" : " Python 3"
11+ },
12+ "language_info" : {
13+ "name" : " python"
14+ }
15+ },
16+ "cells" : [
17+ {
18+ "cell_type" : " code" ,
19+ "execution_count" : null ,
20+ "metadata" : {
21+ "id" : " tgDn9FgB09vu"
22+ },
23+ "outputs" : [],
24+ "source" : [
25+ " !pip install shexer"
26+ ]
27+ },
28+ {
29+ "cell_type" : " markdown" ,
30+ "source" : [
31+ " sheXer can handle different types of inputs:\n " ,
32+ " \n " ,
33+ " * Local/remote files.\n " ,
34+ " * In-memory string content.\n " ,
35+ " * SPARQL endpoints.\n " ,
36+ " * Compressed files.\n " ,
37+ " * RDFlib graphs.\n " ,
38+ " \n " ,
39+ " In this notebook, you'll find examples of how to provide such inputs to sheXer.\n "
40+ ],
41+ "metadata" : {
42+ "id" : " 8FnmPwt91Dxn"
43+ }
44+ },
45+ {
46+ "cell_type" : " code" ,
47+ "source" : [
48+ " from shexer.shaper import Shaper\n " ,
49+ " \n " ,
50+ " def default_namespaces():\n " ,
51+ " return {\" http://example.org/\" : \" ex\" ,\n " ,
52+ " \" http://www.w3.org/XML/1998/namespace/\" : \" xml\" ,\n " ,
53+ " \" http://www.w3.org/1999/02/22-rdf-syntax-ns#\" : \" rdf\" ,\n " ,
54+ " \" http://www.w3.org/2000/01/rdf-schema#\" : \" rdfs\" ,\n " ,
55+ " \" http://www.w3.org/2001/XMLSchema#\" : \" xsd\" ,\n " ,
56+ " \" http://xmlns.com/foaf/0.1/\" : \" foaf\"\n " ,
57+ " }"
58+ ],
59+ "metadata" : {
60+ "id" : " atpnK9s82Q5o"
61+ },
62+ "execution_count" : 3 ,
63+ "outputs" : []
64+ },
65+ {
66+ "cell_type" : " code" ,
67+ "source" : [
68+ " # Getting some shapes for a graph in a str object\n " ,
69+ " \n " ,
70+ " raw_graph_turtle = \"\"\" @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n " ,
71+ " @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n " ,
72+ " @prefix ex: <http://example.org/> .\n " ,
73+ " @prefix foaf: <http://xmlns.com/foaf/0.1/> .\n " ,
74+ " @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n " ,
75+ " \n " ,
76+ " ex:Jimmy a foaf:Person ; # Complete\n " ,
77+ " \t foaf:age \" 23\" ^^xsd:integer ;\n " ,
78+ " \t foaf:name \" Jimmy\" ;\n " ,
79+ " \t foaf:familyName \" Jones\" .\n " ,
80+ " \n " ,
81+ " ex:Sarah a foaf:Person ; # Complete implicit type for age\n " ,
82+ " \t foaf:age 22 ;\n " ,
83+ " \t foaf:name \" Sarah\" ;\n " ,
84+ " \t foaf:familyName \" Salem\" .\n " ,
85+ " \n " ,
86+ " ex:Bella a foaf:Person ; # Missing familyName\n " ,
87+ " \t foaf:age \" 56\" ^^xsd:integer ;\n " ,
88+ " \t foaf:name \" Isabella\" .\n " ,
89+ " \n " ,
90+ " ex:David a foaf:Person ; # Missing age and use knows\n " ,
91+ " \t foaf:name \" David\" ;\n " ,
92+ " \t foaf:familyName \" Doulofeau\" ;\n " ,
93+ " \t foaf:knows ex:Sarah .\n " ,
94+ " \n " ,
95+ " ex:HumanLike foaf:name \" Person\" ; # foaf properties, but not explicit type.\n " ,
96+ " \t foaf:familyName \" Maybe\" ;\n " ,
97+ " \t foaf:age 99 ;\n " ,
98+ " \t foaf:knows ex:David .\n " ,
99+ " \n " ,
100+ " \n " ,
101+ " ex:x1 rdf:type foaf:Document ;\n " ,
102+ " \t foaf:depiction \" A thing that is nice\" ;\n " ,
103+ " \t foaf:title \" A nice thing\" .\n " ,
104+ " \n " ,
105+ " \n " ,
106+ " ex:x2 rdf:type foaf:Document ;\n " ,
107+ " \t foaf:title \" Another thing\" .\"\"\"\n " ,
108+ " \n " ,
109+ " from shexer.consts import TURTLE_ITER, TURTLE\n " ,
110+ " \n " ,
111+ " shaper = Shaper(\n " ,
112+ " raw_graph=raw_graph_turtle, # parameter to pass the input str\n " ,
113+ " namespaces_dict=default_namespaces(), # some namespaces to prettify the result\n " ,
114+ " all_classes_mode=True, # get a shape for each class with instances\n " ,
115+ " input_format=TURTLE_ITER) # input format should be indicated. Available options are in shexer.consts\n " ,
116+ " # TURTLE_ITER is turtle, but the parser used is implemented within sheXer\n " ,
117+ " # The TURTLE input format is also available. That option uses rdflib's parser\n " ,
118+ " \n " ,
119+ " str_result = shaper.shex_graph(string_output=True)\n " ,
120+ " print(str_result)\n "
121+ ],
122+ "metadata" : {
123+ "id" : " pXcChiaB7gUU"
124+ },
125+ "execution_count" : null ,
126+ "outputs" : []
127+ },
128+ {
129+ "cell_type" : " code" ,
130+ "source" : [
131+ " # Same case, same content. But now the input is in a remote file\n " ,
132+ " \n " ,
133+ " shaper = Shaper(\n " ,
134+ " graph_file_input=\" https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.ttl\" , # parameter to pass a path for a remote file\n " ,
135+ " namespaces_dict=default_namespaces(),\n " ,
136+ " all_classes_mode=True,\n " ,
137+ " input_format=TURTLE) # using rdflib's parser, some more namespaces appear in the results (the ones preloaded in an rdflib.Graph() object)\n " ,
138+ " \n " ,
139+ " str_result = shaper.shex_graph(string_output=True)\n " ,
140+ " print(str_result)"
141+ ],
142+ "metadata" : {
143+ "id" : " k-nBQyCr9m6i"
144+ },
145+ "execution_count" : null ,
146+ "outputs" : []
147+ },
148+ {
149+ "cell_type" : " code" ,
150+ "source" : [
151+ " # Same case, but now the file is local\n " ,
152+ " \n " ,
153+ " import requests\n " ,
154+ " \n " ,
155+ " def remote_to_local(url, local_path):\n " ,
156+ " response = requests.get(url)\n " ,
157+ " if response.status_code == 200:\n " ,
158+ " with open(local_path, \" w\" , encoding=\" utf-8\" ) as out_stream:\n " ,
159+ " out_stream.write(response.text)\n " ,
160+ " \n " ,
161+ " remote_to_local(\" https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.ttl\" ,\n " ,
162+ " \" local_file.ttl\" )\n " ,
163+ " \n " ,
164+ " shaper = Shaper(\n " ,
165+ " graph_file_input=\" ./local_file.ttl\" , # the parameter for local and remote files is the same\n " ,
166+ " namespaces_dict=default_namespaces(),\n " ,
167+ " all_classes_mode=True,\n " ,
168+ " input_format=TURTLE)\n " ,
169+ " \n " ,
170+ " str_result = shaper.shex_graph(string_output=True)\n " ,
171+ " print(str_result)"
172+ ],
173+ "metadata" : {
174+ "id" : " R1tORPF5_VKy"
175+ },
176+ "execution_count" : null ,
177+ "outputs" : []
178+ },
179+ {
180+ "cell_type" : " code" ,
181+ "source" : [
182+ " # Same case, but now the parsed content is N-Triples instead. This format uses an internal parser too\n " ,
183+ " from shexer.consts import NT\n " ,
184+ " \n " ,
185+ " remote_to_local(\" https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.nt\" ,\n " ,
186+ " \" local_file.nt\" )\n " ,
187+ " \n " ,
188+ " shaper = Shaper(\n " ,
189+ " graph_file_input=\" ./local_file.nt\" , # the parameter for local and remote files is the same\n " ,
190+ " namespaces_dict=default_namespaces(),\n " ,
191+ " all_classes_mode=True,\n " ,
192+ " input_format=NT)\n " ,
193+ " \n " ,
194+ " str_result = shaper.shex_graph(string_output=True)\n " ,
195+ " print(str_result)"
196+ ],
197+ "metadata" : {
198+ "id" : " yJNPlCTWFmc_"
199+ },
200+ "execution_count" : null ,
201+ "outputs" : []
202+ },
203+ {
204+ "cell_type" : " code" ,
205+ "source" : [
206+ " # Same case, but now the content is split in two different files that should be parsed as a single dataset\n " ,
207+ " \n " ,
208+ " remote_to_local(\" https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.nt\" ,\n " ,
209+ " \" local_file.nt\" )\n " ,
210+ " \n " ,
211+ " with open(\" local_file.nt\" ) as in_stream:\n " ,
212+ " lines = in_stream.readlines()\n " ,
213+ " with open(\" local_file_pt1.nt\" , \" w\" ) as out_1: # Writing the first 7 lines to \" local_file_pt1.nt\"\n " ,
214+ " out_1.write(\"\" .join(lines[:7]))\n " ,
215+ " with open(\" local_file_pt2.nt\" , \" w\" ) as out_2: # Writing the rest of the lines to \" local_file_pt2.nt\"\n " ,
216+ " out_2.write(\"\" .join(lines[7:]))\n " ,
217+ " \n " ,
218+ " shaper = Shaper(\n " ,
219+ " graph_list_of_files_input=[\" ./local_file_pt1.nt\" , \" ./local_file_pt2.nt\" ], # Use this parameter to parse a list of files instead of a single file\n " ,
220+ " namespaces_dict=default_namespaces(),\n " ,
221+ " all_classes_mode=True,\n " ,
222+ " input_format=NT)\n " ,
223+ " \n " ,
224+ " str_result = shaper.shex_graph(string_output=True)\n " ,
225+ " print(str_result)\n "
226+ ],
227+ "metadata" : {
228+ "id" : " e6dWseCJGCEE"
229+ },
230+ "execution_count" : null ,
231+ "outputs" : []
232+ },
233+ {
234+ "cell_type" : " code" ,
235+ "source" : [
236+ " # Same case, but now we process an rdflib graph object\n " ,
237+ " \n " ,
238+ " from rdflib import Graph\n " ,
239+ " \n " ,
240+ " remote_to_local(\" https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.ttl\" ,\n " ,
241+ " \" local_file.ttl\" )\n " ,
242+ " \n " ,
243+ " g = Graph()\n " ,
244+ " g.parse(\" ./local_file.ttl\" )\n " ,
245+ " \n " ,
246+ " shaper = Shaper(\n " ,
247+ " rdflib_graph=g, # Use this parameter for rdflib inputs\n " ,
248+ " namespaces_dict=default_namespaces(),\n " ,
249+ " all_classes_mode=True,\n " ,
250+ " input_format=TURTLE)\n " ,
251+ " \n " ,
252+ " str_result = shaper.shex_graph(string_output=True)\n " ,
253+ " print(str_result)"
254+ ],
255+ "metadata" : {
256+ "id" : " NabS5L4u0kuc"
257+ },
258+ "execution_count" : null ,
259+ "outputs" : []
260+ },
261+ {
262+ "cell_type" : " code" ,
263+ "source" : [
264+ " # Same target content, but now it is compressed in a ZIP file\n " ,
265+ " # In this case, the ZIP contains a single file. But in case the ZIP\n " ,
266+ " # holds more files, the setting would be the same and all files would be\n " ,
267+ " # parsed as a single data source\n " ,
268+ " \n " ,
269+ " from shexer.consts import ZIP, TURTLE_ITER\n " ,
270+ " import urllib.request\n " ,
271+ " \n " ,
272+ " urllib.request.urlretrieve(\" https://github.com/weso/shexer/raw/refs/heads/master/test/t_files/compression/t_graph_1.ttl.zip\" ,\n " ,
273+ " \" local_file.zip\" )\n " ,
274+ " \n " ,
275+ " shaper = Shaper(\n " ,
276+ " graph_file_input=\" ./local_file.zip\" , # as it is a file input, we still use this parameter to declare the path\n " ,
277+ " namespaces_dict=default_namespaces(),\n " ,
278+ " all_classes_mode=True,\n " ,
279+ " input_format=TURTLE_ITER,\n " ,
280+ " compression_mode=ZIP)\n " ,
281+ " \n " ,
282+ " str_result = shaper.shex_graph(string_output=True)\n " ,
283+ " print(str_result)"
284+ ],
285+ "metadata" : {
286+ "id" : " _6gQPxMa2G-2"
287+ },
288+ "execution_count" : null ,
289+ "outputs" : []
290+ },
291+ {
292+ "cell_type" : " code" ,
293+ "source" : [
294+ " # Example to generate a shape for some nodes exposed in the DBpedia endpoint.\n " ,
295+ " # With this setting, only 4 nodes (the result of the SPARQL query) will be used\n " ,
296+ " # as examples, and only direct connections with those nodes are explored. No\n " ,
297+ " # shape other than the one specified in the shape map will be generated\n " ,
298+ " \n " ,
299+ " \n " ,
300+ " \n " ,
301+ " shape_map_raw = \" SPARQL'select ?s where {?s a <http://dbpedia.org/ontology/Person>} LIMIT 4'@<ShapePerson>\"\n " ,
302+ " shaper = Shaper(shape_map_raw=shape_map_raw, # indicates target shapes and nodes as a shape map\n " ,
303+ " url_endpoint=\" https://dbpedia.org/sparql\" , # target url\n " ,
304+ " namespaces_dict=default_namespaces(), # some namespaces to prettify results.\n " ,
305+ " depth_for_building_subgraph=1, # distance of exploration from seed nodes\n " ,
306+ " track_classes_for_entities_at_last_depth_level=False, # no exception for the previous rule\n " ,
307+ " all_classes_mode=False) # no class out of the content specified in the shape map will generate a shape\n " ,
308+ " str_result = shaper.shex_graph(string_output=True,\n " ,
309+ " acceptance_threshold=0.9) # Only accept very frequent observations\n " ,
310+ " print(str_result)"
311+ ],
312+ "metadata" : {
313+ "id" : " S0fEjGgf1RQX"
314+ },
315+ "execution_count" : null ,
316+ "outputs" : []
317+ }
318+ ]
319+ }
0 commit comments