Skip to content

Commit c74632a

Browse files
Add some example notebooks
1 parent 62c11e6 commit c74632a

File tree

2 files changed

+562
-0
lines changed

2 files changed

+562
-0
lines changed

doc/input_types.ipynb

Lines changed: 319 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,319 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"provenance": []
7+
},
8+
"kernelspec": {
9+
"name": "python3",
10+
"display_name": "Python 3"
11+
},
12+
"language_info": {
13+
"name": "python"
14+
}
15+
},
16+
"cells": [
17+
{
18+
"cell_type": "code",
19+
"execution_count": null,
20+
"metadata": {
21+
"id": "tgDn9FgB09vu"
22+
},
23+
"outputs": [],
24+
"source": [
25+
"!pip install shexer"
26+
]
27+
},
28+
{
29+
"cell_type": "markdown",
30+
"source": [
31+
"sheXer can handle different types of inputs:\n",
32+
"\n",
33+
"* Local/remote files.\n",
34+
"* In-memory string content.\n",
35+
"* SPARQL endpoints.\n",
36+
"* Compressed files.\n",
37+
"* RDFlib graphs.\n",
38+
"\n",
39+
"In this notebook, you'll find examples of how to provide such inputs to sheXer.\n"
40+
],
41+
"metadata": {
42+
"id": "8FnmPwt91Dxn"
43+
}
44+
},
45+
{
46+
"cell_type": "code",
47+
"source": [
48+
"from shexer.shaper import Shaper\n",
49+
"\n",
50+
"def default_namespaces():\n",
51+
" return {\"http://example.org/\": \"ex\",\n",
52+
" \"http://www.w3.org/XML/1998/namespace/\": \"xml\",\n",
53+
" \"http://www.w3.org/1999/02/22-rdf-syntax-ns#\": \"rdf\",\n",
54+
" \"http://www.w3.org/2000/01/rdf-schema#\": \"rdfs\",\n",
55+
" \"http://www.w3.org/2001/XMLSchema#\": \"xsd\",\n",
56+
" \"http://xmlns.com/foaf/0.1/\": \"foaf\"\n",
57+
" }"
58+
],
59+
"metadata": {
60+
"id": "atpnK9s82Q5o"
61+
},
62+
"execution_count": 3,
63+
"outputs": []
64+
},
65+
{
66+
"cell_type": "code",
67+
"source": [
68+
"# Getting some shapes for a graph in an str object\n",
69+
"\n",
70+
"raw_graph_turtle = \"\"\"@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n",
71+
"@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n",
72+
"@prefix ex: <http://example.org/> .\n",
73+
"@prefix foaf: <http://xmlns.com/foaf/0.1/> .\n",
74+
"@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n",
75+
"\n",
76+
"ex:Jimmy a foaf:Person ; # Complete\n",
77+
"\tfoaf:age \"23\"^^xsd:integer ;\n",
78+
"\tfoaf:name \"Jimmy\" ;\n",
79+
"\tfoaf:familyName \"Jones\" .\n",
80+
"\n",
81+
"ex:Sarah a foaf:Person ; # Complete implicit type for age\n",
82+
"\tfoaf:age 22 ;\n",
83+
"\tfoaf:name \"Sarah\" ;\n",
84+
"\tfoaf:familyName \"Salem\" .\n",
85+
"\n",
86+
"ex:Bella a foaf:Person ; # Missing familyName\n",
87+
"\tfoaf:age \"56\"^^xsd:integer ;\n",
88+
"\tfoaf:name \"Isabella\" .\n",
89+
"\n",
90+
"ex:David a foaf:Person ; # Missing age and use knows\n",
91+
"\tfoaf:name \"David\" ;\n",
92+
"\tfoaf:familyName \"Doulofeau\" ;\n",
93+
"\tfoaf:knows ex:Sarah .\n",
94+
"\n",
95+
"ex:HumanLike foaf:name \"Person\" ; # foaf properties, but not explicit type.\n",
96+
"\tfoaf:familyName \"Maybe\" ;\n",
97+
"\tfoaf:age 99 ;\n",
98+
"\tfoaf:knows ex:David .\n",
99+
"\n",
100+
"\n",
101+
"ex:x1 rdf:type foaf:Document ;\n",
102+
"\tfoaf:depiction \"A thing that is nice\" ;\n",
103+
"\tfoaf:title \"A nice thing\" .\n",
104+
"\n",
105+
"\n",
106+
"ex:x2 rdf:type foaf:Document ;\n",
107+
"\tfoaf:title \"Another thing\" .\"\"\"\n",
108+
"\n",
109+
"from shexer.consts import TURTLE_ITER, TURTLE\n",
110+
"\n",
111+
"shaper = Shaper(\n",
112+
" raw_graph=raw_graph_turtle, # parameter to pass the input str\n",
113+
" namespaces_dict=default_namespaces(), # some namespaces to prettify the result\n",
114+
" all_classes_mode=True, # get a shape for each class with instances\n",
115+
" input_format=TURTLE_ITER) # input format should be indicated. Available options are in shexer.consts\n",
116+
" # TURTLE_ITER is Turtle, but the parser used is implemented within sheXer\n",
117+
" # The TURTLE input format is also available. That option uses rdflib's parser\n",
118+
"\n",
119+
"str_result = shaper.shex_graph(string_output=True)\n",
120+
"print(str_result)\n"
121+
],
122+
"metadata": {
123+
"id": "pXcChiaB7gUU"
124+
},
125+
"execution_count": null,
126+
"outputs": []
127+
},
128+
{
129+
"cell_type": "code",
130+
"source": [
131+
"# Same case, same content. But now the input is in a remote file\n",
132+
"\n",
133+
"shaper = Shaper(\n",
134+
" graph_file_input=\"https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.ttl\", # parameter to pass a path for a remote file\n",
135+
" namespaces_dict=default_namespaces(),\n",
136+
" all_classes_mode=True,\n",
137+
" input_format=TURTLE) # using rdflib's parser, some more namespaces appear in the results (the ones preloaded in an rdflib.Graph() object)\n",
138+
"\n",
139+
"str_result = shaper.shex_graph(string_output=True)\n",
140+
"print(str_result)"
141+
],
142+
"metadata": {
143+
"id": "k-nBQyCr9m6i"
144+
},
145+
"execution_count": null,
146+
"outputs": []
147+
},
148+
{
149+
"cell_type": "code",
150+
"source": [
151+
"# Same case, but now the file is local\n",
152+
"\n",
153+
"import requests\n",
154+
"\n",
155+
"def remote_to_local(url, local_path):\n",
156+
" response = requests.get(url)\n",
157+
" if response.status_code == 200:\n",
158+
" with open(local_path, \"w\", encoding=\"utf-8\") as out_stream:\n",
159+
" out_stream.write(response.text)\n",
160+
"\n",
161+
"remote_to_local(\"https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.ttl\",\n",
162+
" \"local_file.ttl\")\n",
163+
"\n",
164+
"shaper = Shaper(\n",
165+
" graph_file_input=\"./local_file.ttl\", # the parameter for local and remote files is the same\n",
166+
" namespaces_dict=default_namespaces(),\n",
167+
" all_classes_mode=True,\n",
168+
" input_format=TURTLE)\n",
169+
"\n",
170+
"str_result = shaper.shex_graph(string_output=True)\n",
171+
"print(str_result)"
172+
],
173+
"metadata": {
174+
"id": "R1tORPF5_VKy"
175+
},
176+
"execution_count": null,
177+
"outputs": []
178+
},
179+
{
180+
"cell_type": "code",
181+
"source": [
182+
"# Same case, but now the parsed content is N-Triples instead. This format uses an internal parser too\n",
183+
"from shexer.consts import NT\n",
184+
"\n",
185+
"remote_to_local(\"https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.nt\",\n",
186+
" \"local_file.nt\")\n",
187+
"\n",
188+
"shaper = Shaper(\n",
189+
" graph_file_input=\"./local_file.nt\", # the parameter for local and remote files is the same\n",
190+
" namespaces_dict=default_namespaces(),\n",
191+
" all_classes_mode=True,\n",
192+
" input_format=NT)\n",
193+
"\n",
194+
"str_result = shaper.shex_graph(string_output=True)\n",
195+
"print(str_result)"
196+
],
197+
"metadata": {
198+
"id": "yJNPlCTWFmc_"
199+
},
200+
"execution_count": null,
201+
"outputs": []
202+
},
203+
{
204+
"cell_type": "code",
205+
"source": [
206+
"# Same case, but now the content is split into two different files that should be parsed as a single dataset\n",
207+
"\n",
208+
"remote_to_local(\"https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.nt\",\n",
209+
" \"local_file.nt\")\n",
210+
"\n",
211+
"with open(\"local_file.nt\") as in_stream:\n",
212+
" lines = in_stream.readlines()\n",
213+
" with open(\"local_file_pt1.nt\", \"w\") as out_1: # Writing the first 7 lines to \"local_file_pt1.nt\"\n",
214+
" out_1.write(\"\".join(lines[:7]))\n",
215+
" with open(\"local_file_pt2.nt\", \"w\") as out_2: # Writing the rest of the lines to \"local_file_pt2.nt\"\n",
216+
" out_2.write(\"\".join(lines[7:]))\n",
217+
"\n",
218+
"shaper = Shaper(\n",
219+
" graph_list_of_files_input=[\"./local_file_pt1.nt\", \"./local_file_pt2.nt\"], # Use this parameter to parse a list of files instead of a single file\n",
220+
" namespaces_dict=default_namespaces(),\n",
221+
" all_classes_mode=True,\n",
222+
" input_format=NT)\n",
223+
"\n",
224+
"str_result = shaper.shex_graph(string_output=True)\n",
225+
"print(str_result)\n"
226+
],
227+
"metadata": {
228+
"id": "e6dWseCJGCEE"
229+
},
230+
"execution_count": null,
231+
"outputs": []
232+
},
233+
{
234+
"cell_type": "code",
235+
"source": [
236+
"# Same case, but now we process an rdflib graph object\n",
237+
"\n",
238+
"from rdflib import Graph\n",
239+
"\n",
240+
"remote_to_local(\"https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.ttl\",\n",
241+
" \"local_file.ttl\")\n",
242+
"\n",
243+
"g = Graph()\n",
244+
"g.parse(\"./local_file.ttl\")\n",
245+
"\n",
246+
"shaper = Shaper(\n",
247+
" rdflib_graph=g, # Use this parameter for rdflib inputs\n",
248+
" namespaces_dict=default_namespaces(),\n",
249+
" all_classes_mode=True,\n",
250+
" input_format=TURTLE)\n",
251+
"\n",
252+
"str_result = shaper.shex_graph(string_output=True)\n",
253+
"print(str_result)"
254+
],
255+
"metadata": {
256+
"id": "NabS5L4u0kuc"
257+
},
258+
"execution_count": null,
259+
"outputs": []
260+
},
261+
{
262+
"cell_type": "code",
263+
"source": [
264+
"# Same target content, but now it is compressed in a ZIP file\n",
265+
"# In this case, the file consists of a single file. But in case the file\n",
266+
"# zips more files, the setting would be the same and all files would be\n",
267+
"# parsed as a single data source\n",
268+
"\n",
269+
"from shexer.consts import ZIP, TURTLE_ITER\n",
270+
"import urllib.request\n",
271+
"\n",
272+
"urllib.request.urlretrieve(\"https://github.com/weso/shexer/raw/refs/heads/master/test/t_files/compression/t_graph_1.ttl.zip\",\n",
273+
" \"local_file.zip\")\n",
274+
"\n",
275+
"shaper = Shaper(\n",
276+
" graph_file_input=\"./local_file.zip\", # as it is a file input, we still use this parameter to declare the path\n",
277+
" namespaces_dict=default_namespaces(),\n",
278+
" all_classes_mode=True,\n",
279+
" input_format=TURTLE_ITER,\n",
280+
" compression_mode=ZIP)\n",
281+
"\n",
282+
"str_result = shaper.shex_graph(string_output=True)\n",
283+
"print(str_result)"
284+
],
285+
"metadata": {
286+
"id": "_6gQPxMa2G-2"
287+
},
288+
"execution_count": null,
289+
"outputs": []
290+
},
291+
{
292+
"cell_type": "code",
293+
"source": [
294+
"# Example to generate a shape for some nodes exposed in DBpedia endpoint.\n",
295+
"# With this setting, only 4 nodes (result of the SPARQL query) will be used\n",
296+
"# as example. Only exploring direct connections with those nodes. No other\n",
297+
"# shape than the ones specified in the shape map will be generated\n",
298+
"\n",
299+
"\n",
300+
"\n",
301+
"shape_map_raw = \"SPARQL'select ?s where {?s a <http://dbpedia.org/ontology/Person>} LIMIT 4'@<ShapePerson>\"\n",
302+
"shaper = Shaper(shape_map_raw=shape_map_raw, # indicates target shapes and nodes as a shape map\n",
303+
" url_endpoint=\"https://dbpedia.org/sparql\", # target url\n",
304+
" namespaces_dict=default_namespaces(), # some namespaces to prettify results.\n",
305+
" depth_for_building_subgraph=1, # distance of exploration from seed nodes\n",
306+
" track_classes_for_entities_at_last_depth_level=False, # no exception for the previous rule\n",
307+
" all_classes_mode=False) # no class out of the content specified in the shape map will generate a shape\n",
308+
"str_result = shaper.shex_graph(string_output=True,\n",
309+
" acceptance_threshold=0.9) # Only accept very frequent observations\n",
310+
"print(str_result)"
311+
],
312+
"metadata": {
313+
"id": "S0fEjGgf1RQX"
314+
},
315+
"execution_count": null,
316+
"outputs": []
317+
}
318+
]
319+
}

0 commit comments

Comments
 (0)