weso
diff --git a/‎doc/input_types.ipynb‎
Lines changed: 319 additions & 0 deletions b/‎doc/input_types.ipynb‎
Lines changed: 319 additions & 0 deletions
@@ -0,0 +1,319 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "tgDn9FgB09vu"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install shexer"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "sheXer can handle different types of inputs:\n",
+        "\n",
+        "* Local/remote files.\n",
+        "* in-memory string content.\n",
+        "* SPARQL endpoints.\n",
+        "* Compressed files.\n",
+        "* RDFlib graphs.\n",
+        "\n",
+        "In this notebook, you'll find examples on how to provide such inputs to sheXer\n"
+      ],
+      "metadata": {
+        "id": "8FnmPwt91Dxn"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from shexer.shaper import Shaper\n",
+        "\n",
+        "def default_namespaces():\n",
+        "    return {\"http://example.org/\": \"ex\",\n",
+        "            \"http://www.w3.org/XML/1998/namespace/\": \"xml\",\n",
+        "            \"http://www.w3.org/1999/02/22-rdf-syntax-ns#\": \"rdf\",\n",
+        "            \"http://www.w3.org/2000/01/rdf-schema#\": \"rdfs\",\n",
+        "            \"http://www.w3.org/2001/XMLSchema#\": \"xsd\",\n",
+        "            \"http://xmlns.com/foaf/0.1/\": \"foaf\"\n",
+        "            }"
+      ],
+      "metadata": {
+        "id": "atpnK9s82Q5o"
+      },
+      "execution_count": 3,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Getting some shapes for a graph in an str object\n",
+        "\n",
+        "raw_graph_turtle = \"\"\"@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n",
+        "@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n",
+        "@prefix ex: <http://example.org/> .\n",
+        "@prefix foaf: <http://xmlns.com/foaf/0.1/> .\n",
+        "@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n",
+        "\n",
+        "ex:Jimmy a foaf:Person ;  # Complete\n",
+        "\tfoaf:age \"23\"^^xsd:integer ;\n",
+        "\tfoaf:name \"Jimmy\" ;\n",
+        "\tfoaf:familyName \"Jones\" .\n",
+        "\n",
+        "ex:Sarah a foaf:Person ;  # Complete implicit type for age\n",
+        "\tfoaf:age 22 ;\n",
+        "\tfoaf:name \"Sarah\" ;\n",
+        "\tfoaf:familyName \"Salem\" .\n",
+        "\n",
+        "ex:Bella a foaf:Person ;  # Missing familyName\n",
+        "\tfoaf:age \"56\"^^xsd:integer ;\n",
+        "\tfoaf:name \"Isabella\" .\n",
+        "\n",
+        "ex:David a foaf:Person ;  # Missing age and use knows\n",
+        "\tfoaf:name \"David\" ;\n",
+        "\tfoaf:familyName \"Doulofeau\" ;\n",
+        "\tfoaf:knows ex:Sarah .\n",
+        "\n",
+        "ex:HumanLike foaf:name \"Person\" ;  # foaf properties, but not explicit type.\n",
+        "\tfoaf:familyName \"Maybe\" ;\n",
+        "\tfoaf:age 99 ;\n",
+        "\tfoaf:knows ex:David .\n",
+        "\n",
+        "\n",
+        "ex:x1 rdf:type foaf:Document ;\n",
+        "\tfoaf:depiction \"A thing that is nice\" ;\n",
+        "\tfoaf:title \"A nice thing\" .\n",
+        "\n",
+        "\n",
+        "ex:x2 rdf:type foaf:Document ;\n",
+        "\tfoaf:title \"Another thing\" .\"\"\"\n",
+        "\n",
+        "from shexer.consts import TURTLE_ITER, TURTLE\n",
+        "\n",
+        "shaper = Shaper(\n",
+        "            raw_graph=raw_graph_turtle,  # parameter to pass the input str\n",
+        "            namespaces_dict=default_namespaces(),  # some namespaces to pretify the result\n",
+        "            all_classes_mode=True,  # get a shape for each class with instances\n",
+        "            input_format=TURTLE_ITER)  # input format should be indicated. Available options in shexer.const\n",
+        "                                       # TURTLE_ITER is turtle, but the parser used in implemented within sheXer\n",
+        "                                       # TURTLE input format in available. Such option uses rdflib's parser\n",
+        "\n",
+        "str_result = shaper.shex_graph(string_output=True)\n",
+        "print(str_result)\n"
+      ],
+      "metadata": {
+        "id": "pXcChiaB7gUU"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Same case, same content. But now the input is in a remote file\n",
+        "\n",
+        "shaper = Shaper(\n",
+        "            graph_file_input=\"https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.ttl\",  # parameter to pass a path for a remote file\n",
+        "            namespaces_dict=default_namespaces(),\n",
+        "            all_classes_mode=True,\n",
+        "            input_format=TURTLE)  # using rdflib's parser, some more namespaces appear in the results (precharged ones in an rdflig.Graph() object)\n",
+        "\n",
+        "str_result = shaper.shex_graph(string_output=True)\n",
+        "print(str_result)"
+      ],
+      "metadata": {
+        "id": "k-nBQyCr9m6i"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Same case, but now the file is local\n",
+        "\n",
+        "import requests\n",
+        "\n",
+        "def remote_to_local(url, local_path):\n",
+        "  response = requests.get(url)\n",
+        "  if response.status_code == 200:\n",
+        "      with open(local_path, \"w\", encoding=\"utf-8\") as out_stream:\n",
+        "          out_stream.write(response.text)\n",
+        "\n",
+        "remote_to_local(\"https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.ttl\",\n",
+        "                \"local_file.ttl\")\n",
+        "\n",
+        "shaper = Shaper(\n",
+        "            graph_file_input=\"./local_file.ttl\",  # the parameter for local and remote files is the same\n",
+        "            namespaces_dict=default_namespaces(),\n",
+        "            all_classes_mode=True,\n",
+        "            input_format=TURTLE)\n",
+        "\n",
+        "str_result = shaper.shex_graph(string_output=True)\n",
+        "print(str_result)"
+      ],
+      "metadata": {
+        "id": "R1tORPF5_VKy"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Same case, but now the parsed content if N-Triples instead. This format uses an internal parser too\n",
+        "from shexer.consts import NT\n",
+        "\n",
+        "remote_to_local(\"https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.nt\",\n",
+        "                \"local_file.nt\")\n",
+        "\n",
+        "shaper = Shaper(\n",
+        "            graph_file_input=\"./local_file.nt\",  # the parameter for local and remote files is the same\n",
+        "            namespaces_dict=default_namespaces(),\n",
+        "            all_classes_mode=True,\n",
+        "            input_format=NT)\n",
+        "\n",
+        "str_result = shaper.shex_graph(string_output=True)\n",
+        "print(str_result)"
+      ],
+      "metadata": {
+        "id": "yJNPlCTWFmc_"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#Same case, but now the content is split in two different files that should be parsed as a single dataset\n",
+        "\n",
+        "remote_to_local(\"https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.nt\",\n",
+        "                \"local_file.nt\")\n",
+        "\n",
+        "with open(\"local_file.nt\") as in_stream:\n",
+        "  lines = in_stream.readlines()\n",
+        "  with open(\"local_file_pt1.nt\", \"w\") as out_1:  # Writing 7 first lines to \"local_file_pt1.nt\"\n",
+        "    out_1.write(\"\".join(lines[:7]))\n",
+        "  with open(\"local_file_pt2.nt\", \"w\") as out_2: # Writing the rest of lines to to \"local_file_pt2.nt\"\n",
+        "    out_2.write(\"\".join(lines[7:]))\n",
+        "\n",
+        "shaper = Shaper(\n",
+        "            graph_list_of_files_input=[\"./local_file_pt1.nt\", \"./local_file_pt2.nt\"],  # Use this parameter to parse a list of files instead of a single file\n",
+        "            namespaces_dict=default_namespaces(),\n",
+        "            all_classes_mode=True,\n",
+        "            input_format=NT)\n",
+        "\n",
+        "str_result = shaper.shex_graph(string_output=True)\n",
+        "print(str_result)\n"
+      ],
+      "metadata": {
+        "id": "e6dWseCJGCEE"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Same case, but now we process an rdflib graph object\n",
+        "\n",
+        "from rdflib import Graph\n",
+        "\n",
+        "remote_to_local(\"https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.ttl\",\n",
+        "                \"local_file.ttl\")\n",
+        "\n",
+        "g = Graph()\n",
+        "g.parse(\"./local_file.ttl\")\n",
+        "\n",
+        "shaper = Shaper(\n",
+        "            rdflib_graph=g,  # Use this parameter for rdflib inputs\n",
+        "            namespaces_dict=default_namespaces(),\n",
+        "            all_classes_mode=True,\n",
+        "            input_format=TURTLE)\n",
+        "\n",
+        "str_result = shaper.shex_graph(string_output=True)\n",
+        "print(str_result)"
+      ],
+      "metadata": {
+        "id": "NabS5L4u0kuc"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Same target content, but not it is compressed in a ZIP file\n",
+        "# In this case,t he file consist of a single file. But in case the file\n",
+        "# zips more files, the setting would be the same and all files woul be parsed\n",
+        "# parsed as a single data source\n",
+        "\n",
+        "from shexer.consts  import ZIP, TURTLE_ITER\n",
+        "import urllib.request\n",
+        "\n",
+        "urllib.request.urlretrieve(\"https://github.com/weso/shexer/raw/refs/heads/master/test/t_files/compression/t_graph_1.ttl.zip\",\n",
+        "                           \"local_file.zip\")\n",
+        "\n",
+        "shaper = Shaper(\n",
+        "            graph_file_input=\"./local_file.zip\",  # as it is a file input, we still use this parameter to declare the path\n",
+        "            namespaces_dict=default_namespaces(),\n",
+        "            all_classes_mode=True,\n",
+        "            input_format=TURTLE_ITER,\n",
+        "            compression_mode=ZIP)\n",
+        "\n",
+        "str_result = shaper.shex_graph(string_output=True)\n",
+        "print(str_result)"
+      ],
+      "metadata": {
+        "id": "_6gQPxMa2G-2"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Example to generate a shape for some nodes exposed in DBpedia endpoint.\n",
+        "# With this setting, only 4 nodes (result of the SPARQL query) will be used\n",
+        "# as example. Only exploring direct connections with those nodes. No other\n",
+        "# chape than the specified in the shape map will be generated\n",
+        "\n",
+        "\n",
+        "\n",
+        "shape_map_raw = \"SPARQL'select ?s where {?s a <http://dbpedia.org/ontology/Person>} LIMIT 4'@<ShapePerson>\"\n",
+        "shaper = Shaper(shape_map_raw=shape_map_raw,  # indicates target shapes and nodes as a shape map\n",
+        "                url_endpoint=\"https://dbpedia.org/sparql\",  # target url\n",
+        "                namespaces_dict=default_namespaces(),  #some namespaces to pretify results.\n",
+        "                depth_for_building_subgraph=1,  # distance fo exploration from seed nodes\n",
+        "                track_classes_for_entities_at_last_depth_level=False, # no exception for the previous rule\n",
+        "                all_classes_mode=False)  # no class out of the content specified in the shape map will generate a shape\n",
+        "str_result = shaper.shex_graph(string_output=True,\n",
+        "                               acceptance_threshold=0.9)  # Only accept very frequent observations\n",
+        "print(str_result)"
+      ],
+      "metadata": {
+        "id": "S0fEjGgf1RQX"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}