Merge branch 'enhancement-#190'

DaniFdezAlvarez · DaniFdezAlvarez · commit 68454d2d8bdb · 2025-05-27T21:09:00.000+02:00
diff --git a/setup.py b/setup.py
@@ -8,12 +8,12 @@ def read(file_path):
 setup(
   name = 'shexer',
   packages = find_packages(exclude=["*.local_code.*"]), # this must be the same as the name above
-  version = '2.6.2',
+  version = '2.6.3',
   description = 'Automatic schema extraction for RDF graphs',
   author = 'Daniel Fernandez-Alvarez',
   author_email = 'danifdezalvarez@gmail.com',
   url = 'https://github.com/DaniFdezAlvarez/shexer',
-  download_url = 'https://github.com/DaniFdezAlvarez/shexer/archive/2.6.2.tar.gz',
+  download_url = 'https://github.com/DaniFdezAlvarez/shexer/archive/2.6.3.tar.gz',
   keywords = ['testing', 'shexer', 'shexerp3', "rdf", "shex", "shacl", "schema"],
   long_description = read('README.md'),
   long_description_content_type='text/markdown',
diff --git a/shexer/io/graph/yielder/big_ttl_triples_yielder.py b/shexer/io/graph/yielder/big_ttl_triples_yielder.py
@@ -1,5 +1,6 @@
 from shexer.io.graph.yielder.base_triples_yielder import BaseTriplesYielder
 from shexer.utils.uri import remove_corners, unprefixize_uri_mandatory
+from shexer.utils.literal import find_next_unescaped_quotes
 from shexer.utils.triple_yielders import tune_subj, tune_prop, tune_token
 import re
 
@@ -68,7 +69,7 @@ def __init__(self, source_file=None, allow_untyped_numbers=True, raw_graph=None,
     def yield_triples(self):
         self._reset_parsing()
         for a_line in self._read_normalized_lines():
-            for a_triple in self._process_line_2(a_line):
+            for a_triple in self._process_line(a_line):
                 self._triples_count += 1
                 yield (
                     tune_subj(a_triple[_S],
@@ -89,7 +90,7 @@ def _clean_line(self, str_line):
 
     def _remove_comments_if_needed(self, str_line):
         """Remove comments in the middle of the line.
-        Lines starting with # wont be erased
+        Lines starting with # won't be erased
         """
         if '"' not in str_line:  # Comment mark and no literals, trivial case
             return str_line[:str_line.find(" #")]
@@ -108,7 +109,7 @@ def _remove_comments_if_needed(self, str_line):
         return str_line  # If this point is reached, it means that the potential comments
                          # are actual content of a string literal
 
-    def _process_line_2(self, str_line):
+    def _process_line(self, str_line):
         str_line = self._clean_line(str_line)
         if str_line == "":
             self._process_empty_line(str_line)
@@ -150,6 +151,8 @@ def _assing_tmp_element_and_promote_state(self, token):
             self._state = _WAITING_FOR_OBJ
         elif self._state == _WAITING_FOR_OBJ:
             self._tmp_o = self._parse_elem(token)
+            if self._tmp_o.startswith('"'):
+                self._tmp_o = self._tmp_o.replace('\\\\"','\\"')
             self._state = _NOT_WAITING
         else:
             raise ValueError("Malformed file. Processing an unexpected token: " + token)
@@ -177,85 +180,16 @@ def _find_next_blank(self, target_str, start_index):
         pos = target_str.find(" ", start_index)
         return len(target_str)-1 if pos == -1 else pos
 
-
-    def _find_next_unescaped_quotes(self, target_str, start_index):
-        pos = target_str.find('"', start_index)
-        while pos != -1:
-            if target_str[pos-1] != "\\":
-                return pos  # not escaped
-            # if pos >= 2 and target_str[pos-2] == '\\':
-            #     return pos  # the scape is scaped, so not escaped
-            if self._count_prior_backslashes(an_str=target_str,
-                                             quote_pos=pos) % 2 == 0:
-                return pos # the scape is scaped, so not escaped
-            pos = target_str.find('"', pos+1)
-        if pos == -1:
-            raise ValueError("Is this line malformed? Can`t find quotes matching: " + target_str)
-
-    def _count_prior_backslashes(self, an_str, quote_pos):
-        """
-        We assume that there is at least a backslash at an_str[pos-1], so pos-1 is a non-negative index of an_str
-        """
-        counter = 1
-        quote_pos -= 2
-        while quote_pos >= 0:
-            if an_str[quote_pos] == "\\":
-                counter += 1
-            else:
-                return counter
-            quote_pos -= 1
-        return counter
-
-
     def _find_next_quoted_literal_ending(self, target_str, start_index):
-        next_quotes = self._find_next_unescaped_quotes(target_str=target_str,
-                                                       start_index=start_index+1)
+        next_quotes = find_next_unescaped_quotes(target_str=target_str,
+                                                 start_index=start_index+1)
         if next_quotes +1 > len(target_str) or target_str[next_quotes + 1] == " ":
             return next_quotes
         elif target_str[next_quotes + 1] in _SPECIAL_CHARS_AFTER_QUOTES:
             return self._find_next_blank(target_str, next_quotes) - 1
         else:
             raise ValueError("Malformed literal? It seems like there is a problem of unmatching quotes: " + target_str)
 
-    def _process_line(self, str_line):
-        str_line = self._clean_line(str_line)
-        if str_line == "":
-            self._process_empty_line(str_line)
-        elif '"' in str_line:
-            self._process_line_with_literal(str_line)
-        elif str_line.startswith("@prefix"):
-            self._process_prefix_line(str_line)
-        elif str_line.startswith("@base"):
-            self._process_base_line(str_line)
-        elif str_line.startswith("#"):
-            self._process_comment_line(str_line)
-        elif str_line[-1] in [",", ".", ";"]:
-            if ", " in str_line[:-1]:
-                # If there is a comma in a URI, it can't be followed by a blank
-                self._process_multi_triple_line_commas(str_line)
-            else:
-                self._process_single_triple_line(str_line)
-        elif " " not in str_line:
-            if len(str_line) > 1:  # We are ensuring that this is not a single char, such as "," or "."
-                self._process_isolated_subject(str_line)
-        else:
-            self._process_unknown_line(str_line)
-
-    def _process_line_with_literal(self, line):
-        first_quotes_index = line.find('"')
-        s_o_line = line[:first_quotes_index].strip()
-        s_o_pieces = s_o_line.split(" ")
-        if len(s_o_pieces) == 2:
-            self._tmp_s = self._parse_elem(s_o_pieces[0])
-            self._tmp_p = self._parse_elem(s_o_pieces[1])
-        elif len(s_o_pieces) == 1 and s_o_pieces[0] != "":
-            self._tmp_p = self._parse_elem(s_o_pieces[0])
-        # The last char MUST be in [,.;] since this lines comes stripped.
-        # SO everything between first_quotes_index and line[-1], stripped
-        # should be out target literal (typed or not)
-        self._tmp_o = line[first_quotes_index:-1].rstrip()
-        self._decide_current_triple()
-
     def _process_prefix_line(self, line):
         pieces = line.split(" ")
         prefix = pieces[1] if not pieces[1].endswith(":") else pieces[1][: - 1]
@@ -316,7 +250,7 @@ def _process_single_triple_line(self, line):
         self._decide_current_triple()
 
     def _process_isolated_subject(self, line):
-        # No splitt. Line is expected to contain a line with no blanks (isolated subject)
+        # No split. Line is expected to contain a line with no blanks (isolated subject)
         self._tmp_s = self._parse_elem(line)
         # No need to decide triple now, incomplete element
 
@@ -361,7 +295,7 @@ def _parse_elem(self, raw_elem):
                                              prefix_namespaces_dict=self._prefixes)
         elif raw_elem in _BOOLEANS or self._is_num_literal(raw_elem):
             return raw_elem
-            # else?? shouldnt happen, let it break with a nullpoitner
+            # else?? shouldn't happen, let it break with a null pointer
 
     def _parse_cornered_element(self, cornered_element):
         if self._base is None:
@@ -398,13 +332,13 @@ def _read_normalized_lines(self):
             if not waiting and '"""' not in a_line:
                 yield a_line
             elif waiting and '"""' not in a_line:
-                tmp += self._scape_quotes_in_normalized_line(a_line)
+                tmp += "\\n" + self._scape_quotes_in_normalized_line(a_line)
             elif not waiting and '"""' in a_line:
                 waiting = True
                 tmp = self._scape_quotes_in_normalized_line(a_line).replace('"""', '"', 1)
             elif waiting and '"""' in a_line:
                 waiting = False
-                yield tmp + self._scape_quotes_in_normalized_line(a_line).replace('"""', '"', 1)
+                yield tmp + "\\n" + self._scape_quotes_in_normalized_line(a_line).replace('"""', '"', 1)
                 tmp = ''
 
     def _scape_quotes_in_normalized_line(self, target):
diff --git a/shexer/io/graph/yielder/nt_triples_yielder.py b/shexer/io/graph/yielder/nt_triples_yielder.py
@@ -1,5 +1,5 @@
 from shexer.utils.log import log_msg
-from shexer.utils.uri import there_is_arroba_after_last_quotes
+from shexer.utils.literal import there_is_arroba_after_last_quotes
 from shexer.utils.triple_yielders import tune_prop, tune_token  # , check_if_property_belongs_to_namespace_list
 from shexer.io.graph.yielder.base_triples_yielder import BaseTriplesYielder
 
diff --git a/shexer/io/graph/yielder/rdflib_triple_yielder.py b/shexer/io/graph/yielder/rdflib_triple_yielder.py
@@ -7,7 +7,7 @@
 from shexer.model.bnode import BNode as model_BNode
 from shexer.model.property import Property as model_Property
 
-from shexer.utils.uri import decide_literal_type
+from shexer.utils.literal import decide_literal_type
 from shexer.utils.compression import get_content_gz_file, get_content_zip_internal_file, get_content_xz_file
 
 _SUPPORTED_FORMATS = [N3, TURTLE, RDF_XML, NT, JSON_LD]
diff --git a/shexer/io/rdfconfig/formater/rdfconfig_serializer.py b/shexer/io/rdfconfig/formater/rdfconfig_serializer.py
@@ -109,6 +109,7 @@ def _shape_tag_for_var_name(self, class_uri):
             else:
                 last_piece = last_piece[last_piece[:-1].rfind("/") + 1:]
         return last_piece
+
     def _create_subject_name_for_shape(self, shape_uri):
         shape_uri.replace("_", "")
         shape_uri.replace("-", "")
@@ -171,7 +172,8 @@ def _serialize_constraint(self, shape, constraint):
                 elif not example_cons.startswith('"'):
                     example_cons = f'"{example_cons}"'
             if len(example_cons) >= 2:
-                example_cons =  example_cons[0] +  example_cons[1:-1].replace('"', '\\"') + example_cons[-1]
+                # example_cons =  example_cons[0] +  example_cons[1:-1].replace('"', '\\"') + example_cons[-1]
+                example_cons = example_cons[0] + example_cons[1:-1] + example_cons[-1]
             self._write_shape_line(indentation=_PROPERTY_INDENT_LEVEL,
                                    content=f"{st_property}:")
             self._write_shape_line(indentation=_CONSTRAINT_INDENT_LEVEL,
diff --git a/shexer/io/shacl/formater/shacl_serializer.py b/shexer/io/shacl/formater/shacl_serializer.py
@@ -2,7 +2,7 @@
 from shexer.model.shape import STARTING_CHAR_FOR_SHAPE_NAME
 from rdflib import Graph, Namespace, URIRef, RDF, BNode, XSD, Literal
 from shexer.model.statement import POSITIVE_CLOSURE, KLEENE_CLOSURE, OPT_CARDINALITY
-from shexer.utils.uri import XSD_NAMESPACE, LANG_STRING_TYPE
+from shexer.utils.literal import XSD_NAMESPACE, LANG_STRING_TYPE
 from shexer.model.const_elem_types import IRI_ELEM_TYPE, LITERAL_ELEM_TYPE, DOT_ELEM_TYPE, BNODE_ELEM_TYPE
 from shexer.io.wikidata import wikidata_annotation
 from wlighter import TURTLE_FORMAT
diff --git a/shexer/io/shape_map/node_selector/node_selector_parser.py b/shexer/io/shape_map/node_selector/node_selector_parser.py
@@ -1,5 +1,6 @@
 
-from shexer.utils.uri import remove_corners, add_corners, RDF_TYPE
+from shexer.utils.uri import remove_corners, add_corners
+from shexer.utils.literal import RDF_TYPE
 from shexer.model.node_selector import NodeSelectorNoSparql, NodeSelectorSparql
 from rdflib.plugins import sparql
 import re
diff --git a/shexer/shaper.py b/shexer/shaper.py
@@ -286,16 +286,14 @@ def shex_graph(self, string_output=False,
                                                        rdfconfig_directory=rdfconfig_directory,
                                                        verbose=verbose)
             current_result = serializer.serialize_shapes()
-        if current_result is None:
-            current_result = ""
 
         if string_output or output_file is not None:
             log_msg(verbose=verbose,
                     msg="Generating text serialization...")
             serializer = self._build_shapes_serializer(target_file=output_file,
                                                        string_return=string_output,
                                                        output_format=output_format,
-                                                       rdfconfig_directory=rdfconfig_directory,
+                                                       rdfconfig_directory=None,
                                                        verbose=verbose)
             res = serializer.serialize_shapes()
             if string_output:
diff --git a/shexer/utils/literal.py b/shexer/utils/literal.py
@@ -0,0 +1,89 @@
+
+XSD_NAMESPACE = "http://www.w3.org/2001/XMLSchema#"
+XSD_PREFIX = "xsd"
+
+RDF_SYNTAX_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+RDF_PREFIX = "rdf"
+RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
+
+DT_NAMESPACE = "http://dbpedia.org/datatype/"
+DT_PREFIX = "dt"
+
+OPENGIS_NAMESPACE = "http://www.opengis.net/ont/geosparql#"
+OPENGIS_PREFIX = "geo"
+
+LANG_STRING_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#langString"
+STRING_TYPE = "http://www.w3.org/2001/XMLSchema#string"
+FLOAT_TYPE = "http://www.w3.org/2001/XMLSchema#float"
+INTEGER_TYPE = "http://www.w3.org/2001/XMLSchema#integer"
+
+from shexer.model.shape import STARTING_CHAR_FOR_SHAPE_NAME
+
+def _count_prior_backslashes(an_str, quote_pos):
+    """
+    Count the consecutive backslashes placed right before the quote found at
+    an_str[quote_pos].
+
+    We assume that an_str[quote_pos - 1] is already known to be a backslash,
+    so the count starts at 1 and that character is not inspected again.
+    """
+    total = 1
+    index = quote_pos - 2
+    # Walk leftwards for as long as the run of backslashes continues
+    while index >= 0 and an_str[index] == "\\":
+        total += 1
+        index -= 1
+    return total
+
+def find_next_unescaped_quotes(target_str, start_index):
+    """
+    Return the index of the first double quote located at or after
+    start_index which is not escaped, i.e. which is preceded by an even
+    number (zero included) of consecutive backslashes.
+
+    Raises ValueError if target_str has no unescaped quote from start_index on.
+    """
+    pos = target_str.find('"', start_index)
+    while pos != -1:
+        # Count the backslashes right before the quote. The scan stops at
+        # index 0, so a quote opening the string is never compared against
+        # target_str[-1].
+        backslashes = 0
+        cursor = pos - 1
+        while cursor >= 0 and target_str[cursor] == "\\":
+            backslashes += 1
+            cursor -= 1
+        if backslashes % 2 == 0:
+            return pos  # no backslashes, or every backslash is itself escaped
+        pos = target_str.find('"', pos + 1)
+    raise ValueError("Is this line malformed? Can`t find quotes matching: " + target_str)
+
+def there_is_arroba_after_last_quotes(target_str):
+    """
+    Tell whether the last occurrence of STARTING_CHAR_FOR_SHAPE_NAME in
+    target_str is placed after the last double quote.
+    """
+    last_marker = target_str.rfind(STARTING_CHAR_FOR_SHAPE_NAME)
+    last_quotes = target_str.rfind('"')
+    return last_marker > last_quotes
+
+def decide_literal_type(a_literal, base_namespace=None):
+    """
+    Return the datatype URI (as a plain string, no corners) of a serialized literal.
+
+    - Literals for which there_is_arroba_after_last_quotes() holds are
+      reported as LANG_STRING_TYPE.
+    - Literals with no '"^^' marker are reported as STRING_TYPE.
+    - Otherwise, only the text after the LAST '"^^' marker is inspected, so
+      prefixes or namespaces mentioned inside the literal content can not be
+      mistaken for the datatype. Known prefixes (xsd, rdf, dt, geo) are
+      expanded to their namespaces. A datatype between corners is returned
+      without them, prepending base_namespace when it is provided and the
+      URI does not start with "http".
+
+    Raises RuntimeError when the datatype has an unrecognized form.
+    """
+    if there_is_arroba_after_last_quotes(a_literal):
+        return LANG_STRING_TYPE
+    marker_pos = a_literal.rfind("\"^^")
+    if marker_pos == -1:
+        return STRING_TYPE
+    # Everything after the closing quotes and the ^^ mark is the datatype
+    datatype = a_literal[marker_pos + 3:].strip()
+    known_prefixes = ((XSD_PREFIX, XSD_NAMESPACE),
+                      (RDF_PREFIX, RDF_SYNTAX_NAMESPACE),
+                      (DT_PREFIX, DT_NAMESPACE),
+                      (OPENGIS_PREFIX, OPENGIS_NAMESPACE))
+    for a_prefix, a_namespace in known_prefixes:
+        if datatype.startswith(a_prefix + ":"):
+            return a_namespace + datatype[len(a_prefix) + 1:]
+    if datatype.startswith("<") and datatype.endswith(">"):
+        candidate_type = datatype[1:-1]  # plain uri, no corners
+        if base_namespace is not None and not candidate_type.startswith("http"):
+            return base_namespace + candidate_type
+        return candidate_type
+    raise RuntimeError("Unrecognized literal type:" + a_literal)
+
+def parse_literal(an_elem, base_namespace=None):
+    """
+    Split a quoted literal into a (content, type) tuple.
+
+    The content is the text between the opening quotes at an_elem[0] and the
+    next unescaped quotes, with every escaped quote (backslash + quote) turned
+    into a plain quote. The type is whatever decide_literal_type() returns
+    for the whole element.
+    """
+    content_end = find_next_unescaped_quotes(an_elem, 1)
+    raw_content = an_elem[1:content_end]
+    elem_type = decide_literal_type(a_literal=an_elem,
+                                    base_namespace=base_namespace)
+    return raw_content.replace('\\"', '"'), elem_type
+
+def parse_unquoted_literal(an_elem):
+    """
+    Return a (content, type) tuple for a literal written without quotes.
+    The content is an_elem itself, untouched; the type is the result of
+    decide_literal_type() for it.
+    """
+    return an_elem, decide_literal_type(an_elem)
+
diff --git a/shexer/utils/structures/dicts.py b/shexer/utils/structures/dicts.py
@@ -70,12 +70,16 @@ def _get_constraint_example_inverse(self, shape_id, prop, inverse):
     def _set_constraint_example_no_inverse(self, shape_id, prop_id, example):
         if shape_id not in self._base_dict:
             self._init_shape(shape_id)
-        self._base_dict[shape_id][_PROP_FEATURES_POS][prop_id] = example
+        self._base_dict[shape_id][_PROP_FEATURES_POS][prop_id] = self._normalize_example(example)
 
     def _set_constraint_example_inverse(self, shape_id, prop_id, example, inverse):
         if shape_id not in self._base_dict:
             self._init_shape(shape_id)
-        self._base_dict[shape_id][_PROP_FEATURES_POS][_POS_INVERSE if inverse else _POS_DIRECT][prop_id] = example
+        self._base_dict[shape_id][_PROP_FEATURES_POS][_POS_INVERSE if inverse else _POS_DIRECT][prop_id] = self._normalize_example(example)
+
+    def _normalize_example(self, example):
+        """
+        Return a copy of the example in which every line break is replaced by
+        the two characters backslash + n, and every double quote is preceded
+        by a backslash.
+        """
+        return example.replace("\n", "\\n").replace("\"", '\\"')
 
     def _has_constraint_example_no_inverse(self, shape_id, prop_id):
         if shape_id not in self._base_dict:
diff --git a/shexer/utils/triple_yielders.py b/shexer/utils/triple_yielders.py
@@ -2,7 +2,8 @@
 from shexer.model.property import Property
 from shexer.model.Literal import Literal
 from shexer.model.bnode import BNode
-from shexer.utils.uri import remove_corners, parse_literal, parse_unquoted_literal, FLOAT_TYPE, INTEGER_TYPE
+from shexer.utils.uri import remove_corners
+from shexer.utils.literal import parse_literal, parse_unquoted_literal, FLOAT_TYPE, INTEGER_TYPE
 
 
 def check_if_property_belongs_to_namespace_list(str_prop, namespaces):
diff --git a/shexer/utils/uri.py b/shexer/utils/uri.py