Skip to content

Commit 68454d2

Browse files
Merge branch 'enhancement-#190'
2 parents 123c8f4 + 589ef21 commit 68454d2

File tree

12 files changed

+120
-153
lines changed

12 files changed

+120
-153
lines changed

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@ def read(file_path):
88
setup(
99
name = 'shexer',
1010
packages = find_packages(exclude=["*.local_code.*"]), # this must be the same as the name above
11-
version = '2.6.2',
11+
version = '2.6.3',
1212
description = 'Automatic schema extraction for RDF graphs',
1313
author = 'Daniel Fernandez-Alvarez',
1414
author_email = 'danifdezalvarez@gmail.com',
1515
url = 'https://github.com/DaniFdezAlvarez/shexer',
16-
download_url = 'https://github.com/DaniFdezAlvarez/shexer/archive/2.6.2.tar.gz',
16+
download_url = 'https://github.com/DaniFdezAlvarez/shexer/archive/2.6.3.tar.gz',
1717
keywords = ['testing', 'shexer', 'shexerp3', "rdf", "shex", "shacl", "schema"],
1818
long_description = read('README.md'),
1919
long_description_content_type='text/markdown',

shexer/io/graph/yielder/big_ttl_triples_yielder.py

Lines changed: 12 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from shexer.io.graph.yielder.base_triples_yielder import BaseTriplesYielder
22
from shexer.utils.uri import remove_corners, unprefixize_uri_mandatory
3+
from shexer.utils.literal import find_next_unescaped_quotes
34
from shexer.utils.triple_yielders import tune_subj, tune_prop, tune_token
45
import re
56

@@ -68,7 +69,7 @@ def __init__(self, source_file=None, allow_untyped_numbers=True, raw_graph=None,
6869
def yield_triples(self):
6970
self._reset_parsing()
7071
for a_line in self._read_normalized_lines():
71-
for a_triple in self._process_line_2(a_line):
72+
for a_triple in self._process_line(a_line):
7273
self._triples_count += 1
7374
yield (
7475
tune_subj(a_triple[_S],
@@ -89,7 +90,7 @@ def _clean_line(self, str_line):
8990

9091
def _remove_comments_if_needed(self, str_line):
9192
"""Remove comments in the middle of the line.
92-
Lines starting with # wont be erased
93+
Lines starting with # won't be erased
9394
"""
9495
if '"' not in str_line: # Comment mark and no literals, trivial case
9596
return str_line[:str_line.find(" #")]
@@ -108,7 +109,7 @@ def _remove_comments_if_needed(self, str_line):
108109
return str_line # If this point is reached, it means that the potential comments
109110
# are actual content of a string literal
110111

111-
def _process_line_2(self, str_line):
112+
def _process_line(self, str_line):
112113
str_line = self._clean_line(str_line)
113114
if str_line == "":
114115
self._process_empty_line(str_line)
@@ -150,6 +151,8 @@ def _assing_tmp_element_and_promote_state(self, token):
150151
self._state = _WAITING_FOR_OBJ
151152
elif self._state == _WAITING_FOR_OBJ:
152153
self._tmp_o = self._parse_elem(token)
154+
if self._tmp_o.startswith('"'):
155+
self._tmp_o = self._tmp_o.replace('\\\\"','\\"')
153156
self._state = _NOT_WAITING
154157
else:
155158
raise ValueError("Malformed file. Processing an unexpected token: " + token)
@@ -177,85 +180,16 @@ def _find_next_blank(self, target_str, start_index):
177180
pos = target_str.find(" ", start_index)
178181
return len(target_str)-1 if pos == -1 else pos
179182

180-
181-
def _find_next_unescaped_quotes(self, target_str, start_index):
182-
pos = target_str.find('"', start_index)
183-
while pos != -1:
184-
if target_str[pos-1] != "\\":
185-
return pos # not escaped
186-
# if pos >= 2 and target_str[pos-2] == '\\':
187-
# return pos # the scape is scaped, so not escaped
188-
if self._count_prior_backslashes(an_str=target_str,
189-
quote_pos=pos) % 2 == 0:
190-
return pos # the scape is scaped, so not escaped
191-
pos = target_str.find('"', pos+1)
192-
if pos == -1:
193-
raise ValueError("Is this line malformed? Can`t find quotes matching: " + target_str)
194-
195-
def _count_prior_backslashes(self, an_str, quote_pos):
196-
"""
197-
We assume that there is at least a backslash at an_str[pos-1], so pos-1 is a non-negative index of an_str
198-
"""
199-
counter = 1
200-
quote_pos -= 2
201-
while quote_pos >= 0:
202-
if an_str[quote_pos] == "\\":
203-
counter += 1
204-
else:
205-
return counter
206-
quote_pos -= 1
207-
return counter
208-
209-
210183
def _find_next_quoted_literal_ending(self, target_str, start_index):
    """
    Locate the index of the last character of the quoted literal opened at start_index.

    :param target_str: line being parsed.
    :param start_index: index of the opening quotes of the literal.
    :return: index of the closing quotes when the literal ends there, or the index
        of the last char before the next blank when the quotes are followed by one
        of the chars in _SPECIAL_CHARS_AFTER_QUOTES.
    :raises ValueError: if no unescaped closing quotes exist (raised by
        find_next_unescaped_quotes) or the char after them is not an expected one.
    """
    next_quotes = find_next_unescaped_quotes(target_str=target_str,
                                             start_index=start_index + 1)
    following = next_quotes + 1
    # '>=' rather than '>': when the closing quotes are the last char of the line,
    # 'following' equals len(target_str) and indexing it would raise IndexError.
    if following >= len(target_str) or target_str[following] == " ":
        return next_quotes
    if target_str[following] in _SPECIAL_CHARS_AFTER_QUOTES:
        # The literal carries a suffix glued to the quotes; it ends right before the next blank.
        return self._find_next_blank(target_str, next_quotes) - 1
    raise ValueError("Malformed literal? It seems like there is a problem of unmatching quotes: " + target_str)
219192

220-
def _process_line(self, str_line):
221-
str_line = self._clean_line(str_line)
222-
if str_line == "":
223-
self._process_empty_line(str_line)
224-
elif '"' in str_line:
225-
self._process_line_with_literal(str_line)
226-
elif str_line.startswith("@prefix"):
227-
self._process_prefix_line(str_line)
228-
elif str_line.startswith("@base"):
229-
self._process_base_line(str_line)
230-
elif str_line.startswith("#"):
231-
self._process_comment_line(str_line)
232-
elif str_line[-1] in [",", ".", ";"]:
233-
if ", " in str_line[:-1]:
234-
# If there is a comma in a URI, it can't be followed by a blank
235-
self._process_multi_triple_line_commas(str_line)
236-
else:
237-
self._process_single_triple_line(str_line)
238-
elif " " not in str_line:
239-
if len(str_line) > 1: # We are ensuring that this is not a single char, such as "," or "."
240-
self._process_isolated_subject(str_line)
241-
else:
242-
self._process_unknown_line(str_line)
243-
244-
def _process_line_with_literal(self, line):
245-
first_quotes_index = line.find('"')
246-
s_o_line = line[:first_quotes_index].strip()
247-
s_o_pieces = s_o_line.split(" ")
248-
if len(s_o_pieces) == 2:
249-
self._tmp_s = self._parse_elem(s_o_pieces[0])
250-
self._tmp_p = self._parse_elem(s_o_pieces[1])
251-
elif len(s_o_pieces) == 1 and s_o_pieces[0] != "":
252-
self._tmp_p = self._parse_elem(s_o_pieces[0])
253-
# The last char MUST be in [,.;] since this lines comes stripped.
254-
# SO everything between first_quotes_index and line[-1], stripped
255-
# should be out target literal (typed or not)
256-
self._tmp_o = line[first_quotes_index:-1].rstrip()
257-
self._decide_current_triple()
258-
259193
def _process_prefix_line(self, line):
260194
pieces = line.split(" ")
261195
prefix = pieces[1] if not pieces[1].endswith(":") else pieces[1][: - 1]
@@ -316,7 +250,7 @@ def _process_single_triple_line(self, line):
316250
self._decide_current_triple()
317251

318252
def _process_isolated_subject(self, line):
319-
# No splitt. Line is expected to contain a line with no blanks (isolated subject)
253+
# No split. Line is expected to contain a line with no blanks (isolated subject)
320254
self._tmp_s = self._parse_elem(line)
321255
# No need to decide triple now, incomplete element
322256

@@ -361,7 +295,7 @@ def _parse_elem(self, raw_elem):
361295
prefix_namespaces_dict=self._prefixes)
362296
elif raw_elem in _BOOLEANS or self._is_num_literal(raw_elem):
363297
return raw_elem
364-
# else?? shouldnt happen, let it break with a nullpoitner
298+
# else?? shouldn't happen, let it break with a null pointer
365299

366300
def _parse_cornered_element(self, cornered_element):
367301
if self._base is None:
@@ -398,13 +332,13 @@ def _read_normalized_lines(self):
398332
if not waiting and '"""' not in a_line:
399333
yield a_line
400334
elif waiting and '"""' not in a_line:
401-
tmp += self._scape_quotes_in_normalized_line(a_line)
335+
tmp += "\\n" + self._scape_quotes_in_normalized_line(a_line)
402336
elif not waiting and '"""' in a_line:
403337
waiting = True
404338
tmp = self._scape_quotes_in_normalized_line(a_line).replace('"""', '"', 1)
405339
elif waiting and '"""' in a_line:
406340
waiting = False
407-
yield tmp + self._scape_quotes_in_normalized_line(a_line).replace('"""', '"', 1)
341+
yield tmp + "\\n" + self._scape_quotes_in_normalized_line(a_line).replace('"""', '"', 1)
408342
tmp = ''
409343

410344
def _scape_quotes_in_normalized_line(self, target):

shexer/io/graph/yielder/nt_triples_yielder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from shexer.utils.log import log_msg
2-
from shexer.utils.uri import there_is_arroba_after_last_quotes
2+
from shexer.utils.literal import there_is_arroba_after_last_quotes
33
from shexer.utils.triple_yielders import tune_prop, tune_token # , check_if_property_belongs_to_namespace_list
44
from shexer.io.graph.yielder.base_triples_yielder import BaseTriplesYielder
55

shexer/io/graph/yielder/rdflib_triple_yielder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from shexer.model.bnode import BNode as model_BNode
88
from shexer.model.property import Property as model_Property
99

10-
from shexer.utils.uri import decide_literal_type
10+
from shexer.utils.literal import decide_literal_type
1111
from shexer.utils.compression import get_content_gz_file, get_content_zip_internal_file, get_content_xz_file
1212

1313
_SUPPORTED_FORMATS = [N3, TURTLE, RDF_XML, NT, JSON_LD]

shexer/io/rdfconfig/formater/rdfconfig_serializer.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ def _shape_tag_for_var_name(self, class_uri):
109109
else:
110110
last_piece = last_piece[last_piece[:-1].rfind("/") + 1:]
111111
return last_piece
112+
112113
def _create_subject_name_for_shape(self, shape_uri):
113114
shape_uri.replace("_", "")
114115
shape_uri.replace("-", "")
@@ -171,7 +172,8 @@ def _serialize_constraint(self, shape, constraint):
171172
elif not example_cons.startswith('"'):
172173
example_cons = f'"{example_cons}"'
173174
if len(example_cons) >= 2:
174-
example_cons = example_cons[0] + example_cons[1:-1].replace('"', '\\"') + example_cons[-1]
175+
# example_cons = example_cons[0] + example_cons[1:-1].replace('"', '\\"') + example_cons[-1]
176+
example_cons = example_cons[0] + example_cons[1:-1] + example_cons[-1]
175177
self._write_shape_line(indentation=_PROPERTY_INDENT_LEVEL,
176178
content=f"{st_property}:")
177179
self._write_shape_line(indentation=_CONSTRAINT_INDENT_LEVEL,

shexer/io/shacl/formater/shacl_serializer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from shexer.model.shape import STARTING_CHAR_FOR_SHAPE_NAME
33
from rdflib import Graph, Namespace, URIRef, RDF, BNode, XSD, Literal
44
from shexer.model.statement import POSITIVE_CLOSURE, KLEENE_CLOSURE, OPT_CARDINALITY
5-
from shexer.utils.uri import XSD_NAMESPACE, LANG_STRING_TYPE
5+
from shexer.utils.literal import XSD_NAMESPACE, LANG_STRING_TYPE
66
from shexer.model.const_elem_types import IRI_ELEM_TYPE, LITERAL_ELEM_TYPE, DOT_ELEM_TYPE, BNODE_ELEM_TYPE
77
from shexer.io.wikidata import wikidata_annotation
88
from wlighter import TURTLE_FORMAT

shexer/io/shape_map/node_selector/node_selector_parser.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11

2-
from shexer.utils.uri import remove_corners, add_corners, RDF_TYPE
2+
from shexer.utils.uri import remove_corners, add_corners
3+
from shexer.utils.literal import RDF_TYPE
34
from shexer.model.node_selector import NodeSelectorNoSparql, NodeSelectorSparql
45
from rdflib.plugins import sparql
56
import re

shexer/shaper.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -286,16 +286,14 @@ def shex_graph(self, string_output=False,
286286
rdfconfig_directory=rdfconfig_directory,
287287
verbose=verbose)
288288
current_result = serializer.serialize_shapes()
289-
if current_result is None:
290-
current_result = ""
291289

292290
if string_output or output_file is not None:
293291
log_msg(verbose=verbose,
294292
msg="Generating text serialization...")
295293
serializer = self._build_shapes_serializer(target_file=output_file,
296294
string_return=string_output,
297295
output_format=output_format,
298-
rdfconfig_directory=rdfconfig_directory,
296+
rdfconfig_directory=None,
299297
verbose=verbose)
300298
res = serializer.serialize_shapes()
301299
if string_output:

shexer/utils/literal.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
2+
XSD_NAMESPACE = "http://www.w3.org/2001/XMLSchema#"
3+
XSD_PREFIX = "xsd"
4+
5+
RDF_SYNTAX_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
6+
RDF_PREFIX = "rdf"
7+
RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
8+
9+
DT_NAMESPACE = "http://dbpedia.org/datatype/"
10+
DT_PREFIX = "dt"
11+
12+
OPENGIS_NAMESPACE = "http://www.opengis.net/ont/geosparql#"
13+
OPENGIS_PREFIX = "geo"
14+
15+
LANG_STRING_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#langString"
16+
STRING_TYPE = "http://www.w3.org/2001/XMLSchema#string"
17+
FLOAT_TYPE = "http://www.w3.org/2001/XMLSchema#float"
18+
INTEGER_TYPE = "http://www.w3.org/2001/XMLSchema#integer"
19+
20+
from shexer.model.shape import STARTING_CHAR_FOR_SHAPE_NAME
21+
22+
def _count_prior_backslashes(an_str, quote_pos):
23+
"""
24+
We assume that there is at least a backslash at an_str[pos-1], so pos-1 is a non-negative index of an_str
25+
"""
26+
counter = 1
27+
quote_pos -= 2
28+
while quote_pos >= 0:
29+
if an_str[quote_pos] == "\\":
30+
counter += 1
31+
else:
32+
return counter
33+
quote_pos -= 1
34+
return counter
35+
36+
def find_next_unescaped_quotes(target_str, start_index):
37+
pos = target_str.find('"', start_index)
38+
while pos != -1:
39+
if target_str[pos - 1] != "\\":
40+
return pos # not escaped
41+
# if pos >= 2 and target_str[pos-2] == '\\':
42+
# return pos # the scape is scaped, so not escaped
43+
if _count_prior_backslashes(an_str=target_str,
44+
quote_pos=pos) % 2 == 0:
45+
return pos # the scape is scaped, so not escaped
46+
pos = target_str.find('"', pos + 1)
47+
if pos == -1:
48+
raise ValueError("Is this line malformed? Can`t find quotes matching: " + target_str)
49+
50+
def there_is_arroba_after_last_quotes(target_str):
    """
    Tell whether STARTING_CHAR_FOR_SHAPE_NAME occurs after the last double quote.

    NOTE(review): STARTING_CHAR_FOR_SHAPE_NAME is presumably "@" (hence "arroba");
    confirm in shexer.model.shape.

    :param target_str: string to inspect.
    :return: True if the last occurrence of the char is located after the last
        double quote of target_str, False otherwise.
    """
    last_mark = target_str.rfind(STARTING_CHAR_FOR_SHAPE_NAME)
    last_quotes = target_str.rfind('"')
    return last_mark > last_quotes
54+
55+
def decide_literal_type(a_literal, base_namespace=None):
    """
    Work out the datatype URI of a serialized literal.

    The datatype is read only from the text that follows the closing quotes
    ('"^^...'), never from the lexical content, so a literal whose content happens
    to contain "xsd:", "rdf:", "dt:" or "geo:" is not misclassified.

    :param a_literal: literal as written in the source, including quotes and the
        optional language tag or datatype suffix.
    :param base_namespace: namespace prepended to a cornered datatype that does
        not start with "http" (treated as a relative URI). Ignored when None.
    :return: LANG_STRING_TYPE when there_is_arroba_after_last_quotes() holds,
        STRING_TYPE when there is no '"^^' marker, otherwise the datatype URI
        without corners.
    :raises RuntimeError: if the datatype is neither a name using one of the known
        prefixes nor a URI between corners.
    """
    if there_is_arroba_after_last_quotes(a_literal):
        return LANG_STRING_TYPE

    # rfind: the datatype marker follows the closing quotes, and whatever comes
    # after it (prefixed name or cornered URI) cannot contain quotes itself.
    marker_pos = a_literal.rfind("\"^^")
    if marker_pos == -1:
        return STRING_TYPE
    datatype = a_literal[marker_pos + 3:].strip()

    known_namespaces = (
        (XSD_PREFIX, XSD_NAMESPACE),
        (RDF_PREFIX, RDF_SYNTAX_NAMESPACE),
        (DT_PREFIX, DT_NAMESPACE),
        (OPENGIS_PREFIX, OPENGIS_NAMESPACE),
    )
    for prefix, namespace in known_namespaces:
        if datatype.startswith(prefix + ":"):
            return namespace + datatype[len(prefix) + 1:]

    if datatype.startswith("<") and datatype.endswith(">"):
        candidate_type = datatype[1:-1]  # plain uri, no corners
        in_known_namespace = any(namespace in candidate_type
                                 for _, namespace in known_namespaces)
        if not in_known_namespace and base_namespace is not None \
                and not candidate_type.startswith("http"):
            return base_namespace + candidate_type
        return candidate_type

    raise RuntimeError("Unrecognized literal type:" + a_literal)
78+
79+
def parse_literal(an_elem, base_namespace=None):
    """
    Split a quoted literal into its content and its datatype.

    :param an_elem: literal starting with its opening double quote.
    :param base_namespace: forwarded to decide_literal_type().
    :return: tuple (content, type). The content is the text between the opening
        quotes and the next unescaped quotes, with every \\" turned into ".
    """
    end_of_content = find_next_unescaped_quotes(an_elem, 1)
    raw_content = an_elem[1:end_of_content]
    unescaped_content = raw_content.replace('\\"', '"')
    return unescaped_content, decide_literal_type(a_literal=an_elem,
                                                  base_namespace=base_namespace)
85+
86+
def parse_unquoted_literal(an_elem):
    """
    Pair an unquoted literal with the type decide_literal_type() assigns to it.

    :param an_elem: literal written without quotes.
    :return: tuple (an_elem, type); the element itself is returned untouched.
    """
    return an_elem, decide_literal_type(an_elem)
89+

shexer/utils/structures/dicts.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,12 +70,16 @@ def _get_constraint_example_inverse(self, shape_id, prop, inverse):
7070
def _set_constraint_example_no_inverse(self, shape_id, prop_id, example):
7171
if shape_id not in self._base_dict:
7272
self._init_shape(shape_id)
73-
self._base_dict[shape_id][_PROP_FEATURES_POS][prop_id] = example
73+
self._base_dict[shape_id][_PROP_FEATURES_POS][prop_id] = self._normalize_example(example)
7474

7575
def _set_constraint_example_inverse(self, shape_id, prop_id, example, inverse):
7676
if shape_id not in self._base_dict:
7777
self._init_shape(shape_id)
78-
self._base_dict[shape_id][_PROP_FEATURES_POS][_POS_INVERSE if inverse else _POS_DIRECT][prop_id] = example
78+
self._base_dict[shape_id][_PROP_FEATURES_POS][_POS_INVERSE if inverse else _POS_DIRECT][prop_id] = self._normalize_example(example)
79+
80+
def _normalize_example(self, example):
81+
result = example.replace("\n", "\\n")
82+
return result.replace("\"", '\\"')
7983

8084
def _has_constraint_example_no_inverse(self, shape_id, prop_id):
8185
if shape_id not in self._base_dict:

0 commit comments

Comments
 (0)