11from shexer .io .graph .yielder .base_triples_yielder import BaseTriplesYielder
22from shexer .utils .uri import remove_corners , unprefixize_uri_mandatory
3+ from shexer .utils .literal import find_next_unescaped_quotes
34from shexer .utils .triple_yielders import tune_subj , tune_prop , tune_token
45import re
56
@@ -68,7 +69,7 @@ def __init__(self, source_file=None, allow_untyped_numbers=True, raw_graph=None,
6869 def yield_triples (self ):
6970 self ._reset_parsing ()
7071 for a_line in self ._read_normalized_lines ():
71- for a_triple in self ._process_line_2 (a_line ):
72+ for a_triple in self ._process_line (a_line ):
7273 self ._triples_count += 1
7374 yield (
7475 tune_subj (a_triple [_S ],
@@ -89,7 +90,7 @@ def _clean_line(self, str_line):
8990
9091 def _remove_comments_if_needed (self , str_line ):
9192 """Remove comments in the middle of the line.
92- Lines starting with # wont be erased
93+ Lines starting with # won't be erased
9394 """
9495 if '"' not in str_line : # Comment mark and no literals, trivial case
9596 return str_line [:str_line .find (" #" )]
@@ -108,7 +109,7 @@ def _remove_comments_if_needed(self, str_line):
108109 return str_line # If this point is reached, it means that the potential comments
109110 # are actual content of a string literal
110111
111- def _process_line_2 (self , str_line ):
112+ def _process_line (self , str_line ):
112113 str_line = self ._clean_line (str_line )
113114 if str_line == "" :
114115 self ._process_empty_line (str_line )
@@ -150,6 +151,8 @@ def _assing_tmp_element_and_promote_state(self, token):
150151 self ._state = _WAITING_FOR_OBJ
151152 elif self ._state == _WAITING_FOR_OBJ :
152153 self ._tmp_o = self ._parse_elem (token )
154+ if self ._tmp_o .startswith ('"' ):
155+ self ._tmp_o = self ._tmp_o .replace ('\\ \\ "' ,'\\ "' )
153156 self ._state = _NOT_WAITING
154157 else :
155158 raise ValueError ("Malformed file. Processing an unexpected token: " + token )
@@ -177,85 +180,16 @@ def _find_next_blank(self, target_str, start_index):
177180 pos = target_str .find (" " , start_index )
178181 return len (target_str )- 1 if pos == - 1 else pos
179182
180-
181- def _find_next_unescaped_quotes (self , target_str , start_index ):
182- pos = target_str .find ('"' , start_index )
183- while pos != - 1 :
184- if target_str [pos - 1 ] != "\\ " :
185- return pos # not escaped
186- # if pos >= 2 and target_str[pos-2] == '\\':
187- # return pos # the scape is scaped, so not escaped
188- if self ._count_prior_backslashes (an_str = target_str ,
189- quote_pos = pos ) % 2 == 0 :
190- return pos # the scape is scaped, so not escaped
191- pos = target_str .find ('"' , pos + 1 )
192- if pos == - 1 :
193- raise ValueError ("Is this line malformed? Can`t find quotes matching: " + target_str )
194-
195- def _count_prior_backslashes (self , an_str , quote_pos ):
196- """
197- We assume that there is at least a backslash at an_str[pos-1], so pos-1 is a non-negative index of an_str
198- """
199- counter = 1
200- quote_pos -= 2
201- while quote_pos >= 0 :
202- if an_str [quote_pos ] == "\\ " :
203- counter += 1
204- else :
205- return counter
206- quote_pos -= 1
207- return counter
208-
209-
def _find_next_quoted_literal_ending(self, target_str, start_index):
    """Return the index of the last character of a quoted literal.

    The search for the closing quotes starts one position after
    ``start_index``, so ``start_index`` is expected to point at the opening
    quotes of the literal.

    :param target_str: line (or fragment of a line) that contains the literal.
    :param start_index: index of the opening quotes within ``target_str``.
    :return: index of the closing quotes when they are followed by a blank or
        are the last character of ``target_str``; otherwise, when they are
        followed by one of ``_SPECIAL_CHARS_AFTER_QUOTES``, the index right
        before the position reported by ``_find_next_blank``.
    :raises ValueError: if the closing quotes are followed by any other
        character.
    """
    closing_quotes = find_next_unescaped_quotes(target_str=target_str,
                                                start_index=start_index + 1)
    after_quotes = closing_quotes + 1

    # The bounds check must be ">=": when the closing quotes are the last
    # character, after_quotes == len(target_str) and indexing it would raise
    # IndexError instead of reporting the end of the literal.
    if after_quotes >= len(target_str) or target_str[after_quotes] == " ":
        return closing_quotes

    # Something is glued to the closing quotes (presumably a datatype or
    # language-tag marker -- confirm against _SPECIAL_CHARS_AFTER_QUOTES), so
    # the literal token extends up to the character before the next blank.
    if target_str[after_quotes] in _SPECIAL_CHARS_AFTER_QUOTES:
        return self._find_next_blank(target_str, closing_quotes) - 1

    raise ValueError("Malformed literal? It seems like there is a problem of unmatching quotes: " + target_str)
219192
220- def _process_line (self , str_line ):
221- str_line = self ._clean_line (str_line )
222- if str_line == "" :
223- self ._process_empty_line (str_line )
224- elif '"' in str_line :
225- self ._process_line_with_literal (str_line )
226- elif str_line .startswith ("@prefix" ):
227- self ._process_prefix_line (str_line )
228- elif str_line .startswith ("@base" ):
229- self ._process_base_line (str_line )
230- elif str_line .startswith ("#" ):
231- self ._process_comment_line (str_line )
232- elif str_line [- 1 ] in ["," , "." , ";" ]:
233- if ", " in str_line [:- 1 ]:
234- # If there is a comma in a URI, it can't be followed by a blank
235- self ._process_multi_triple_line_commas (str_line )
236- else :
237- self ._process_single_triple_line (str_line )
238- elif " " not in str_line :
239- if len (str_line ) > 1 : # We are ensuring that this is not a single char, such as "," or "."
240- self ._process_isolated_subject (str_line )
241- else :
242- self ._process_unknown_line (str_line )
243-
244- def _process_line_with_literal (self , line ):
245- first_quotes_index = line .find ('"' )
246- s_o_line = line [:first_quotes_index ].strip ()
247- s_o_pieces = s_o_line .split (" " )
248- if len (s_o_pieces ) == 2 :
249- self ._tmp_s = self ._parse_elem (s_o_pieces [0 ])
250- self ._tmp_p = self ._parse_elem (s_o_pieces [1 ])
251- elif len (s_o_pieces ) == 1 and s_o_pieces [0 ] != "" :
252- self ._tmp_p = self ._parse_elem (s_o_pieces [0 ])
253- # The last char MUST be in [,.;] since this lines comes stripped.
254- # SO everything between first_quotes_index and line[-1], stripped
255- # should be out target literal (typed or not)
256- self ._tmp_o = line [first_quotes_index :- 1 ].rstrip ()
257- self ._decide_current_triple ()
258-
259193 def _process_prefix_line (self , line ):
260194 pieces = line .split (" " )
261195 prefix = pieces [1 ] if not pieces [1 ].endswith (":" ) else pieces [1 ][: - 1 ]
@@ -316,7 +250,7 @@ def _process_single_triple_line(self, line):
316250 self ._decide_current_triple ()
317251
318252 def _process_isolated_subject (self , line ):
319- # No splitt . Line is expected to contain a line with no blanks (isolated subject)
253+ # No split . Line is expected to contain a line with no blanks (isolated subject)
320254 self ._tmp_s = self ._parse_elem (line )
321255 # No need to decide triple now, incomplete element
322256
@@ -361,7 +295,7 @@ def _parse_elem(self, raw_elem):
361295 prefix_namespaces_dict = self ._prefixes )
362296 elif raw_elem in _BOOLEANS or self ._is_num_literal (raw_elem ):
363297 return raw_elem
364- # else?? shouldnt happen, let it break with a nullpoitner
298+ # else?? shouldn't happen; nothing is returned (None), so let it break downstream like a null pointer
365299
366300 def _parse_cornered_element (self , cornered_element ):
367301 if self ._base is None :
@@ -398,13 +332,13 @@ def _read_normalized_lines(self):
398332 if not waiting and '"""' not in a_line :
399333 yield a_line
400334 elif waiting and '"""' not in a_line :
401- tmp += self ._scape_quotes_in_normalized_line (a_line )
335+ tmp += " \\ n" + self ._scape_quotes_in_normalized_line (a_line )
402336 elif not waiting and '"""' in a_line :
403337 waiting = True
404338 tmp = self ._scape_quotes_in_normalized_line (a_line ).replace ('"""' , '"' , 1 )
405339 elif waiting and '"""' in a_line :
406340 waiting = False
407- yield tmp + self ._scape_quotes_in_normalized_line (a_line ).replace ('"""' , '"' , 1 )
341+ yield tmp + " \\ n" + self ._scape_quotes_in_normalized_line (a_line ).replace ('"""' , '"' , 1 )
408342 tmp = ''
409343
410344 def _scape_quotes_in_normalized_line (self , target ):
0 commit comments