@@ -61,6 +61,8 @@ def c_no_empty(self, sentence):
6161 return sentence != ""
6262
def c_no_titles(self, sentence):
    """Return True unless the sentence consists of a single token.

    An empty sentence passes this rule (returns True). Any other sentence
    is stripped of surrounding whitespace and split on single space
    characters; it passes only when the split yields more than one piece.
    """
    # Short-circuit on the empty string so it is never counted as a
    # one-token sentence; otherwise count the space-separated pieces.
    return len(sentence) == 0 or len(sentence.strip().split(" ")) > 1
6567
6668 def c_not_too_long (self , sentence ):
@@ -85,12 +87,16 @@ def c_no_bad_encoding(self, sentence):
8587 return True
8688
def c_no_only_symbols(self, sentence):
    """Return True when enough of the sentence matches ``regex_alpha``.

    An empty sentence passes this rule (returns True), which also avoids
    dividing by a zero length. Otherwise the number of ``regex_alpha``
    matches divided by the sentence length must be greater than 0.1.
    """
    total_chars = len(sentence)
    if total_chars == 0:
        return True

    # NOTE(review): regex_alpha is defined outside this block; presumably it
    # matches individual alphabetic characters -- confirm against its definition.
    alpha_matches = regex_alpha.findall(sentence)
    return len(alpha_matches) / total_chars > 0.1
8993
def c_no_only_numbers(self, sentence):
    """Return True when the share of ``regex_numbers`` matches is below a threshold.

    The threshold is 0.7 when ``self.language`` is in ``CJK`` and 0.5
    otherwise. An empty sentence passes this rule (returns True), which
    also avoids dividing by a zero length.
    """
    # The limit is chosen before the empty check, so self.language is
    # always read regardless of the sentence content.
    limit = 0.7 if self.language in CJK else 0.5

    total_chars = len(sentence)
    if total_chars == 0:
        return True

    numeric_matches = regex_numbers.findall(sentence)
    return len(numeric_matches) / total_chars < limit
95101
96102 def c_no_urls (self , sentence ):
@@ -163,7 +169,7 @@ def c_no_repeated_words(self, sentence):
163169 return True
164170
165171 def z_no_wrong_language (self , sentence ):
166- if not self .disable_lang_ident :
172+ if ( not self .disable_lang_ident ) and len ( sentence ) > 0 :
167173 # Obtain fastspell prediction, lowercasing helps in small langs
168174 langid = self .fastspell .getlang (sentence .lower ())
169175
@@ -224,6 +230,7 @@ def initialization():
224230 parser .add_argument ("--detect_script" , action = 'store_true' , help = "Detect writing script with FastSpell (only Serbo-Croatian is supported)" )
225231 parser .add_argument ("--annotated_output" , action = 'store_true' , help = "Add hardrules annotation for each sentence" )
226232 parser .add_argument ("--run_all_rules" , action = 'store_true' , help = "Run all hardrules for each sentence instead of stopping at the first one discarded" )
233+ parser .add_argument ('--dont_ignore_long' , default = False , action = 'store_true' , help = "Don't ignore too long sentences" )
227234 parser .add_argument ("--debug" , action = 'store_true' )
228235 parser .add_argument ("-q" , "--quiet" , action = 'store_true' )
229236 parser .add_argument ('-v' , '--version' , action = 'version' , version = "%(prog)s " + __version__ , help = "show version of this script and exit" )
@@ -268,39 +275,49 @@ def main():
268275 hardrules = Hardrules (args )
269276
270277 nline = 0
278+
271279 for line in args .input :
272280 nline += 1
281+ tag = ""
273282 parts = line .rstrip ("\n " ).split ("\t " )
274283
275284 if len (parts ) >= args .scol :
276285 sentence = parts [args .scol - 1 ]
277286 else :
278287 logging .error (f" scol ({ args .scol } ) index above column number ({ len (parts )} ) on line { nline } " )
279- continue
288+ sentence = ""
289+ tag = "c_missing_columns"
290+ #continue
280291
281- hr_result = hardrules .wrong_segment (args , sentence )
282- tag = hr_result
283- langid = args .language
284292
285- # Language identification rule and output
286- if not args .disable_lang_ident :
287- # If run all rules is enabled, run the identification method.
288- # If it doesn't pass, then set the tag accordingly if other hardrules have failed.
289- if args .run_all_rules :
290- langid , res = hardrules .z_no_wrong_language (line )
291-
292- if not res :
293- if tag == 'keep' :
294- tag = 'no_wrong_language'
295- else :
296- tag += '+no_wrong_language'
297- else :
298- # If run all rules is disabled, then only run identification method when all other hardrules have passed
299- if tag == 'keep' :
300- langid , res = hardrules .z_no_wrong_language (line )
293+ if not args .dont_ignore_long and (len (line ) > 1024 ):
294+ tag = "c_not_too_long"
295+ #continue
296+
297+ if tag == "" :
298+ hr_result = hardrules .wrong_segment (args , sentence )
299+ tag = hr_result
300+ langid = args .language
301+
302+ # Language identification rule and output
303+ if (not args .disable_lang_ident ) and len (line ) > 0 :
304+ # If run all rules is enabled, run the identification method.
305+ # If it doesn't pass, then set the tag accordingly if other hardrules have failed.
306+ if args .run_all_rules :
307+ langid , res = hardrules .z_no_wrong_language (sentence )
301308
302309 if not res :
303- tag = 'no_wrong_language'
310+ if tag == 'keep' :
311+ tag = 'no_wrong_language'
312+ else :
313+ tag += '+no_wrong_language'
314+ else :
315+ # If run all rules is disabled, then only run identification method when all other hardrules have passed
316+ if tag == 'keep' :
317+ langid , res = hardrules .z_no_wrong_language (sentence )
318+
319+ if not res :
320+ tag = 'no_wrong_language'
304321
305322 score = 1
306323 if tag != "keep" :
0 commit comments