Skip to content

Commit 066750a

Browse files
committed
Fix issues with empty lines and too long lines
1 parent 60a4cbd commit 066750a

File tree

3 files changed

+46
-23
lines changed

3 files changed

+46
-23
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# Changelog
22

3+
## v1.6.2
4+
- Fix division by 0 error on empty sentences.
5+
- Fixed rules that were giving false positives on empty sentences (no titles, wrong language)
6+
- For performance, long sentences (>1024 chars.) are ignored by default; only "not_too_long" is output. Added "--dont_ignore_long" flag to override this
7+
behaviour.
8+
39
## v1.6.1
410

511
### Changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "monocleaner"
3-
version = "1.6.1"
3+
version = "1.6.2"
44
requires-python = ">=3.8"
55
authors = [
66
{ name = "Prompsit Language Engineering", email = "info@prompsit.com" },

src/monocleaner/hardrules.py

Lines changed: 39 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ def c_no_empty(self, sentence):
6161
return sentence != ""
6262

6363
def c_no_titles(self, sentence):
64+
if len(sentence) == 0:
65+
return True
6466
return len(sentence.strip().split(" ")) > 1
6567

6668
def c_not_too_long(self, sentence):
@@ -85,12 +87,16 @@ def c_no_bad_encoding(self, sentence):
8587
return True
8688

8789
def c_no_only_symbols(self, sentence):
90+
if len(sentence) == 0:
91+
return True
8892
return len(regex_alpha.findall(sentence)) / len(sentence) > 0.1
8993

9094
def c_no_only_numbers(self, sentence):
9195
threshold = 0.5
9296
if self.language in CJK:
9397
threshold = 0.7
98+
if len(sentence) == 0:
99+
return True
94100
return len(regex_numbers.findall(sentence)) / len(sentence) < threshold
95101

96102
def c_no_urls(self, sentence):
@@ -163,7 +169,7 @@ def c_no_repeated_words(self, sentence):
163169
return True
164170

165171
def z_no_wrong_language(self, sentence):
166-
if not self.disable_lang_ident:
172+
if (not self.disable_lang_ident) and len(sentence) > 0:
167173
# Obtain fastspell prediction, lowercasing helps in small langs
168174
langid = self.fastspell.getlang(sentence.lower())
169175

@@ -224,6 +230,7 @@ def initialization():
224230
parser.add_argument("--detect_script", action='store_true', help="Detect writing script with FastSpell (only Serbo-Croatian is supported)")
225231
parser.add_argument("--annotated_output", action='store_true', help="Add hardrules annotation for each sentence")
226232
parser.add_argument("--run_all_rules", action='store_true', help="Run all hardrules for each sentence instead of stopping at the first one discarded")
233+
parser.add_argument('--dont_ignore_long', default=False, action='store_true', help="Don't ignore too long sentences")
227234
parser.add_argument("--debug", action='store_true')
228235
parser.add_argument("-q", "--quiet", action='store_true')
229236
parser.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")
@@ -268,39 +275,49 @@ def main():
268275
hardrules = Hardrules(args)
269276

270277
nline = 0
278+
271279
for line in args.input:
272280
nline += 1
281+
tag = ""
273282
parts = line.rstrip("\n").split("\t")
274283

275284
if len(parts) >= args.scol:
276285
sentence = parts[args.scol-1]
277286
else:
278287
logging.error(f" scol ({args.scol}) index above column number ({len(parts)}) on line {nline}")
279-
continue
288+
sentence = ""
289+
tag = "c_missing_columns"
290+
#continue
280291

281-
hr_result = hardrules.wrong_segment(args, sentence)
282-
tag = hr_result
283-
langid = args.language
284292

285-
# Language identification rule and output
286-
if not args.disable_lang_ident:
287-
# If run all rules is enabled, run the identification method.
288-
# If it doesn't pass, then set the tag accordingly if other hardrules have failed.
289-
if args.run_all_rules:
290-
langid, res = hardrules.z_no_wrong_language(line)
291-
292-
if not res:
293-
if tag == 'keep':
294-
tag = 'no_wrong_language'
295-
else:
296-
tag += '+no_wrong_language'
297-
else:
298-
# If run all rules is disabled, then only run identification method when all other hardrules have passed
299-
if tag == 'keep':
300-
langid, res = hardrules.z_no_wrong_language(line)
293+
if not args.dont_ignore_long and (len(line) > 1024):
294+
tag = "c_not_too_long"
295+
#continue
296+
297+
if tag == "":
298+
hr_result = hardrules.wrong_segment(args, sentence)
299+
tag = hr_result
300+
langid = args.language
301+
302+
# Language identification rule and output
303+
if (not args.disable_lang_ident) and len(line) > 0:
304+
# If run all rules is enabled, run the identification method.
305+
# If it doesn't pass, then set the tag accordingly if other hardrules have failed.
306+
if args.run_all_rules:
307+
langid, res = hardrules.z_no_wrong_language(sentence)
301308

302309
if not res:
303-
tag = 'no_wrong_language'
310+
if tag == 'keep':
311+
tag = 'no_wrong_language'
312+
else:
313+
tag += '+no_wrong_language'
314+
else:
315+
# If run all rules is disabled, then only run identification method when all other hardrules have passed
316+
if tag == 'keep':
317+
langid, res = hardrules.z_no_wrong_language(sentence)
318+
319+
if not res:
320+
tag = 'no_wrong_language'
304321

305322
score = 1
306323
if tag != "keep":

0 commit comments

Comments
 (0)