From 328d3dadceda1889c786656282788f3698a159a9 Mon Sep 17 00:00:00 2001 From: fkarg Date: Sun, 13 Sep 2020 00:29:19 +0200 Subject: [PATCH 01/14] fixed typo --- pylanguagetool/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pylanguagetool/__init__.py b/pylanguagetool/__init__.py index be46ff3..bbdcd38 100644 --- a/pylanguagetool/__init__.py +++ b/pylanguagetool/__init__.py @@ -160,15 +160,15 @@ def colored(text, color): length = context_object["length"] offset = context_object["offset"] - endpostion = offset + length + endposition = offset + length print(error["message"]) print( indention[:2] + cross + colored(context[:offset], Fore.LIGHTBLACK_EX) + - colored(context[offset:endpostion], Fore.LIGHTRED_EX) + - colored(context[endpostion:], Fore.LIGHTBLACK_EX) + colored(context[offset:endposition], Fore.LIGHTRED_EX) + + colored(context[endposition:], Fore.LIGHTBLACK_EX) ) print( indention + @@ -184,7 +184,7 @@ def colored(text, color): tick + colored(context[:offset], Fore.LIGHTBLACK_EX) + colored(replacement["value"], Fore.LIGHTGREEN_EX) + - colored(context[endpostion:], Fore.LIGHTBLACK_EX) + colored(context[endposition:], Fore.LIGHTBLACK_EX) ) rule = error["rule"] if rules: From d6ee6cdb32dfb49df5196bd014243868d76e4579 Mon Sep 17 00:00:00 2001 From: fkarg Date: Sun, 13 Sep 2020 00:29:34 +0200 Subject: [PATCH 02/14] implemented approximation for finding line numbers --- pylanguagetool/__init__.py | 74 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/pylanguagetool/__init__.py b/pylanguagetool/__init__.py index bbdcd38..7a9176f 100644 --- a/pylanguagetool/__init__.py +++ b/pylanguagetool/__init__.py @@ -41,6 +41,7 @@ def init_config(): p.add_argument('-l', '--lang', env_var='TEXTLANG', default="auto", help="A language code like en or en-US, or auto to guess the language automatically (see preferredVariants below). For languages with variants (English, German, Portuguese) spell checking will only be activated when you specify the variant, e.g. en-GB instead of just en." ) + p.add_argument("--lines", env_var="LINES", action="store_true", help="show line numbers of found mistakes") p.add_argument("-m", "--mother-tongue", env_var="MOTHER__TONGUE", help="A language code of the user's native language, enabling false friends checks for some language pairs." ) @@ -127,7 +128,52 @@ def get_input_text(config): return None, None -def print_errors(response, api_url, print_color=True, rules=False, rule_categories=False, explain_rule=False): +def fuzzy_substring(needle, haystack): + """Calculates the fuzzy match of needle in haystack, + using a modified version of the Levenshtein distance + algorithm. + The function is modified from the levenshtein function + in the bktree module by Adam Hupp. + + Taken and modified from: + http://ginstrom.com/scribbles/2007/12/01/fuzzy-substring-matching-with-levenshtein-distance-in-python/ + + Returns: + index of first most likely match (first match with minimal edit + distance) + """ + m, n = len(needle), len(haystack) + + # base cases + if m == 1: + return not needle in haystack + if not n: + return m + + row1 = [0] * (n+1) + for i in range(0,m): + row2 = [i+1] + for j in range(0,n): + cost = ( needle[i] != haystack[j] ) + + row2.append( min(row1[j+1]+1, # deletion + row2[j]+1, #insertion + row1[j]+cost) #substitution + ) + # we don't need the original row1 in the future. + # In the end, we only want the last row. + row1 = row2 + + # the index of the lowest number in this row will tell us the location of + # the best match. + return row1.index(min(row1)), min(row1) + + +def line_from_offset(offset: int, text: str) -> int: + return len(text[:offset].split('\n')) + + +def print_errors(response, api_url, print_color=True, rules=False, rule_categories=False, explain_rule=False, original=''): matches = response["matches"] language = response["language"] version = response["software"]["name"] + " " + response["software"]["version"] @@ -161,6 +207,29 @@ def colored(text, color): offset = context_object["offset"] endposition = offset + length + + # dict_items([('message', 'Possible spelling mistake. ‘Favourite’ is + # British English.'), ('shortMessage', ''), ('replacements', [{'value': + # 'Favorite', 'shortDescription': 'English (US)'}]), ('offset', 551), + # ('length', 9), ('context', {'text': '...m would I recommend reading + # this book? Favourite Quotes ', 'offset': 43, 'length': 9}), + # ('sentence', 'Favourite Quotes'), ('type', {'typeName': 'Other'}), + # ('rule', {'id': 'MORFOLOGIK_RULE_EN_US', 'description': 'Possible + # spelling mistake', 'issueType': 'misspelling', 'category': {'id': + # 'TYPOS', 'name': 'Possible Typo'}}), ('ignoreForIncompleteSentence', + # False), ('contextForSureMatch', 0)]) + # print(error.items()) + # print(offset + error["offset"]) + # print(fuzzy_substring(context, original)) + total_offset = offset + error["offset"] + line_html = line_from_offset(total_offset, original) + offset_fuzz, m = fuzzy_substring(context[3:-3], original[error["offset"]:]) + line_fuzz = line_from_offset(offset_fuzz + offset + error["offset"], original) + + print("Score %.02f around line %i" % (1 - m / len(context), line_fuzz)) + # print("Matching: \"" + context[3:-3] + "\" with score " + str(m)) + # print("Match: \"" + original[(offset_fuzz - len(context)):(offset_fuzz)] + "\"") + print(error["message"]) print( @@ -259,7 +328,8 @@ def main(): not config["no_color"], config["rules"], config["rule_categories"], - config["explain_rule"] + config["explain_rule"], + input_text ) if len(response["matches"]) > 0: From e8598900c1881e50391f949904da9c70f5cccdb7 Mon Sep 17 00:00:00 2001 From: fkarg Date: Sun, 13 Sep 2020 09:43:36 +0200 Subject: [PATCH 03/14] also install test dependencies --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 8b41809..88cdc8e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,8 @@ python: # command to install dependencies before_install: - pip install poetry + - pip install markdown2 + - pip install beautifulsoup4 install: - poetry install # command to run tests From 40e38cceed2d43aa6d90bb0f4428b8714727c372 Mon Sep 17 00:00:00 2001 From: fkarg Date: Sun, 13 Sep 2020 10:05:59 +0200 Subject: [PATCH 04/14] fixed location calculation to a significant degree, still is off by newlines sometimes though --- pylanguagetool/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pylanguagetool/__init__.py b/pylanguagetool/__init__.py index 7a9176f..4c68610 100644 --- a/pylanguagetool/__init__.py +++ b/pylanguagetool/__init__.py @@ -166,7 +166,7 @@ def fuzzy_substring(needle, haystack): # the index of the lowest number in this row will tell us the location of # the best match. - return row1.index(min(row1)), min(row1) + return row1.index(min(row1)) - len(needle), min(row1) def line_from_offset(offset: int, text: str) -> int: From ea4e5a31c9e549c23ce8ced6265e5c5d5c089676 Mon Sep 17 00:00:00 2001 From: fkarg Date: Sun, 13 Sep 2020 10:20:12 +0200 Subject: [PATCH 05/14] fix error when original is empty --- pylanguagetool/__init__.py | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/pylanguagetool/__init__.py b/pylanguagetool/__init__.py index 4c68610..da75cf5 100644 --- a/pylanguagetool/__init__.py +++ b/pylanguagetool/__init__.py @@ -208,27 +208,14 @@ def colored(text, color): endposition = offset + length - # dict_items([('message', 'Possible spelling mistake. ‘Favourite’ is - # British English.'), ('shortMessage', ''), ('replacements', [{'value': - # 'Favorite', 'shortDescription': 'English (US)'}]), ('offset', 551), - # ('length', 9), ('context', {'text': '...m would I recommend reading - # this book? Favourite Quotes ', 'offset': 43, 'length': 9}), - # ('sentence', 'Favourite Quotes'), ('type', {'typeName': 'Other'}), - # ('rule', {'id': 'MORFOLOGIK_RULE_EN_US', 'description': 'Possible - # spelling mistake', 'issueType': 'misspelling', 'category': {'id': - # 'TYPOS', 'name': 'Possible Typo'}}), ('ignoreForIncompleteSentence', - # False), ('contextForSureMatch', 0)]) - # print(error.items()) - # print(offset + error["offset"]) - # print(fuzzy_substring(context, original)) - total_offset = offset + error["offset"] - line_html = line_from_offset(total_offset, original) - offset_fuzz, m = fuzzy_substring(context[3:-3], original[error["offset"]:]) - line_fuzz = line_from_offset(offset_fuzz + offset + error["offset"], original) - - print("Score %.02f around line %i" % (1 - m / len(context), line_fuzz)) - # print("Matching: \"" + context[3:-3] + "\" with score " + str(m)) - # print("Match: \"" + original[(offset_fuzz - len(context)):(offset_fuzz)] + "\"") + if original: + total_offset = offset + error["offset"] + line_html = line_from_offset(total_offset, original) + offset_fuzz, m = fuzzy_substring(context[3:-3], original[error["offset"]:]) + line_fuzz = line_from_offset(offset_fuzz + offset + error["offset"], original) + # print("line: %i, fuzz: %i, offset: %i, error: %i" % (line_fuzz, offset_fuzz, offset, error["offset"])) + + print("Score of [%.02f] around line [%i]" % (1 - m / len(context), line_fuzz)) print(error["message"]) From 48f75634c2ed4fa796405b00c129220d7789d37a Mon Sep 17 00:00:00 2001 From: fkarg Date: Sun, 13 Sep 2020 13:39:55 +0200 Subject: [PATCH 06/14] only figure out line numbers when passed --lines --- pylanguagetool/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pylanguagetool/__init__.py b/pylanguagetool/__init__.py index da75cf5..b543233 100644 --- a/pylanguagetool/__init__.py +++ b/pylanguagetool/__init__.py @@ -173,7 +173,7 @@ def line_from_offset(offset: int, text: str) -> int: return len(text[:offset].split('\n')) -def print_errors(response, api_url, print_color=True, rules=False, rule_categories=False, explain_rule=False, original=''): +def print_errors(response, api_url, print_color=True, rules=False, rule_categories=False, explain_rule=False, lines=False, original=''): matches = response["matches"] language = response["language"] version = response["software"]["name"] + " " + response["software"]["version"] @@ -208,7 +208,7 @@ def colored(text, color): endposition = offset + length - if original: + if original and lines: total_offset = offset + error["offset"] line_html = line_from_offset(total_offset, original) offset_fuzz, m = fuzzy_substring(context[3:-3], original[error["offset"]:]) @@ -316,6 +316,7 @@ def main(): config["rules"], config["rule_categories"], config["explain_rule"], + config["lines"], input_text ) From 24485acbbd8fb52d184aecb503d6ea0c91137aa9 Mon Sep 17 00:00:00 2001 From: fkarg Date: Sun, 13 Sep 2020 13:40:03 +0200 Subject: [PATCH 07/14] added upper search bound to increase speed --- pylanguagetool/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pylanguagetool/__init__.py b/pylanguagetool/__init__.py index b543233..dfbd7c6 100644 --- a/pylanguagetool/__init__.py +++ b/pylanguagetool/__init__.py @@ -210,8 +210,11 @@ def colored(text, color): if original and lines: total_offset = offset + error["offset"] + # calculates the lower offset bound based on the html converted line_html = line_from_offset(total_offset, original) - offset_fuzz, m = fuzzy_substring(context[3:-3], original[error["offset"]:]) + max_offset = int((error["offset"] + len(context)) * 1.3) + # get the expected offset based on fuzzy matching with a highly likely + offset_fuzz, m = fuzzy_substring(context[3:-3], original[error["offset"]:max_offset]) line_fuzz = line_from_offset(offset_fuzz + offset + error["offset"], original) # print("line: %i, fuzz: %i, offset: %i, error: %i" % (line_fuzz, offset_fuzz, offset, error["offset"])) From aded81cd3c4b209f632d1602190648fdc0b7548f Mon Sep 17 00:00:00 2001 From: fkarg Date: Sun, 13 Sep 2020 13:41:08 +0200 Subject: [PATCH 08/14] added information that it will slow things down --- pylanguagetool/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pylanguagetool/__init__.py b/pylanguagetool/__init__.py index dfbd7c6..9b6cce1 100644 --- a/pylanguagetool/__init__.py +++ b/pylanguagetool/__init__.py @@ -41,7 +41,7 @@ def init_config(): p.add_argument('-l', '--lang', env_var='TEXTLANG', default="auto", help="A language code like en or en-US, or auto to guess the language automatically (see preferredVariants below). For languages with variants (English, German, Portuguese) spell checking will only be activated when you specify the variant, e.g. en-GB instead of just en." ) - p.add_argument("--lines", env_var="LINES", action="store_true", help="show line numbers of found mistakes") + p.add_argument("--lines", env_var="LINES", action="store_true", help="show approximate line numbers of found mistakes. Noticably slow for large files") p.add_argument("-m", "--mother-tongue", env_var="MOTHER__TONGUE", help="A language code of the user's native language, enabling false friends checks for some language pairs." ) From 1e35fc0598f62733315f92e11eb08e7fd24a95bd Mon Sep 17 00:00:00 2001 From: fkarg Date: Sun, 13 Sep 2020 21:39:19 +0200 Subject: [PATCH 09/14] added some detailed comments --- pylanguagetool/__init__.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/pylanguagetool/__init__.py b/pylanguagetool/__init__.py index 9b6cce1..c89421c 100644 --- a/pylanguagetool/__init__.py +++ b/pylanguagetool/__init__.py @@ -208,17 +208,32 @@ def colored(text, color): endposition = offset + length + # if we have an original text and want to search for approximate line + # numbers if original and lines: - total_offset = offset + error["offset"] - # calculates the lower offset bound based on the html converted - line_html = line_from_offset(total_offset, original) - max_offset = int((error["offset"] + len(context)) * 1.3) - # get the expected offset based on fuzzy matching with a highly likely - offset_fuzz, m = fuzzy_substring(context[3:-3], original[error["offset"]:max_offset]) + # we need/want to overshoot to absolutely include the + # 'context'-snippet, but it's unclear by how much. Minimal + # overshoot reduces fuzzy matching speed + MAGIC_FACTOR = 1.1 + + # roughly estimate overshoot-offset. Not calculating it would mean + # considering the rest of the file, which is likely to be far + # bigger. Grows until about the last third of the file, and is + # limited by the end of the file after hitting the maximum + max_offset = int((error["offset"] + len(context)) * MAGIC_FACTOR) + + # search for the closest match of the context string within a + # heuristically shortened part of the unmodified text + offset_fuzz, m = fuzzy_substring(context[3:-3], + original[error["offset"]:max_offset]) + + # from the offset, calculate the approximate location line number line_fuzz = line_from_offset(offset_fuzz + offset + error["offset"], original) - # print("line: %i, fuzz: %i, offset: %i, error: %i" % (line_fuzz, offset_fuzz, offset, error["offset"])) - print("Score of [%.02f] around line [%i]" % (1 - m / len(context), line_fuzz)) + # Note that this will always print an approximate line number, even + # if it is completely wrong. However, the score should be + # increadibly low for unlikely matches. + print("Score of [%.02f] around line %i" % (1 - m / len(context), line_fuzz)) print(error["message"]) From ed197bbf06c77046dd92afe87c8a07d804f8af9c Mon Sep 17 00:00:00 2001 From: fkarg Date: Wed, 30 Sep 2020 23:12:21 +0200 Subject: [PATCH 10/14] add option to deactivate unsound heuristics --- pylanguagetool/__init__.py | 50 +++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/pylanguagetool/__init__.py b/pylanguagetool/__init__.py index c89421c..727c2c9 100644 --- a/pylanguagetool/__init__.py +++ b/pylanguagetool/__init__.py @@ -42,6 +42,7 @@ def init_config(): help="A language code like en or en-US, or auto to guess the language automatically (see preferredVariants below). For languages with variants (English, German, Portuguese) spell checking will only be activated when you specify the variant, e.g. en-GB instead of just en." ) p.add_argument("--lines", env_var="LINES", action="store_true", help="show approximate line numbers of found mistakes. Noticably slow for large files") + p.add_argument("--only-sound", env_var="SOUND", action="store_true", help="Do not use heuristics to reduce line number search space. Noticably slower but only way to get accurate line numbers for files with many comments.") p.add_argument("-m", "--mother-tongue", env_var="MOTHER__TONGUE", help="A language code of the user's native language, enabling false friends checks for some language pairs." ) @@ -173,7 +174,7 @@ def line_from_offset(offset: int, text: str) -> int: return len(text[:offset].split('\n')) -def print_errors(response, api_url, print_color=True, rules=False, rule_categories=False, explain_rule=False, lines=False, original=''): +def print_errors(response, api_url, print_color=True, rules=False, rule_categories=False, explain_rule=False, lines=False, sound=False, original=''): matches = response["matches"] language = response["language"] version = response["software"]["name"] + " " + response["software"]["version"] @@ -199,6 +200,7 @@ def colored(text, color): cross = colored(u"\u2717", Fore.LIGHTRED_EX) + " " rule_explanations = [] + last_match = 0 for error in matches: context_object = error["context"] @@ -211,21 +213,34 @@ def colored(text, color): # if we have an original text and want to search for approximate line # numbers if original and lines: - # we need/want to overshoot to absolutely include the - # 'context'-snippet, but it's unclear by how much. Minimal - # overshoot reduces fuzzy matching speed - MAGIC_FACTOR = 1.1 - - # roughly estimate overshoot-offset. Not calculating it would mean - # considering the rest of the file, which is likely to be far - # bigger. Grows until about the last third of the file, and is - # limited by the end of the file after hitting the maximum - max_offset = int((error["offset"] + len(context)) * MAGIC_FACTOR) - - # search for the closest match of the context string within a - # heuristically shortened part of the unmodified text - offset_fuzz, m = fuzzy_substring(context[3:-3], - original[error["offset"]:max_offset]) + if sound: + # only use the minimal sound guesses we can make. + # makes calculation expensive, but necessary when there are + # large sections commented out + min_sound = max(last_match - len(context), 0) + offset_fuzz, m = fuzzy_substring(context[3:-3], + original[min_sound:]) + + # the next error will have a higher offset than this one. Start + # searching here next time. + last_match = offset_fuzz + else: + # heuristically approximate our search space. Works fine for + # most markdown files. + + # we need/want to overshoot to absolutely include the + # 'context'-snippet, but it's unclear by how much. Lower + # overshoot increases fuzzy matching speed + MAGIC_FACTOR = 1.1 + + # roughly estimate overshoot-offset. Not calculating it would mean + # considering the rest of the file, which is likely to be far + # bigger. Grows until about the last third of the file, and is + # limited by the end of the file after hitting the maximum + max_offset = int((error["offset"] + len(context)) * MAGIC_FACTOR) + + offset_fuzz, m = fuzzy_substring(context[3:-3], + original[error["offset"]:max_offset]) # from the offset, calculate the approximate location line number line_fuzz = line_from_offset(offset_fuzz + offset + error["offset"], original) @@ -335,7 +350,8 @@ def main(): config["rule_categories"], config["explain_rule"], config["lines"], - input_text + input_text, + config["sound"] ) if len(response["matches"]) > 0: From 820853b7dbc41d7d610fd74c0ea553e8a7f40ecf Mon Sep 17 00:00:00 2001 From: fkarg Date: Thu, 1 Oct 2020 23:31:50 +0200 Subject: [PATCH 11/14] fix parameter order --- pylanguagetool/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pylanguagetool/__init__.py b/pylanguagetool/__init__.py index 727c2c9..ed8e7a2 100644 --- a/pylanguagetool/__init__.py +++ b/pylanguagetool/__init__.py @@ -350,8 +350,8 @@ def main(): config["rule_categories"], config["explain_rule"], config["lines"], - input_text, - config["sound"] + config["only_sound"], + input_text ) if len(response["matches"]) > 0: From 5aa4c55024ae1b9d818240e276745467f0cefd0e Mon Sep 17 00:00:00 2001 From: fkarg Date: Fri, 2 Oct 2020 00:08:00 +0200 Subject: [PATCH 12/14] fix travis build --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 88cdc8e..2ab9fb6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,8 +11,9 @@ before_install: - pip install poetry - pip install markdown2 - pip install beautifulsoup4 -install: - poetry install +install: + - pip install -e . # command to run tests script: - python -m pytest From 5d53b98fc776036f3eeee22349b34b84e20446c7 Mon Sep 17 00:00:00 2001 From: fkarg Date: Fri, 2 Oct 2020 00:35:39 +0200 Subject: [PATCH 13/14] install both build and dev dependencies --- .travis.yml | 6 +++--- pyproject.toml | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 2ab9fb6..161f1bf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,9 +9,9 @@ python: # command to install dependencies before_install: - pip install poetry - - pip install markdown2 - - pip install beautifulsoup4 - - poetry install + - poetry install --no-dev + - poetry install --no-root +# install and link python package install: - pip install -e . # command to run tests diff --git a/pyproject.toml b/pyproject.toml index 3a5d850..40b59d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ docutils = "^0.16" markdown2 = "^2.3.8" beautifulsoup4 = "^4.8.2" sphinx = "^2.3.1" +setuptools = "^50.3.0" [build-system] requires = ["poetry>=0.12"] From e8488f06cbc72f52e583609bafdf42a9f18dbd98 Mon Sep 17 00:00:00 2001 From: fkarg Date: Fri, 2 Oct 2020 00:40:56 +0200 Subject: [PATCH 14/14] ensure pip wheel and setuptools installed --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 161f1bf..c3555cd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,6 +8,7 @@ python: - "pypy3.5-7.0" # command to install dependencies before_install: + - pip install --upgrade pip setuptools wheel - pip install poetry - poetry install --no-dev - poetry install --no-root