Improve score by supporting extra_phrase for extra words in rules #4432
base: develop
@@ -826,7 +826,11 @@ def to_dict(
         result['start_line'] = self.start_line
         result['end_line'] = self.end_line
         result['matcher'] = self.matcher
-        result['score'] = self.score()
+        # update the score if the `extra-words` are in valid positions
+        if is_extra_words_position_valid(match=self):
+            result['score'] = 100
+        else:
+            result['score'] = self.score()
         result['matched_length'] = self.len()
         result['match_coverage'] = self.coverage()
         result['rule_relevance'] = self.rule.relevance
@@ -1071,6 +1075,84 @@ def merge_matches(matches, max_dist=None, trace=TRACE_MERGE):
     # early from the loops: trying to check containment on wildly separated matches
     # does not make sense
 
+
+def is_extra_words_position_valid(match):

Review comment: I wonder if the scoring should not be more subtle. In practice, we can have multiple regions allowing for extra words, and some regions may have OK extra words while other regions may not, so this may not be a binary True or False (100 or not 100), but instead a score computation that is more nuanced?

+    """
+    Return True if the extra words appear in valid positions and
+    do not exceed the maximum allowed word count at those positions.
+    Otherwise, return False.
+    """
+    # determine whether the match has `extra-words` via the `query_coverage_coefficient`
+    score_coverage_relevance = (
+        match.coverage() * match.rule.relevance
+    ) / 100
+
+    # calculate the query coverage coefficient
+    query_coverage_coefficient = score_coverage_relevance - match.score()
+
+    # return False if the match has no extra words
+    if query_coverage_coefficient == 0:
+        return False
+
+    matched_tokens = list(index_tokenizer(match.matched_text(whole_lines=False, highlight=False)))
+    rule_tokens = list(index_tokenizer(match.rule.text))
+    extra_phrase_spans = match.rule.extra_phrase_spans
+
+    if not extra_phrase_spans:
+        return False
+
+    # count of `extra-words` tokens, i.e. tokens inserted in `matched_tokens`
+    matched_count = 0
+
+    # count of extra phrase markers seen so far
+    extra_phrase_count = 0
+
+    rule_index = 0
+    matched_index = 0
+
+    for span, allowed_extra_word in extra_phrase_spans:
+        rule_index = span.start
+
+        matched_index = span.start + matched_count - extra_phrase_count
+        extra_words_count = 0
+
+        # return False if the token before the `extra-words` in `matched_tokens`
+        # is not the same as the token before the `extra-phrase` in `rule_tokens`
+        if matched_tokens[matched_index - 1] != rule_tokens[rule_index - 1]:
+            return False
+
+        # count how many tokens in `matched_tokens` do not match the next rule token
+        while (matched_index < len(matched_tokens) and
+               matched_tokens[matched_index] != rule_tokens[rule_index + 1]):
+            matched_index += 1
+            matched_count += 1
+            extra_words_count += 1
+
+        if extra_words_count > allowed_extra_word:
+            return False
+
+        extra_phrase_count += 1
+        rule_index += 1
+
+    # check if any `extra-words` are present after all `extra-phrase-spans` in the rule have been checked
+    while (matched_index < len(matched_tokens) and
+           matched_tokens[matched_index] == rule_tokens[rule_index]):
+        matched_index += 1
+        rule_index += 1
+
+    # some `extra-words` remain outside the allowed positions
+    if matched_index != len(matched_tokens):
+        return False
+
+    return True
+
+
+def is_extra_words_at_valid_positions(license_matches):
+    """
+    Return True if any of the matches in `license_matches` that have `extra-words`
+    are in the right place.
+    """
+    return any(is_extra_words_position_valid(match) for match in license_matches)
+
+
 def filter_contained_matches(
     matches,
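
To picture what the new check does, here is a minimal, self-contained sketch of the same idea on plain token lists. It is an illustration only, not the scancode-toolkit implementation: the function name `simple_extra_words_ok` and the `(rule_token_index, allowed_extra_words)` span format are simplifying assumptions.

def simple_extra_words_ok(rule_tokens, matched_tokens, extra_phrase_spans):
    # each span is a hypothetical (rule_token_index, allowed_extra_words) pair
    # marking where extra words may be inserted and how many are allowed
    matched_index = 0
    rule_index = 0
    for rule_pos, allowed in extra_phrase_spans:
        # tokens before the marker must match the rule exactly
        while rule_index < rule_pos:
            if (matched_index >= len(matched_tokens)
                    or matched_tokens[matched_index] != rule_tokens[rule_index]):
                return False
            matched_index += 1
            rule_index += 1
        # count inserted tokens until the next rule token reappears
        next_rule_token = rule_tokens[rule_index] if rule_index < len(rule_tokens) else None
        extra = 0
        while (matched_index < len(matched_tokens)
                and matched_tokens[matched_index] != next_rule_token):
            matched_index += 1
            extra += 1
        if extra > allowed:
            return False
    # after the last marker, the remaining tokens must match exactly
    while rule_index < len(rule_tokens):
        if (matched_index >= len(matched_tokens)
                or matched_tokens[matched_index] != rule_tokens[rule_index]):
            return False
        matched_index += 1
        rule_index += 1
    return matched_index == len(matched_tokens)


rule = ['licensed', 'under', 'the', 'mit', 'license']
matched = ['licensed', 'under', 'the', 'very', 'permissive', 'mit', 'license']
print(simple_extra_words_ok(rule, matched, [(3, 2)]))  # True: 2 extra words, 2 allowed
print(simple_extra_words_ok(rule, matched, [(3, 1)]))  # False: 2 extra words, only 1 allowed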
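
On the review comment about a more nuanced score: one possible shape, sketched only as an illustration and not as an agreed design, is to penalize each extra-phrase region by how far it exceeds its allowance, instead of the binary choice between 100 and the coverage-based score. The `regions` input (one `(extra_words_found, extra_words_allowed)` pair per region) and the penalty weight are assumptions made for this sketch.

def nuanced_extra_words_score(base_score, regions, penalty_per_invalid_word=2.0):
    # `regions` holds one hypothetical (extra_words_found, extra_words_allowed)
    # pair per extra-phrase region; only words beyond an allowance are penalized
    invalid_words = sum(max(0, found - allowed) for found, allowed in regions)
    if invalid_words == 0:
        # every region stays within its allowance: report a full score
        return 100
    penalized = 100 - penalty_per_invalid_word * invalid_words
    # never report less than the plain coverage-based score
    return max(base_score, penalized)


# one region within its allowance, one with three words too many
print(nuanced_extra_words_score(92.5, [(2, 2), (5, 2)]))  # 94.0
print(nuanced_extra_words_score(92.5, [(1, 2)]))          # 100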