|
38 | 38 | from licensedcode.frontmatter import dumps_frontmatter |
39 | 39 | from licensedcode.frontmatter import load_frontmatter |
40 | 40 | from licensedcode.languages import LANG_INFO as known_languages |
| 41 | +from licensedcode.stopwords import STOPWORDS |
41 | 42 | from licensedcode.tokenize import get_existing_required_phrase_spans |
42 | 43 | from licensedcode.tokenize import index_tokenizer |
43 | 44 | from licensedcode.tokenize import index_tokenizer_with_stopwords |
@@ -1691,7 +1692,6 @@ class BasicRule: |
1691 | 1692 | ) |
1692 | 1693 | ) |
1693 | 1694 |
|
1694 | | - |
1695 | 1695 | # These thresholds attributes are computed upon text loading or calling the |
1696 | 1696 | # thresholds function explicitly |
1697 | 1697 | ########################################################################### |
@@ -1960,7 +1960,7 @@ def validate(self, licensing=None, thorough=False): |
1960 | 1960 | if not is_false_positive: |
1961 | 1961 | if self.relevance == 0 and not self.is_deprecated: |
1962 | 1962 | yield 'Invalid stored relevance. Should be more than 0 for non-deprecated rule' |
1963 | | - |
| 1963 | + |
1964 | 1964 | if not (0 <= self.minimum_coverage <= 100): |
1965 | 1965 | yield 'Invalid rule minimum_coverage. Should be between 0 and 100.' |
1966 | 1966 |
|
@@ -1994,6 +1994,12 @@ def validate(self, licensing=None, thorough=False): |
1994 | 1994 | if self.is_generic(licenses_by_key=get_licenses_db()): |
1995 | 1995 | yield 'is_required_phrase rule cannot be a generic license.' |
1996 | 1996 |
|
| 1997 | + # no stopwords in short rules! or else exact matching is not accurate |
| 1998 | + stops_in_rule = get_stopwords_in_short_text(text=self.text, min_tokens=6) |
| 1999 | + if stops_in_rule: |
| 2000 | + sw = sorted(stops_in_rule) |
| 2001 | + yield f'Short is_required_phrase rule cannot contain stopwords: {sw}' |
| 2002 | + |
1997 | 2003 | if not license_expression: |
1998 | 2004 | yield 'Missing license_expression.' |
1999 | 2005 | else: |
@@ -2024,7 +2030,6 @@ def validate(self, licensing=None, thorough=False): |
2024 | 2030 | if self.is_deprecated and not self.replaced_by and not self.relevance == 0: |
2025 | 2031 | yield 'Invalid replaced_by: must be provided with is_deprecated_flag unless relevance is 0' |
2026 | 2032 |
|
2027 | | - |
2028 | 2033 | if thorough: |
2029 | 2034 | text = self.text |
2030 | 2035 | data = {"text": text} |
@@ -2206,6 +2211,18 @@ def to_dict(self, include_text=False): |
2206 | 2211 | return data |
2207 | 2212 |
|
2208 | 2213 |
|
def get_stopwords_in_short_text(text, min_tokens=4):
    """
    Return the set of STOPWORDS found in ``text`` when ``text`` has fewer than ``min_tokens``
    tokens. This set is empty when such a short text contains no stopword.
    Return None when ``text`` has ``min_tokens`` tokens or more: longer texts are not checked.

    The returned set is NOT sorted: callers that need a stable order (for instance to report
    the stopwords in a message) must sort it themselves.

    Stopwords in short texts may make exact matching inaccurate.
    """
    # An empty ``stopwords`` set is passed to the tokenizer, presumably so that stopwords
    # are kept as tokens and can be detected below rather than being filtered out.
    tokens = list(index_tokenizer(text, stopwords=frozenset(), preserve_case=False))

    # The length check counts every token occurrence, including duplicates.
    if len(tokens) >= min_tokens:
        return None

    return set(tokens).intersection(STOPWORDS)
| 2224 | + |
| 2225 | + |
2209 | 2226 | def has_only_lower_license_keys(license_expression, licensing=Licensing()): |
2210 | 2227 | """ |
2211 | 2228 | Return True if all license keys of ``license_expression`` are lowercase. |
@@ -2377,7 +2394,6 @@ def compute_thresholds(self, small_rule=SMALL_RULE, tiny_rule=TINY_RULE): |
2377 | 2394 | self.is_small = self.length < small_rule |
2378 | 2395 | self.is_tiny = self.length < tiny_rule |
2379 | 2396 |
|
2380 | | - |
2381 | 2397 | def dump(self, rules_data_dir, **kwargs): |
2382 | 2398 | """ |
2383 | 2399 | Dump a representation of this rule as a .RULE file stored in ``rules_data_dir`` as a UTF-8 |
|
0 commit comments