aboutcode-org
diff --git a/‎etc/scripts/licenses/buildrules.py‎
Lines changed: 1 addition & 1 deletion b/‎etc/scripts/licenses/buildrules.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/cluecode/copyrights.py‎
Lines changed: 40 additions & 15 deletions b/‎src/cluecode/copyrights.py‎
Lines changed: 40 additions & 15 deletions
diff --git a/‎src/cluecode/plugin_filter_clues.py‎
Lines changed: 22 additions & 20 deletions b/‎src/cluecode/plugin_filter_clues.py‎
Lines changed: 22 additions & 20 deletions
diff --git a/‎src/licensedcode/data/licenses/bitstream.yml‎
Lines changed: 1 addition & 2 deletions b/‎src/licensedcode/data/licenses/bitstream.yml‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎src/licensedcode/data/licenses/xfree86-1.0.yml‎
Lines changed: 1 addition & 2 deletions b/‎src/licensedcode/data/licenses/xfree86-1.0.yml‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎src/licensedcode/data/licenses/xfree86-1.1.yml‎
Lines changed: 0 additions & 1 deletion b/‎src/licensedcode/data/licenses/xfree86-1.1.yml‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/licensedcode/data/rules/other-permissive_181.yml‎
Lines changed: 0 additions & 1 deletion b/‎src/licensedcode/data/rules/other-permissive_181.yml‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/licensedcode/data/rules/x11-opengroup_1.yml‎
Lines changed: 1 addition & 2 deletions b/‎src/licensedcode/data/rules/x11-opengroup_1.yml‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎src/licensedcode/data/rules/x11-opengroup_2.yml‎
Lines changed: 1 addition & 2 deletions b/‎src/licensedcode/data/rules/x11-opengroup_2.yml‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎src/licensedcode/data/rules/x11-opengroup_4.yml‎
Lines changed: 1 addition & 2 deletions b/‎src/licensedcode/data/rules/x11-opengroup_4.yml‎
Lines changed: 1 addition & 2 deletions
@@ -235,8 +235,8 @@ def cli(licenses_file):
             print('Skipping already added rule with text for:', base_name)
         else:
             rules_tokens.add(rule_tokens)
-            rulerec.dump()
             models.update_ignorables(rulerec, verbose=False)
+            rulerec.dump()
             print('Rule added:', 'file://' + rulerec.data_file, '\n', 'file://' + rulerec.text_file,)
 
 
 
@@ -25,13 +25,15 @@
 if os.environ.get('SCANCODE_DEBUG_COPYRIGHT_DEEP'):
     TRACE_DEEP = 1
 
+TRACE_TOK = False or os.environ.get('SCANCODE_DEBUG_COPYRIGHT_TOKEN', False)
+
 
 # Tracing flags
 def logger_debug(*args):
     pass
 
 
-if TRACE or TRACE_DEEP:
+if TRACE or TRACE_DEEP or TRACE_TOK:
     import logging
 
     logger = logging.getLogger(__name__)
@@ -98,7 +100,18 @@ def detect_copyrights_from_lines(numbered_lines, copyrights=True, holders=True,
     """
     detector = CopyrightDetector()
 
-    for candidates in candidate_lines(numbered_lines):
+    candidate_lines_groups = candidate_lines(numbered_lines)
+    if TRACE:
+        candidate_lines_groups = list(candidate_lines_groups)
+        logger_debug(
+            f'detect_copyrights_from_lines: ALL groups of candidate '
+            f'lines collected: {len(candidate_lines_groups)}')
+
+    for candidates in candidate_lines_groups:
+        if TRACE:
+            from pprint import pformat
+            can = pformat(candidates, width=160)
+            logger_debug(f' detect_copyrights_from_lines: processing candidates group:\n{can}')
 
         detections = detector.detect(
             numbered_lines=candidates,
@@ -109,13 +122,17 @@ def detect_copyrights_from_lines(numbered_lines, copyrights=True, holders=True,
             include_allrights=include_allrights
         )
 
+        if TRACE:
+            detections = list(detections)
+            logger_debug(f' detect_copyrights_from_lines: {detections}')
+
         for detection in detections:
             # tuple of type, string, start, end
             yield detection
+
         if time() > deadline:
             break
 
-
 ################################################################################
 # DETECTION PROPER
 ################################################################################
@@ -149,8 +166,15 @@ def detect(self, numbered_lines,
         numbered_lines = list(numbered_lines)
         start_line = numbered_lines[0][0]
         end_line = numbered_lines[-1][0]
+
+        if TRACE: logger_debug(f'CopyrightDetector:numbered_lines: {numbered_lines}')
+
         tokens = self.get_tokens(numbered_lines)
 
+        if TRACE:
+            tokens = list(tokens)
+            logger_debug(f'CopyrightDetector:tokens: {tokens}')
+
         if not tokens:
             return
 
@@ -242,9 +266,9 @@ def get_tokens(self, numbered_lines):
         tokens_append = tokens.append
 
         for _line_number, line in numbered_lines:
-            if TRACE: logger_debug('  get_tokens:  bare line: ' + repr(line))
+            if TRACE_TOK: logger_debug('  get_tokens:  bare line: ' + repr(line))
             line = prepare_text_line(line)
-            if TRACE: logger_debug('  get_tokens:preped line: ' + repr(line))
+            if TRACE_TOK: logger_debug('  get_tokens:preped line: ' + repr(line))
             for tok in splitter(line):
                 # strip trailing single quotes and ignore empties
                 tok = tok.strip("' ")
@@ -254,7 +278,9 @@ def get_tokens(self, numbered_lines):
                 tok = tok.lstrip('@').strip()
                 if tok and tok not in (':',):
                     tokens_append(tok)
-        if TRACE: logger_debug('  get_tokens:tokens: ' + repr(tokens))
+        if TRACE_TOK:
+            logger_debug('  get_tokens:tokens: ' + repr(tokens))
+            logger_debug('  get_tokens: ALL tokens collected')
         return tokens
 
     @classmethod
@@ -442,7 +468,7 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
 
     # CamELCaseeXXX is typcally JUNK such as code variable names
     # AzaAzaaaAz BBSDSB002923,
-    (r'^([A-Z][a-z]+){3,}[A-Z]+[0-9]*,?$', 'JUNK'),
+    (r'^([A-Z][a-z]+){3,20}[A-Z]+[0-9]*,?$', 'JUNK'),
 
     # multiple parens (at least two (x) groups) is a sign of junk
     # such as in (1)(ii)(OCT
@@ -501,7 +527,7 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
     (r'^Last$', 'JUNK'),
     (r'^[Rr]eleased?$', 'JUNK'),
     (r'^[Cc]opyrighting$', 'JUNK'),
-    (r'^Authori.*$', 'JUNK'),
+    (r'^[Aa]uthori.*$', 'JUNK'),
     (r'^such$', 'JUNK'),
     (r'^[Aa]ssignments?[.,]?$', 'JUNK'),
     (r'^[Bb]uild$', 'JUNK'),
@@ -805,7 +831,7 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
     (r'^Assignment', 'NN'),
     (r'^Atomic$', 'NN'),
     (r'^Attribution$', 'NN'),
-    (r'^Authored$', 'NN'),
+    (r'^[Aa]uthored$', 'NN'),
     (r'^Baslerstr\.?$', 'NN'),
     (r'^BSD$', 'NN'),
     (r'^BUT$', 'NN'),
@@ -1446,8 +1472,8 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
     ############################################################################
 
      # email start-at-end: <sebastian.classen at freenet.ag>: <EMAIL_START> <AT> <EMAIL_END>
-    (r'^<([a-zA-Z]+[a-zA-Z\.]){3,}$', 'EMAIL_START'),
-    (r'^[a-zA-Z\.]{2,}>$', 'EMAIL_END'),
+    (r'^<([a-zA-Z]+[a-zA-Z\.]){2,5}$', 'EMAIL_START'),
+    (r'^[a-zA-Z\.]{2,5}>$', 'EMAIL_END'),
 
     # a .sh shell scripts is NOT an email.
     (r'^.*\.sh\.?$', 'JUNK'),
@@ -3037,9 +3063,9 @@ def is_end_of_statement(chars_only_line):
 
 def candidate_lines(numbered_lines):
     """
-    Yield lists of candidate lines where each list element is a tuple of
-    (line number,  line text) given an iterable of numbered_lines as tuples of
-    (line number,  line text) .
+    Yield groups of candidate lines as lists where each list element is a tuple
+    of (line number, line text) given an iterable of numbered_lines as tuples
+    of (line number, line text).
 
     A candidate line is a line of text that may contain copyright statements.
     A few lines before and after a candidate line are also included.
@@ -3082,7 +3108,6 @@ def candidate_lines(numbered_lines):
             previous_chars = chars_only
             if TRACE: logger_debug('   candidate_lines: line is candidate')
 
-
         elif 's>' in line:
             # this is for debian-style <s></s> copyright name tags
             # the state is now "in copyright"
 
@@ -38,8 +38,8 @@ def logger_debug(*args):
 @post_scan_impl
 class RedundantCluesFilter(PostScanPlugin):
     """
-    Filter redundant clues (copyrights, authors, emails, and urls) that are already
-    contained in another more important scan result.
+    Filter redundant clues (copyrights, authors, emails, and urls) that are
+    already contained in a matched license text.
     """
     sort_order = 1
 
@@ -275,13 +275,14 @@ def filter_ignorable_clues(detections, rules_by_id):
 
 def filter_values(attributes, ignorables, value_key='value', strip=''):
     """
-    Yield filtered `attributes` based on line positions and values found in a
-    ignorables.
+    Yield filtered ``attributes`` based on line positions and values found in an
+    ``ignorables`` Ignorables object. Use the ``value_key`` key for getting the
+    value.
 
     `attributes` is a list of mappings that contain a `start_line`, `end_line`
     and a `value_key` key.
 
-    Optionally strip `strip` from the the values.
+    Optionally strip the ``strip`` characters from the values.
     """
     for item in attributes:
         if TRACE:
@@ -296,7 +297,7 @@ def filter_values(attributes, ignorables, value_key='value', strip=''):
 
         for ign in ignorables:
             if TRACE: logger_debug('   filter_values: ign:', ign)
-            if (ls in ign.lines_range or el in ign.lines_range) and val in ign.value:
+            if (ls in ign.lines_range or el in ign.lines_range)  and val in ign.value:
                 ignored = True
                 if TRACE: logger_debug('   filter_values: skipped')
                 break
@@ -307,13 +308,12 @@ def filter_values(attributes, ignorables, value_key='value', strip=''):
 
 def collect_ignorables(license_matches, rules_by_id):
     """
-    Collect and return an ignorable Clues object built from `license_matches`
-    matched licenses which is the list of "licenses" objects returned in JSON
-    results.
+    Collect and return an Ignorables object built from the ``license_matches``
+    list of matched "licenses" objects returned in ScanCode JSON results and
+    the ``rules_by_id`` mapping of Rule objects by identifier.
 
-    The value of each ignorable list of clues is a set of (set of
-    lines number, set of ignorable values). The return values is a mapping
-    {label: ignorables}.
+    The value of each ignorable list of clues is a set of (set of line numbers,
+    set of ignorable values).
     """
     emails = set()
     urls = set()
@@ -323,16 +323,17 @@ def collect_ignorables(license_matches, rules_by_id):
 
     if not license_matches:
         return Ignorables(
-        copyrights=frozenset(copyrights),
-        holders=frozenset(holders),
-        authors=frozenset(authors),
-        urls=frozenset(urls),
-        emails=frozenset(emails),
-    )
+            copyrights=frozenset(copyrights),
+            holders=frozenset(holders),
+            authors=frozenset(authors),
+            urls=frozenset(urls),
+            emails=frozenset(emails),
+        )
     # build tuple of (set of lines number, set of ignorbale values)
     for lic in license_matches:
 
-        if TRACE: logger_debug('collect_ignorables: license:', lic['key'], lic['score'])
+        if TRACE:
+            logger_debug('collect_ignorables: license:', lic['key'], lic['score'])
 
         matched_rule = lic.get('matched_rule', {})
         rid = matched_rule.get('identifier')
@@ -341,7 +342,8 @@ def collect_ignorables(license_matches, rules_by_id):
         # ignore poor partial matches
         # TODO: there must be a better way using coverage
         if match_coverage < 90:
-            if TRACE: logger_debug('  collect_ignorables: skipping, match_coverage under 90%')
+            if TRACE:
+                logger_debug('  collect_ignorables: skipping, match_coverage under 90%')
             continue
 
         if not rid:
 
@@ -8,5 +8,4 @@ spdx_license_key: LicenseRef-scancode-bitstream
 text_urls:
     - http://www.gnome.org/fonts/#Final_Bitstream_Vera_Fonts
 minimum_coverage: 50
-ignorable_authors:
-    - authorization from the Gnome Foundation or Bitstream Inc.
+
@@ -8,5 +8,4 @@ spdx_license_key: LicenseRef-scancode-xfree86-1.0
 text_urls:
     - http://www.xfree86.org/current/LICENSE5.html
 minimum_coverage: 80
-ignorable_authors:
-    - authorization from the XFree86 Project
+
@@ -11,6 +11,5 @@ faq_url: http://www.xfree86.org/current/LICENSE4.html
 minimum_coverage: 70
 ignorable_authors:
     - The XFree86 Project, Inc (http://www.xfree86.org/)
-    - authorization from The XFree86 Project, Inc.
 ignorable_urls:
     - http://www.xfree86.org/
@@ -5,6 +5,5 @@ notes: Seen in https://github.com/apple-opensource-mirror/X11/blob/43e6e6c55db66
     This is a mix of Apache-1.1 and xfree86-1.1
 ignorable_authors:
     - X-Oz Technologies (http://www.x-oz.com/)
-    - authorization from X-Oz Technologies
 ignorable_urls:
     - http://www.x-oz.com/
@@ -1,5 +1,4 @@
 license_expression: x11-opengroup
 is_license_text: yes
 minimum_coverage: 50
-ignorable_authors:
-    - authorization from The Open Group
+
@@ -3,5 +3,4 @@ is_license_text: yes
 relevance: 90
 minimum_coverage: 90
 notes: truncated text
-ignorable_authors:
-    - authorization from The Open Group
+
@@ -1,5 +1,4 @@
 license_expression: x11-opengroup
 is_license_text: yes
 relevance: 100
-ignorable_authors:
-    - authorization from The Open Group
+