Skip to content

Commit 538648e

Browse files
committed
Remove cache on disk during license detection.
* this has been disabled for a while and was creating problems while delivering only performance degradations. * premature optimization is the root of all evil. * we still use a file-level cache using a fast implementation that is not license specific. Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent 2c55d07 commit 538648e

File tree

20 files changed

+11
-584
lines changed

20 files changed

+11
-584
lines changed

setup.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,6 @@ def read(*names, **kwargs):
7878
# caching
7979
'zc.lockfile >= 1.0.0, < 2.0.0',
8080
'yg.lockfile >= 2.0.0, < 3.0.0',
81-
'diskcache >= 2.0.0, < 3.0.0',
8281
'psutil >= 5.0.0, < 6.0.0',
8382

8483
# textcode

src/licensedcode/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@
4343
root_dir = dirname(src_dir)
4444
cache_dir = join(root_dir, '.cache')
4545
license_index_cache_dir = join(cache_dir, 'license_index')
46-
license_matches_cache_dir = join(cache_dir, 'license_matches')
4746

4847
if not exists(license_index_cache_dir):
4948
fileutils.create_dir(license_index_cache_dir)

src/licensedcode/cache.py

Lines changed: 6 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (c) 2015 nexB Inc. and others. All rights reserved.
2+
# Copyright (c) 2016 nexB Inc. and others. All rights reserved.
33
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
44
# The ScanCode software is licensed under the Apache License version 2.0.
55
# Data generated with ScanCode require an acknowledgment.
@@ -24,7 +24,6 @@
2424

2525
from __future__ import absolute_import, print_function
2626

27-
from array import array
2827
from functools import partial
2928
from hashlib import md5
3029
from os.path import exists
@@ -34,25 +33,17 @@
3433

3534
import yg.lockfile # @UnresolvedImport
3635

37-
from commoncode.fileutils import create_dir
3836
from commoncode.fileutils import file_iter
3937
from commoncode import ignore
4038

4139
from licensedcode import src_dir
4240
from licensedcode import license_index_cache_dir
43-
from licensedcode import license_matches_cache_dir
4441

4542

4643
"""
47-
Caching on-disk of LicenseIndex and LicenseMatches:
48-
"""
49-
50-
51-
"""
52-
An on-disk persistent cache of LicenseIndex. The index is pickled and
53-
invalidated if there are any changes in the code or licenses text or rules.
54-
Loading and dumping the cached index is safe to use across multiple processes
55-
using lock files.
44+
An on-disk persistent cache of LicenseIndex. The index is pickled and invalidated if
45+
there are any changes in the code or licenses text or rules. Loading and dumping the
46+
cached index is safe to use across multiple processes using lock files.
5647
"""
5748

5849
index_lock_file = join(license_index_cache_dir, 'lockfile')
@@ -69,8 +60,8 @@ def tree_checksum(base_dir=src_dir, ignored=_ignored_from_hash):
6960
last modified time stamps.
7061
7162
The purpose is to detect is there has been any modification to source code,
72-
compiled code or licenses or rule files and use this as a proxyx to verify
73-
the cache consistency.
63+
compiled code or licenses or rule files and use this as a proxy to verify the
64+
cache consistency.
7465
"""
7566
hashable = [''.join([loc, str(getmtime(loc)), str(getsize(loc))])
7667
for loc in file_iter(base_dir, ignored=_ignored_from_hash)]
@@ -90,9 +81,6 @@ def get_or_build_index_from_cache(force_clear=False):
9081
try:
9182
# acquire lock and wait until timeout to get a lock or die
9283
with yg.lockfile.FileLock(index_lock_file, timeout=LICENSE_INDEX_LOCK_TIMEOUT):
93-
if force_clear:
94-
license_matches_cache.clear(0)
95-
9684
current_checksum = None
9785
# if we have a saved cached index
9886
if exists(tree_checksum_file) and exists(index_cache_file):
@@ -110,10 +98,6 @@ def get_or_build_index_from_cache(force_clear=False):
11098

11199
# Here, the cache is not consistent with the latest code and data:
112100
# It is either stale or non-existing: we need to cleanup/regen
113-
114-
# clear the LicenseMatch cache entirely
115-
license_matches_cache.clear(0)
116-
117101
# regen the index
118102
idx = LicenseIndex(get_rules())
119103
with open(index_cache_file, 'wb') as ifc:
@@ -128,82 +112,3 @@ def get_or_build_index_from_cache(force_clear=False):
128112
except yg.lockfile.FileLockTimeout:
129113
# TODO: unable to lock in a nicer way
130114
raise
131-
132-
133-
"""
134-
A cache of recent matches from queries and query runs.
135-
136-
Several files in the same project or codebase are highly likely have repeated
137-
identical license headers, texts or notices. Another common pattern is multiple
138-
copies of a complete (and possibly long) license text. By caching and returning
139-
the cached matches right away, we can avoid doing the same matching over and
140-
over.
141-
142-
The approach is to use the hash of a sequence of token ids as a cache key either
143-
for a whole query or a query run and to ignore the actual start position.
144-
As values we cache a list of LicenseMatch objects for this sequence of tokens.
145-
146-
When we have a cache hit, the returned cached LicenseMatch are adjusted for
147-
their query and line positions. This way we can have cache hits for the same
148-
sequence of tokens eventually starting at different positions in different
149-
queries.
150-
151-
The cached list of LicenseMatch may be empty: this way we also cache the absence
152-
of matches for a sequence of tokens. This absence of matches can be as costly to
153-
compute initially than an actual matches.
154-
"""
155-
156-
MATCH_CACHE = '0-cached'
157-
158-
159-
class LicenseMatchCache(object):
160-
"""
161-
A file-based cache for license matches.
162-
This is NOT thread-safe, but is multi-process safe.
163-
"""
164-
def __init__(self, cache_dir):
165-
self.cache_dir = cache_dir
166-
create_dir(cache_dir)
167-
from diskcache import Cache as Cache
168-
self.cache = Cache(cache_dir)
169-
170-
def key(self, tokens):
171-
"""
172-
Return a computed cache key for a sequence of query `tokens` numeric ids.
173-
"""
174-
return md5(array('h', tokens).tostring()).hexdigest()
175-
176-
def get(self, query_run):
177-
"""
178-
Return a sequence of cached LicenseMatch if found in the cache or None.
179-
It may return an empty sequence if this was a cached value.
180-
"""
181-
cache_key = self.key(query_run.tokens)
182-
cached = self.cache.get(cache_key)
183-
# either we did not get a hit or we got a hit to nothing (empty list)
184-
# which is a valid cached value
185-
if not cached:
186-
return cached
187-
188-
qrs = query_run.start
189-
qre = query_run.end
190-
return [lm.rebase(qrs, qre, MATCH_CACHE) for lm in cached]
191-
192-
def put(self, query_run, matches):
193-
"""
194-
Cache a license `matches` sequence given a `query run` tokens.
195-
"""
196-
cache_key = self.key(query_run.tokens)
197-
self.cache[cache_key] = matches
198-
return cache_key
199-
200-
def clear(self, *args):
201-
"""
202-
Purge the cache keeping up to `max_size` of the most recently created
203-
entries. If `max_size` is zero, the whole cache is purged.
204-
Raise an exception if a write lock cannot be acquired.
205-
"""
206-
self.cache.clear()
207-
208-
# global cache
209-
license_matches_cache = LicenseMatchCache(cache_dir=license_matches_cache_dir)

src/licensedcode/index.py

Lines changed: 3 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,6 @@
4444

4545
from licensedcode.frequent_tokens import global_tokens_by_ranks
4646

47-
from licensedcode.cache import license_matches_cache
48-
from licensedcode.cache import LicenseMatchCache
49-
5047
from licensedcode.match import get_texts
5148
from licensedcode.match import merge_matches
5249
from licensedcode.match import refine_matches
@@ -91,8 +88,6 @@
9188
TRACE_INDEXING_PERF = False
9289
TRACE_INDEXING_CHECK = False
9390

94-
TRACE_CACHE = False
95-
9691

9792
def logger_debug(*args):
9893
pass
@@ -140,9 +135,6 @@ def get_index():
140135
return _LICENSES_INDEX
141136

142137

143-
# Feature switch to use license cache or not (False is used only for testing)
144-
USE_CACHE = False
145-
146138
# Feature switch to enable or not ngram fragments detection
147139
USE_AHO_FRAGMENTS = False
148140

@@ -451,13 +443,13 @@ def debug_matches(self, matches, message, location=None, query_string=None, with
451443
print(it)
452444
print()
453445

454-
def match(self, location=None, query_string=None, min_score=0, detect_negative=True, use_cache=USE_CACHE):
446+
def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
455447
"""
456448
Return a sequence of LicenseMatch by matching the file at `location` or
457449
the `query_string` text against the index. Only include matches with
458450
scores greater or equal to `min_score`.
459451
460-
`detect_negative` and `use_cache` are for testing purpose only.
452+
`detect_negative` is for testing purpose only.
461453
"""
462454
assert 0 <= min_score <= 100
463455

@@ -474,35 +466,13 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=T
474466
return []
475467

476468
#######################################################################
477-
# Whole file matching: hash, cache and exact matching
469+
# Whole file matching: hash, negative and exact matching
478470
#######################################################################
479471
whole_query_run = qry.whole_query_run()
480472
if not whole_query_run or not whole_query_run.matchables:
481473
logger_debug('#match: whole query not matchable')
482474
return []
483475

484-
if use_cache:
485-
if use_cache is True:
486-
matches_cache = license_matches_cache
487-
else:
488-
# NOTE: this weird "if" is only for cache testing when use_cache
489-
# can contain a temp test cache_dir path and is not True
490-
matches_cache = LicenseMatchCache(cache_dir=use_cache)
491-
492-
# check cache
493-
if use_cache:
494-
cached_matches = matches_cache.get(whole_query_run)
495-
496-
if cached_matches is not None:
497-
if cached_matches:
498-
if TRACE_CACHE: self.debug_matches(cached_matches, '#match FINAL cache matched', location, query_string)
499-
# FIXME: should we filter and refine here?
500-
return cached_matches
501-
else:
502-
# cached but empty matches
503-
if TRACE_CACHE: self.debug_matches([], '#match FINAL cache matched to NOTHING', location, query_string)
504-
return []
505-
506476
# hash
507477
hash_matches = match_hash(self, whole_query_run)
508478
if hash_matches:
@@ -561,19 +531,8 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=T
561531
if hash_matches:
562532
if TRACE: self.debug_matches(hash_matches, ' #match Query run matches (hash)', location, query_string)
563533
matches.extend(hash_matches)
564-
# note that we do not cache hash matches
565534
continue
566535

567-
# cache short circuit
568-
#########################
569-
if use_cache:
570-
cached_matches = matches_cache.get(query_run)
571-
if cached_matches is not None:
572-
if TRACE_CACHE: self.debug_matches(cached_matches, ' #match Query run matches (cached)', location, query_string)
573-
if cached_matches:
574-
matches.extend(cached_matches)
575-
continue
576-
577536
# query run match proper using sequence matching
578537
#########################################
579538
if TRACE: logger_debug(' #match: Query run MATCHING proper....')
@@ -609,11 +568,6 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=T
609568

610569
if TRACE: self.debug_matches(run_matches, ' #match: Query run matches merged', location, query_string)
611570

612-
if use_cache:
613-
# always CACHE even and especially if no matches were found
614-
if TRACE_CACHE: self.debug_matches(run_matches, ' #match: Query run matches caching', location, query_string)
615-
matches_cache.put(query_run, run_matches)
616-
617571
# final matching merge, refinement and filtering
618572
################################################
619573
if matches:
@@ -631,11 +585,6 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=T
631585

632586
self.debug_matches(matches, '#match: FINAL MATCHES', location, query_string, with_text=True)
633587

634-
if use_cache:
635-
# always CACHE at the whole query ruk level even and especially if no matches were found: here whole query
636-
self.debug_matches(matches, '#match: Caching Final matches', location, query_string)
637-
matches_cache.put(whole_query_run, matches)
638-
639588
return matches
640589

641590
def negative_match(self, query_run):

src/licensedcode/match.py

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636

3737
from licensedcode import query
3838
from licensedcode.spans import Span
39-
from licensedcode import cache
4039
from licensedcode import MAX_DIST
4140

4241
"""
@@ -326,21 +325,6 @@ def update(self, other):
326325
self.query_run_start = min(self.query_run_start, other.query_run_start)
327326
return self
328327

329-
def rebase(self, new_query_start, new_query_end, matcher):
330-
"""
331-
Return a copy of this match with a new qspan updating the matcher of this
332-
copied match as needed.
333-
"""
334-
offset = new_query_start - self.query_run_start
335-
return LicenseMatch(
336-
rule=self.rule,
337-
qspan=self.qspan.rebase(offset),
338-
ispan=Span(self.ispan),
339-
hispan=Span(self.hispan),
340-
query_run_start=new_query_start,
341-
matcher=' '.join([self.matcher.replace(cache.MATCH_CACHE, '').strip(), matcher]),
342-
)
343-
344328
def small(self):
345329
"""
346330
Return True if this match is "small" based on its rule thresholds.

src/licensedcode/spans.py

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -419,23 +419,6 @@ def distance_to(self, other):
419419
else:
420420
return self.start - other.end
421421

422-
def rebase(self, offset):
423-
"""
424-
Return a copy of this span adding `offset` to each item
425-
426-
For example:
427-
>>> Span([4, 5]).rebase(0)
428-
Span(4, 5)
429-
>>> Span(4, 5).rebase(1)
430-
Span(5, 6)
431-
>>> Span([4, 5]).rebase(3)
432-
Span(7, 8)
433-
>>> Span([1, 4, 5, 8, 9]).rebase(5)
434-
Span(6)|Span(9, 10)|Span(13, 14)
435-
"""
436-
assert self.start + offset >= 0
437-
return Span([i + offset for i in self._set])
438-
439422
@staticmethod
440423
def from_ints(ints):
441424
"""

src/scancode/api.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,7 @@ def get_licenses(location, min_score=0, diag=False):
120120
idx = get_index()
121121
licenses = licenses_details()
122122

123-
# note: we do USE the cache here
124-
for match in idx.match(location=location, min_score=min_score, use_cache=False):
123+
for match in idx.match(location=location, min_score=min_score):
125124
for license_key in match.rule.licenses:
126125
lic = licenses.get(license_key)
127126
result = OrderedDict()

tests/licensedcode/data/cache/plain/bsd-new

Lines changed: 0 additions & 7 deletions
This file was deleted.

0 commit comments

Comments
 (0)