Skip to content

Commit 538648e

Browse files
committed
Remove cache on disk during license detection.
* this has been disabled for a while and was creating problems while delivering only performance degradations. * premature optimization is the root of all evil. * we still use a file-level cache using a fast implementation that is not license specific. Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent 2c55d07 commit 538648e

File tree

20 files changed

+11
-584
lines changed

20 files changed

+11
-584
lines changed

setup.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,6 @@ def read(*names, **kwargs):
7878
# caching
7979
'zc.lockfile >= 1.0.0, < 2.0.0',
8080
'yg.lockfile >= 2.0.0, < 3.0.0',
81-
'diskcache >= 2.0.0, < 3.0.0',
8281
'psutil >= 5.0.0, < 6.0.0',
8382

8483
# textcode

src/licensedcode/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@
4343
root_dir = dirname(src_dir)
4444
cache_dir = join(root_dir, '.cache')
4545
license_index_cache_dir = join(cache_dir, 'license_index')
46-
license_matches_cache_dir = join(cache_dir, 'license_matches')
4746

4847
if not exists(license_index_cache_dir):
4948
fileutils.create_dir(license_index_cache_dir)

src/licensedcode/cache.py

Lines changed: 6 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (c) 2015 nexB Inc. and others. All rights reserved.
2+
# Copyright (c) 2016 nexB Inc. and others. All rights reserved.
33
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
44
# The ScanCode software is licensed under the Apache License version 2.0.
55
# Data generated with ScanCode require an acknowledgment.
@@ -24,7 +24,6 @@
2424

2525
from __future__ import absolute_import, print_function
2626

27-
from array import array
2827
from functools import partial
2928
from hashlib import md5
3029
from os.path import exists
@@ -34,25 +33,17 @@
3433

3534
import yg.lockfile # @UnresolvedImport
3635

37-
from commoncode.fileutils import create_dir
3836
from commoncode.fileutils import file_iter
3937
from commoncode import ignore
4038

4139
from licensedcode import src_dir
4240
from licensedcode import license_index_cache_dir
43-
from licensedcode import license_matches_cache_dir
4441

4542

4643
"""
47-
Caching on-disk of LicenseIndex and LicenseMatches:
48-
"""
49-
50-
51-
"""
52-
An on-disk persistent cache of LicenseIndex. The index is pickled and
53-
invalidated if there are any changes in the code or licenses text or rules.
54-
Loading and dumping the cached index is safe to use across multiple processes
55-
using lock files.
44+
An on-disk persistent cache of LicenseIndex. The index is pickled and invalidated if
45+
there are any changes in the code or licenses text or rules. Loading and dumping the
46+
cached index is safe to use across multiple processes using lock files.
5647
"""
5748

5849
index_lock_file = join(license_index_cache_dir, 'lockfile')
@@ -69,8 +60,8 @@ def tree_checksum(base_dir=src_dir, ignored=_ignored_from_hash):
6960
last modified time stamps.
7061
7162
The purpose is to detect is there has been any modification to source code,
72-
compiled code or licenses or rule files and use this as a proxyx to verify
73-
the cache consistency.
63+
compiled code or licenses or rule files and use this as a proxy to verify the
64+
cache consistency.
7465
"""
7566
hashable = [''.join([loc, str(getmtime(loc)), str(getsize(loc))])
7667
for loc in file_iter(base_dir, ignored=_ignored_from_hash)]
@@ -90,9 +81,6 @@ def get_or_build_index_from_cache(force_clear=False):
9081
try:
9182
# acquire lock and wait until timeout to get a lock or die
9283
with yg.lockfile.FileLock(index_lock_file, timeout=LICENSE_INDEX_LOCK_TIMEOUT):
93-
if force_clear:
94-
license_matches_cache.clear(0)
95-
9684
current_checksum = None
9785
# if we have a saved cached index
9886
if exists(tree_checksum_file) and exists(index_cache_file):
@@ -110,10 +98,6 @@ def get_or_build_index_from_cache(force_clear=False):
11098

11199
# Here, the cache is not consistent with the latest code and data:
112100
# It is either stale or non-existing: we need to cleanup/regen
113-
114-
# clear the LicenseMatch cache entirely
115-
license_matches_cache.clear(0)
116-
117101
# regen the index
118102
idx = LicenseIndex(get_rules())
119103
with open(index_cache_file, 'wb') as ifc:
@@ -128,82 +112,3 @@ def get_or_build_index_from_cache(force_clear=False):
128112
except yg.lockfile.FileLockTimeout:
129113
# TODO: unable to lock in a nicer way
130114
raise
131-
132-
133-
"""
134-
A cache of recent matches from queries and query runs.
135-
136-
Several files in the same project or codebase are highly likely have repeated
137-
identical license headers, texts or notices. Another common pattern is multiple
138-
copies of a complete (and possibly long) license text. By caching and returning
139-
the cached matches right away, we can avoid doing the same matching over and
140-
over.
141-
142-
The approach is to use the hash of a sequence of token ids as a cache key either
143-
for a whole query or a query run and to ignore the actual start position.
144-
As values we cache a list of LicenseMatch objects for this sequence of tokens.
145-
146-
When we have a cache hit, the returned cached LicenseMatch are adjusted for
147-
their query and line positions. This way we can have cache hits for the same
148-
sequence of tokens eventually starting at different positions in different
149-
queries.
150-
151-
The cached list of LicenseMatch may be empty: this way we also cache the absence
152-
of matches for a sequence of tokens. This absence of matches can be as costly to
153-
compute initially than an actual matches.
154-
"""
155-
156-
MATCH_CACHE = '0-cached'
157-
158-
159-
class LicenseMatchCache(object):
160-
"""
161-
A file-based cache for license matches.
162-
This is NOT thread-safe, but is multi-process safe.
163-
"""
164-
def __init__(self, cache_dir):
165-
self.cache_dir = cache_dir
166-
create_dir(cache_dir)
167-
from diskcache import Cache as Cache
168-
self.cache = Cache(cache_dir)
169-
170-
def key(self, tokens):
171-
"""
172-
Return a computed cache key for a sequence of query `tokens` numeric ids.
173-
"""
174-
return md5(array('h', tokens).tostring()).hexdigest()
175-
176-
def get(self, query_run):
177-
"""
178-
Return a sequence of cached LicenseMatch if found in the cache or None.
179-
It may return an empty sequence if this was a cached value.
180-
"""
181-
cache_key = self.key(query_run.tokens)
182-
cached = self.cache.get(cache_key)
183-
# either we did not get a hit or we got a hit to nothing (empty list)
184-
# which is a valid cached value
185-
if not cached:
186-
return cached
187-
188-
qrs = query_run.start
189-
qre = query_run.end
190-
return [lm.rebase(qrs, qre, MATCH_CACHE) for lm in cached]
191-
192-
def put(self, query_run, matches):
193-
"""
194-
Cache a license `matches` sequence given a `query run` tokens.
195-
"""
196-
cache_key = self.key(query_run.tokens)
197-
self.cache[cache_key] = matches
198-
return cache_key
199-
200-
def clear(self, *args):
201-
"""
202-
Purge the cache keeping up to `max_size` of the most recently created
203-
entries. If `max_size` is zero, the whole cache is purged.
204-
Raise an exception if a write lock cannot be acquired.
205-
"""
206-
self.cache.clear()
207-
208-
# global cache
209-
license_matches_cache = LicenseMatchCache(cache_dir=license_matches_cache_dir)

src/licensedcode/index.py

Lines changed: 3 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,6 @@
4444

4545
from licensedcode.frequent_tokens import global_tokens_by_ranks
4646

47-
from licensedcode.cache import license_matches_cache
48-
from licensedcode.cache import LicenseMatchCache
49-
5047
from licensedcode.match import get_texts
5148
from licensedcode.match import merge_matches
5249
from licensedcode.match import refine_matches
@@ -91,8 +88,6 @@
9188
TRACE_INDEXING_PERF = False
9289
TRACE_INDEXING_CHECK = False
9390

94-
TRACE_CACHE = False
95-
9691

9792
def logger_debug(*args):
9893
pass
@@ -140,9 +135,6 @@ def get_index():
140135
return _LICENSES_INDEX
141136

142137

143-
# Feature switch to use license cache or not (False is used only for testing)
144-
USE_CACHE = False
145-
146138
# Feature switch to enable or not ngram fragments detection
147139
USE_AHO_FRAGMENTS = False
148140

@@ -451,13 +443,13 @@ def debug_matches(self, matches, message, location=None, query_string=None, with
451443
print(it)
452444
print()
453445

454-
def match(self, location=None, query_string=None, min_score=0, detect_negative=True, use_cache=USE_CACHE):
446+
def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
455447
"""
456448
Return a sequence of LicenseMatch by matching the file at `location` or
457449
the `query_string` text against the index. Only include matches with
458450
scores greater or equal to `min_score`.
459451
460-
`detect_negative` and `use_cache` are for testing purpose only.
452+
`detect_negative` is for testing purpose only.
461453
"""
462454
assert 0 <= min_score <= 100
463455

@@ -474,35 +466,13 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=T
474466
return []
475467

476468
#######################################################################
477-
# Whole file matching: hash, cache and exact matching
469+
# Whole file matching: hash, negative and exact matching
478470
#######################################################################
479471
whole_query_run = qry.whole_query_run()
480472
if not whole_query_run or not whole_query_run.matchables:
481473
logger_debug('#match: whole query not matchable')
482474
return []
483475

484-
if use_cache:
485-
if use_cache is True:
486-
matches_cache = license_matches_cache
487-
else:
488-
# NOTE: this weird "if" is only for cache testing when use_cache
489-
# can contain a temp test cache_dir path and is not True
490-
matches_cache = LicenseMatchCache(cache_dir=use_cache)
491-
492-
# check cache
493-
if use_cache:
494-
cached_matches = matches_cache.get(whole_query_run)
495-
496-
if cached_matches is not None:
497-
if cached_matches:
498-
if TRACE_CACHE: self.debug_matches(cached_matches, '#match FINAL cache matched', location, query_string)
499-
# FIXME: should we filter and refine here?
500-
return cached_matches
501-
else:
502-
# cached but empty matches
503-
if TRACE_CACHE: self.debug_matches([], '#match FINAL cache matched to NOTHING', location, query_string)
504-
return []
505-
506476
# hash
507477
hash_matches = match_hash(self, whole_query_run)
508478
if hash_matches:
@@ -561,19 +531,8 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=T
561531
if hash_matches:
562532
if TRACE: self.debug_matches(hash_matches, ' #match Query run matches (hash)', location, query_string)
563533
matches.extend(hash_matches)
564-
# note that we do not cache hash matches
565534
continue
566535

567-
# cache short circuit
568-
#########################
569-
if use_cache:
570-
cached_matches = matches_cache.get(query_run)
571-
if cached_matches is not None:
572-
if TRACE_CACHE: self.debug_matches(cached_matches, ' #match Query run matches (cached)', location, query_string)
573-
if cached_matches:
574-
matches.extend(cached_matches)
575-
continue
576-
577536
# query run match proper using sequence matching
578537
#########################################
579538
if TRACE: logger_debug(' #match: Query run MATCHING proper....')
@@ -609,11 +568,6 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=T
609568

610569
if TRACE: self.debug_matches(run_matches, ' #match: Query run matches merged', location, query_string)
611570

612-
if use_cache:
613-
# always CACHE even and especially if no matches were found
614-
if TRACE_CACHE: self.debug_matches(run_matches, ' #match: Query run matches caching', location, query_string)
615-
matches_cache.put(query_run, run_matches)
616-
617571
# final matching merge, refinement and filtering
618572
################################################
619573
if matches:
@@ -631,11 +585,6 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=T
631585

632586
self.debug_matches(matches, '#match: FINAL MATCHES', location, query_string, with_text=True)
633587

634-
if use_cache:
635-
# always CACHE at the whole query ruk level even and especially if no matches were found: here whole query
636-
self.debug_matches(matches, '#match: Caching Final matches', location, query_string)
637-
matches_cache.put(whole_query_run, matches)
638-
639588
return matches
640589

641590
def negative_match(self, query_run):

src/licensedcode/match.py

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636

3737
from licensedcode import query
3838
from licensedcode.spans import Span
39-
from licensedcode import cache
4039
from licensedcode import MAX_DIST
4140

4241
"""
@@ -326,21 +325,6 @@ def update(self, other):
326325
self.query_run_start = min(self.query_run_start, other.query_run_start)
327326
return self
328327

329-
def rebase(self, new_query_start, new_query_end, matcher):
330-
"""
331-
Return a copy of this match with a new qspan updating the matcher of this
332-
copied match as needed.
333-
"""
334-
offset = new_query_start - self.query_run_start
335-
return LicenseMatch(
336-
rule=self.rule,
337-
qspan=self.qspan.rebase(offset),
338-
ispan=Span(self.ispan),
339-
hispan=Span(self.hispan),
340-
query_run_start=new_query_start,
341-
matcher=' '.join([self.matcher.replace(cache.MATCH_CACHE, '').strip(), matcher]),
342-
)
343-
344328
def small(self):
345329
"""
346330
Return True if this match is "small" based on its rule thresholds.

src/licensedcode/spans.py

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -419,23 +419,6 @@ def distance_to(self, other):
419419
else:
420420
return self.start - other.end
421421

422-
def rebase(self, offset):
423-
"""
424-
Return a copy of this span adding `offset` to each item
425-
426-
For example:
427-
>>> Span([4, 5]).rebase(0)
428-
Span(4, 5)
429-
>>> Span(4, 5).rebase(1)
430-
Span(5, 6)
431-
>>> Span([4, 5]).rebase(3)
432-
Span(7, 8)
433-
>>> Span([1, 4, 5, 8, 9]).rebase(5)
434-
Span(6)|Span(9, 10)|Span(13, 14)
435-
"""
436-
assert self.start + offset >= 0
437-
return Span([i + offset for i in self._set])
438-
439422
@staticmethod
440423
def from_ints(ints):
441424
"""

src/scancode/api.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,7 @@ def get_licenses(location, min_score=0, diag=False):
120120
idx = get_index()
121121
licenses = licenses_details()
122122

123-
# note: we do USE the cache here
124-
for match in idx.match(location=location, min_score=min_score, use_cache=False):
123+
for match in idx.match(location=location, min_score=min_score):
125124
for license_key in match.rule.licenses:
126125
lic = licenses.get(license_key)
127126
result = OrderedDict()

tests/licensedcode/data/cache/plain/bsd-new

Lines changed: 0 additions & 7 deletions
This file was deleted.

0 commit comments

Comments
 (0)