Skip to content

Commit 933063a

Browse files
committed
Fixed #488 - Improve performance for gen_license
* Created new functions to create ABOUT objects without doing any validation (such as filename, duplicated keys/rows, file fields, etc.), since the only information needed for `gen_license` is the `license_expression`. Signed-off-by: Chin Yeung Li <[email protected]>
1 parent 9da8e12 commit 933063a

File tree

5 files changed

+83
-14
lines changed

5 files changed

+83
-14
lines changed

docs/source/reference.rst

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -413,7 +413,8 @@ Options
413413
--djc api_url api_key Fetch licenses data from DejaCode License
414414
Library and create <license>.LICENSE to the
415415
OUTPUT location.
416-
416+
--scancode Indicate the input JSON file is from
417+
scancode_toolkit.
417418
--verbose Show all the errors and warnings.
418419
-h, --help Show this message and exit.
419420
@@ -441,6 +442,12 @@ Details
441442
442443
$ about gen_license --djc 'api_url' 'api_key' LOCATION OUTPUT
443444
445+
--scancode
446+
447+
Indicates the JSON input is from scancode toolkit license detection
448+
449+
$ about gen_license --scancode /home/project/scancode-license-detection.json OUTPUT
450+
444451
--verbose
445452
446453
This option tells the tool to show all errors found.

src/attributecode/cmd.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,9 @@
3636
from attributecode.attrib import DEFAULT_TEMPLATE_FILE, DEFAULT_LICENSE_SCORE
3737
from attributecode.attrib import generate_and_save as generate_attribution_doc
3838
from attributecode.gen import generate as generate_about_files, load_inventory
39-
from attributecode.model import collect_inventory, get_copy_list
39+
from attributecode.model import collect_inventory, collect_abouts_license_expression, collect_inventory_license_expression
4040
from attributecode.model import copy_redist_src
41+
from attributecode.model import get_copy_list
4142
from attributecode.model import pre_process_and_fetch_license_dict
4243
from attributecode.model import write_output
4344
from attributecode.transform import transform_csv_to_csv
@@ -302,12 +303,16 @@ def gen(location, output, android, fetch_license, fetch_license_djc, reference,
302303
metavar='api_url api_key',
303304
help='Fetch licenses from a DejaCode License Library.')
304305

306+
@click.option('--scancode',
307+
is_flag=True,
308+
help='Indicate the input JSON file is from scancode_toolkit.')
309+
305310
@click.option('--verbose',
306311
is_flag=True,
307312
help='Show all error and warning messages.')
308313

309314
@click.help_option('-h', '--help')
310-
def gen_license(location, output, djc, verbose):
315+
def gen_license(location, output, djc, scancode, verbose):
311316
"""
312317
Fetch licenses in the license_expression field and save to the output location.
313318
@@ -316,26 +321,25 @@ def gen_license(location, output, djc, verbose):
316321
OUTPUT: Path to a directory where license files are saved.
317322
"""
318323
print_version()
324+
api_url = ''
325+
api_key = ''
326+
errors = []
319327

320328
if location.endswith('.csv') or location.endswith('.json') or location.endswith('.xlsx'):
321-
_errors, abouts = load_inventory(
322-
location=location
323-
)
329+
abouts = collect_inventory_license_expression(location=location, scancode=scancode)
324330
else:
325-
_errors, abouts = collect_inventory(location)
326-
331+
#_errors, abouts = collect_inventory(location)
332+
errors, abouts = collect_abouts_license_expression(location)
327333

328334
log_file_loc = os.path.join(output, 'error.log')
329-
api_url = ''
330-
api_key = ''
331-
errors = []
332335
if djc:
333336
# Strip the ' and " for api_url, and api_key from input
334337
api_url = djc[0].strip("'").strip('"')
335338
api_key = djc[1].strip("'").strip('"')
336339

337340
click.echo('Fetching licenses...')
338-
license_dict, lic_errors = pre_process_and_fetch_license_dict(abouts, api_url, api_key)
341+
license_dict, lic_errors = pre_process_and_fetch_license_dict(abouts, api_url, api_key, scancode)
342+
339343
if lic_errors:
340344
errors.extend(lic_errors)
341345

src/attributecode/model.py

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
from attributecode import api
4949
from attributecode import Error
5050
from attributecode import saneyaml
51+
from attributecode import gen
5152
from attributecode import util
5253
from attributecode.transform import write_excel
5354
from attributecode.util import add_unc
@@ -1325,6 +1326,60 @@ def collect_inventory(location):
13251326
return unique(errors), abouts
13261327

13271328

1329+
def collect_abouts_license_expression(location):
    """
    Read the ABOUT files found at `location` and return a tuple of
    (errors, abouts), where `errors` is a list of Error objects and `abouts`
    is a list of About objects loaded without the usual validation.

    The purpose of this is to speed up the process for the `gen_license`
    command. A file that cannot be read or parsed does not abort the
    collection: a CRITICAL Error naming that file is recorded and the
    remaining files are still processed.
    """
    errors = []
    abouts = []
    input_location = util.get_absolute(location)

    for loc in util.get_about_locations(input_location):
        try:
            loc = add_unc(loc)
            with io.open(loc, encoding='utf-8', errors='replace') as txt:
                input_text = txt.read()
            # saneyaml.load() will have parsing error if the input has
            # tab value. Therefore, we should check if the input contains
            # any tab and then convert it to spaces.
            yaml_text = replace_tab_with_spaces(input_text)
            # NOTE(review): duplicate keys are presumably still rejected by
            # the loader because of allow_duplicate_keys=False -- confirm
            # this is wanted for a "no validation" code path.
            data = saneyaml.load(yaml_text, allow_duplicate_keys=False)
            about = About()
            about.load_dict(data, base_dir='')
            abouts.append(about)
        except Exception as e:
            trace = traceback.format_exc()
            msg = 'Cannot load invalid ABOUT file: %(location)r: %(e)r\n%(trace)s'
            # Report the ABOUT file that actually failed (`loc`), not the
            # top-level input `location` that was being walked.
            details = {'location': loc, 'e': e, 'trace': trace}
            errors.append(Error(CRITICAL, msg % details))

    return errors, abouts
1359+
1360+
1361+
def collect_inventory_license_expression(location, scancode=False):
    """
    Load the inventory file at `location` and return a list of About objects
    built from its rows without validation. This exists to speed up the
    `gen_license` command.

    When `scancode` is true the file is read with the scancode JSON loader;
    otherwise the loader is picked from the file extension: `.csv`, `.xlsx`,
    and the JSON loader for anything else.
    """
    if scancode:
        rows = gen.load_scancode_json(location)
    elif location.endswith('.csv'):
        rows = gen.load_csv(location)
    elif location.endswith('.xlsx'):
        # The Excel loader also returns duplicated-column errors, which are
        # deliberately ignored here.
        _dup_cols_err, rows = gen.load_excel(location)
    else:
        rows = gen.load_json(location)

    collected = []
    for row in rows:
        entry = About()
        entry.load_dict(row, base_dir='', scancode=scancode)
        collected.append(entry)
    return collected
1381+
1382+
13281383
def get_field_names(abouts):
13291384
"""
13301385
Given a list of About objects, return a list of any field names that exist
@@ -1628,7 +1683,10 @@ def pre_process_and_fetch_license_dict(abouts, api_url=None, api_key=None, scanc
16281683
lic_url = url + license_filename
16291684
spdx_license_key = data['spdx_license_key']
16301685
except:
1631-
msg = about.about_file_path + u" : Invalid 'license': " + lic_key
1686+
try:
1687+
msg = about.about_file_path + u" : Invalid 'license': " + lic_key
1688+
except:
1689+
msg = u"Invalid 'license': " + lic_key
16321690
errors.append(Error(ERROR, msg))
16331691
continue
16341692
detail_list.append(license_name)

src/attributecode/util.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -652,7 +652,6 @@ def load_scancode_json(location):
652652
"""
653653
Read the scancode JSON file at `location` and return a list of dictionaries.
654654
"""
655-
mapping_dict = {}
656655
updated_results = []
657656

658657
with open(location) as json_file:

tests/testdata/test_cmd/help/about_gen_license_help.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,6 @@ Usage: about gen-license [OPTIONS] LOCATION OUTPUT
99

1010
Options:
1111
--djc api_url api_key Fetch licenses from a DejaCode License Library.
12+
--scancode Indicate the input JSON file is from scancode_toolkit.
1213
--verbose Show all error and warning messages.
1314
-h, --help Show this message and exit.

0 commit comments

Comments
 (0)