Skip to content

Commit 536f354

Browse files
authored
Merge pull request #786 from nexB/775-npm-error
Fix npm package.json parsing errors, improve parsing
2 parents ba5be09 + b0ae573 commit 536f354

File tree

18 files changed

+5873
-385
lines changed

18 files changed

+5873
-385
lines changed

src/packagedcode/models.py

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -123,12 +123,12 @@
123123
"""
124124

125125

126-
class ListType(ListType):
126+
class BaseListType(ListType):
127127
"""
128128
ListType with a default of an empty list.
129129
"""
130130
def __init__(self, field, **kwargs):
131-
super(ListType, self).__init__(field=field, default=[], **kwargs)
131+
super(BaseListType, self).__init__(field=field, default=[], **kwargs)
132132

133133

134134
PackageId = namedtuple('PackageId', 'type name version')
@@ -361,7 +361,7 @@ class Repository(BaseModel):
361361
label='public repository',
362362
description='A flag set to true if this is a public repository.')
363363

364-
mirror_urls = ListType(URIType)
364+
mirror_urls = BaseListType(URIType)
365365
mirror_urls.metadata = dict(
366366
label='repository mirror urls',
367367
description='A list of URLs for mirrors of this repository.')
@@ -502,7 +502,7 @@ def resolve(self):
502502
# A normalized list of version constraints for this dep. This is package-
503503
# independent and should be a normalized data structure describing all the
504504
# different version range constraints
505-
# normalized_version_constraints = ListType(StringType())
505+
# normalized_version_constraints = BaseListType(StringType())
506506
raise NotImplementedError()
507507

508508

@@ -644,42 +644,42 @@ class Package(BaseModel):
644644
description='Release date of the package')
645645

646646
# FIXME: this would be simpler as a list where each Party has also a type
647-
authors = ListType(ModelType(Party))
647+
authors = BaseListType(ModelType(Party))
648648
authors.metadata = dict(
649649
label='authors',
650650
description='A list of party objects. Note: this model schema will change soon.')
651651

652-
maintainers = ListType(ModelType(Party))
652+
maintainers = BaseListType(ModelType(Party))
653653
maintainers.metadata = dict(
654654
label='maintainers',
655655
description='A list of party objects. Note: this model schema will change soon.')
656656

657-
contributors = ListType(ModelType(Party))
657+
contributors = BaseListType(ModelType(Party))
658658
contributors.metadata = dict(
659659
label='contributors',
660660
description='A list of party objects. Note: this model schema will change soon.')
661661

662-
owners = ListType(ModelType(Party))
662+
owners = BaseListType(ModelType(Party))
663663
owners.metadata = dict(
664664
label='owners',
665665
description='A list of party objects. Note: this model schema will change soon.')
666666

667-
packagers = ListType(ModelType(Party))
667+
packagers = BaseListType(ModelType(Party))
668668
packagers.metadata = dict(
669669
label='owners',
670670
description='A list of party objects. Note: this model schema will change soon.')
671671

672-
distributors = ListType(ModelType(Party))
672+
distributors = BaseListType(ModelType(Party))
673673
distributors.metadata = dict(
674674
label='distributors',
675675
description='A list of party objects. Note: this model schema will change soon.')
676676

677-
vendors = ListType(ModelType(Party))
677+
vendors = BaseListType(ModelType(Party))
678678
vendors.metadata = dict(
679679
label='vendors',
680680
description='A list of party objects. Note: this model schema will change soon.')
681681

682-
keywords = ListType(StringType())
682+
keywords = BaseListType(StringType())
683683
keywords.metadata = dict(
684684
label='keywords',
685685
description='A list of keywords or tags.')
@@ -691,14 +691,14 @@ class Package(BaseModel):
691691
description='URL to a reference documentation for keywords or '
692692
'tags (such as a Pypi or SF.net Trove map)')
693693

694-
metafile_locations = ListType(StringType())
694+
metafile_locations = BaseListType(StringType())
695695
metafile_locations.metadata = dict(
696696
label='metafile locations',
697697
description='A list of metafile locations for this package '
698698
'(such as a package.json, a setup.py). '
699699
'Relative to the package root directory or archive root')
700700

701-
metafile_urls = ListType(URIType())
701+
metafile_urls = BaseListType(URIType())
702702
metafile_urls.metadata = dict(
703703
label='metafile URLs',
704704
description='A list of metafile remote URLs for this package '
@@ -714,7 +714,7 @@ class Package(BaseModel):
714714
label='Notes',
715715
description='Notes, free text about this package')
716716

717-
download_urls = ListType(URIType())
717+
download_urls = BaseListType(URIType())
718718
download_urls.metadata = dict(
719719
label='Download URLs',
720720
description='A list of direct download URLs, possibly in SPDX VCS url form. '
@@ -732,7 +732,7 @@ class Package(BaseModel):
732732
label='bug tracking URL',
733733
description='URL to the issue or bug tracker for this package')
734734

735-
support_contacts = ListType(StringType())
735+
support_contacts = BaseListType(StringType())
736736
support_contacts.metadata = dict(
737737
label='Support contacts',
738738
description='A list of strings (such as email, urls, etc) for support contacts')
@@ -765,18 +765,18 @@ class Package(BaseModel):
765765
label='Top level Copyright',
766766
description='a top level copyright often asserted in package metadata')
767767

768-
copyrights = ListType(StringType())
768+
copyrights = BaseListType(StringType())
769769
copyrights.metadata = dict(
770770
label='Copyrights',
771771
description='A list of effective copyrights as detected and eventually summarized')
772772

773-
asserted_licenses = ListType(ModelType(AssertedLicense))
773+
asserted_licenses = BaseListType(ModelType(AssertedLicense))
774774
asserted_licenses.metadata = dict(
775775
label='asserted licenses',
776776
description='A list of asserted license objects representing '
777777
'the asserted licensing information for this package')
778778

779-
legal_file_locations = ListType(StringType())
779+
legal_file_locations = BaseListType(StringType())
780780
legal_file_locations.metadata = dict(
781781
label='legal file locations',
782782
description='A list of paths to legal files '
@@ -788,12 +788,12 @@ class Package(BaseModel):
788788
label='license expression',
789789
description='license expression: either resolved or detected license expression')
790790

791-
license_texts = ListType(StringType())
791+
license_texts = BaseListType(StringType())
792792
license_texts.metadata = dict(
793793
label='license texts',
794794
description='A list of license texts for this package.')
795795

796-
notice_texts = ListType(StringType())
796+
notice_texts = BaseListType(StringType())
797797
license_texts.metadata = dict(
798798
label='notice texts',
799799
description='A list of notice texts for this package.')
@@ -809,7 +809,7 @@ class Package(BaseModel):
809809
'The possible values for dependency grousp are:' + ', '.join(DEPENDENCY_GROUPS)
810810
)
811811

812-
related_packages = ListType(ModelType(RelatedPackage))
812+
related_packages = BaseListType(ModelType(RelatedPackage))
813813
related_packages.metadata = dict(
814814
label='related packages',
815815
description='A list of related_package objects for this package. '

src/packagedcode/npm.py

Lines changed: 81 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (c) 2015 nexB Inc. and others. All rights reserved.
2+
# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
33
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
44
# The ScanCode software is licensed under the Apache License version 2.0.
55
# Data generated with ScanCode require an acknowledgment.
@@ -86,6 +86,20 @@ def parse(location):
8686
if not is_package_json(location):
8787
return
8888

89+
with codecs.open(location, encoding='utf-8') as loc:
90+
package_data = json.load(loc, object_pairs_hook=OrderedDict)
91+
92+
# a package.json is at the root of an NPM package
93+
base_dir = fileutils.parent_directory(location)
94+
metafile_name = fileutils.file_base_name(location)
95+
return build_package(package_data, base_dir, metafile_name)
96+
97+
98+
def build_package(package_data, base_dir=None, metafile_name='package.json'):
99+
"""
100+
Return a Package object from a package_data mapping (from a
101+
package.json or similar) or None.
102+
"""
89103
# mapping of top level package.json items to the Package object field name
90104
plain_fields = OrderedDict([
91105
('name', 'name'),
@@ -101,33 +115,35 @@ def parse(location):
101115
('bugs', bugs_mapper),
102116
('contributors', contributors_mapper),
103117
('maintainers', maintainers_mapper),
118+
# current form
104119
('license', licensing_mapper),
120+
# old, deprecated form
105121
('licenses', licensing_mapper),
106122
('dependencies', dependencies_mapper),
107123
('devDependencies', dev_dependencies_mapper),
108124
('peerDependencies', peer_dependencies_mapper),
109125
('optionalDependencies', optional_dependencies_mapper),
110-
('url', url_mapper),
126+
# legacy, ignored
127+
# ('url', url_mapper),
111128
('dist', dist_mapper),
112129
('repository', repository_mapper),
113130
])
114131

115-
with codecs.open(location, encoding='utf-8') as loc:
116-
data = json.load(loc, object_pairs_hook=OrderedDict)
117132

118-
if not data.get('name') or not data.get('version'):
133+
if not package_data.get('name') or not package_data.get('version'):
119134
# a package.json without name and version is not a usable NPM package
120135
return
121136

122137
package = NpmPackage()
123138
# a package.json is at the root of an NPM package
124-
base_dir = fileutils.parent_directory(location)
125139
package.location = base_dir
126140
# for now we only recognize a package.json, not a node_modules directory yet
127-
package.metafile_locations = [location]
128-
package.version = data.get('version')
141+
if metafile_name:
142+
package.metafile_locations = [metafile_name]
143+
144+
package.version = package_data.get('version') or None
129145
for source, target in plain_fields.items():
130-
value = data.get(source)
146+
value = package_data.get(source) or None
131147
if value:
132148
if isinstance(value, basestring):
133149
value = value.strip()
@@ -136,14 +152,21 @@ def parse(location):
136152

137153
for source, func in field_mappers.items():
138154
logger.debug('parse: %(source)r, %(func)r' % locals())
139-
value = data.get(source)
155+
value = package_data.get(source) or None
140156
if value:
141157
if isinstance(value, basestring):
142158
value = value.strip()
143159
if value:
144160
func(value, package)
145-
# this should be a mapper function but requires two args
146-
package.download_urls.append(public_download_url(package.name, package.version))
161+
162+
# this should be a mapper function but requires two args.
163+
# Note: we only add a synthetic download URL if there is none from
164+
# the dist mapping.
165+
if not package.download_urls:
166+
tarball = public_download_url(package.name, package.version)
167+
if tarball:
168+
package.download_urls.append(tarball)
169+
147170
return package
148171

149172

@@ -152,7 +175,7 @@ def licensing_mapper(licenses, package):
152175
Update package licensing and return package.
153176
Licensing data structure has evolved over time and is a tad messy.
154177
https://docs.npmjs.com/files/package.json#license
155-
licenses is either:
178+
license(s) is either:
156179
- a string with:
157180
- an SPDX id or expression { "license" : "(ISC OR GPL-3.0)" }
158181
- some license name or id
@@ -163,9 +186,13 @@ def licensing_mapper(licenses, package):
163186
return package
164187

165188
if isinstance(licenses, basestring):
189+
# current form
190+
# TODO: handle "SEE LICENSE IN <filename>"
191+
# TODO: parse expression with license_expression library
166192
package.asserted_licenses.append(models.AssertedLicense(license=licenses))
167193

168194
elif isinstance(licenses, dict):
195+
# old, deprecated form
169196
"""
170197
"license": {
171198
"type": "MIT",
@@ -176,6 +203,7 @@ def licensing_mapper(licenses, package):
176203
url=licenses.get('url')))
177204

178205
elif isinstance(licenses, list):
206+
# old, deprecated form
179207
"""
180208
"licenses": [{"type": "Apache License, Version 2.0",
181209
"url": "http://www.apache.org/licenses/LICENSE-2.0" } ]
@@ -295,19 +323,28 @@ def repository_mapper(repo, package):
295323
if isinstance(repo, basestring):
296324
package.vcs_repository = parse_repo_url(repo)
297325
elif isinstance(repo, dict):
298-
package.vcs_tool = repo.get('type') or 'git'
299-
package.vcs_repository = parse_repo_url(repo.get('url'))
326+
repurl = parse_repo_url(repo.get('url'))
327+
if repurl:
328+
package.vcs_tool = repo.get('type') or 'git'
329+
package.vcs_repository = repurl
300330
return package
301331

302332

303333
def url_mapper(url, package):
304334
"""
305-
In a package.json, the "url" field is a redirection to a package download
306-
URL published somewhere else than on the public npm registry.
307-
We map it to a download url.
335+
In a package.json, the "url" field is a legacy field that contained
336+
various URLs either as a string or as a mapping of type->url
308337
"""
309-
if url:
310-
package.download_urls.append(url)
338+
if not url:
339+
return package
340+
341+
if isinstance(url, basestring):
342+
# TODO: map to a miscellaneous urls dict
343+
pass
344+
elif isinstance(url, dict):
345+
# typical key is "web"
346+
# TODO: map to a miscellaneous urls dict
347+
pass
311348
return package
312349

313350

@@ -395,6 +432,11 @@ def parse_person(person):
395432
Both forms are equivalent.
396433
"""
397434
# TODO: detect if this is a person name or a company name
435+
436+
name = None
437+
email = None
438+
url = None
439+
398440
if isinstance(person, basestring):
399441
parsed = person_parser(person)
400442
if not parsed:
@@ -409,10 +451,28 @@ def parse_person(person):
409451
name = person.get('name')
410452
email = person.get('email')
411453
url = person.get('url')
454+
412455
else:
413456
raise Exception('Incorrect NPM package.json person: %(person)r' % locals())
414457

415-
return name and name.strip(), email and email.strip('<> '), url and url.strip('() ')
458+
if name:
459+
name = name.strip()
460+
if name.lower() == 'none':
461+
name = None
462+
name = name or None
463+
464+
if email:
465+
email = email.strip('<> ')
466+
if email.lower() == 'none':
467+
email = None
468+
email = email or None
469+
470+
if url:
471+
url = url.strip('() ')
472+
if url.lower() == 'none':
473+
url = None
474+
url = url or None
475+
return name, email, url
416476

417477

418478
def public_download_url(name, version, registry='https://registry.npmjs.org'):

0 commit comments

Comments
 (0)