Skip to content

Commit 114eb75

Browse files
committed
Update the matching regex for parse_maven_filename and add a test #1763
- Update format. Signed-off-by: Chin Yeung Li <[email protected]>
1 parent 9812129 commit 114eb75

File tree

3 files changed

+83
-31
lines changed

3 files changed

+83
-31
lines changed

scanpipe/pipelines/scan_maven_package.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,8 @@
2323
import json
2424

2525
from scanpipe.pipelines.scan_single_package import ScanSinglePackage
26-
27-
from scanpipe.pipes.resolve import get_pom_url_list
2826
from scanpipe.pipes.resolve import download_and_scan_pom_file
27+
from scanpipe.pipes.resolve import get_pom_url_list
2928

3029

3130
class ScanMavenPackage(ScanSinglePackage):
@@ -53,20 +52,22 @@ def steps(cls):
5352

5453
def fetch_and_scan_remote_pom(self):
    """
    Fetch the pom.xml file from maven.org if not present in the codebase.

    Load the JSON scan results stored at ``self.scan_output_location`` and
    return without any change when one of the scanned ``files`` has
    "pom.xml" in its path. Otherwise, build the candidate pom URLs from the
    first project input source and the scanned packages, download and scan
    them, and write the scan results back to the same location with the
    scanned pom packages appended to ``packages`` and ``dependencies``
    replaced by the dependencies scanned from the pom files.
    """
    with open(self.scan_output_location) as scan_output_file:
        data = json.load(scan_output_file)

    # Return and do nothing if data has pom.xml.
    # A scan output without a "files" section is treated as having no pom.xml.
    resources = data.get("files", [])
    if any("pom.xml" in resource["path"] for resource in resources):
        return

    packages = data.get("packages", [])
    pom_url_list = get_pom_url_list(self.project.input_sources[0], packages)
    scanned_pom_packages, scanned_dependencies = download_and_scan_pom_file(
        pom_url_list
    )

    # Replace/Update the package and dependencies section
    data["packages"] = packages + scanned_pom_packages
    data["dependencies"] = scanned_dependencies
    with open(self.scan_output_location, "w") as scan_output_file:
        json.dump(data, scan_output_file, indent=2)

scanpipe/pipes/resolve.py

Lines changed: 46 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
import json
2424
import logging
2525
import re
26-
import requests
2726
import sys
2827
import uuid
2928
from pathlib import Path
@@ -32,6 +31,7 @@
3231
from django.core.exceptions import ObjectDoesNotExist
3332

3433
import python_inspector.api as python_inspector
34+
import requests
3535
import saneyaml
3636
from attributecode.model import About
3737
from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS
@@ -41,14 +41,13 @@
4141
from scanpipe.models import DiscoveredDependency
4242
from scanpipe.models import DiscoveredPackage
4343
from scanpipe.pipes import cyclonedx
44+
from scanpipe.pipes import fetch
4445
from scanpipe.pipes import flag
46+
from scanpipe.pipes import scancode
4547
from scanpipe.pipes import spdx
4648
from scanpipe.pipes import update_or_create_dependency
4749
from scanpipe.pipes import update_or_create_package
4850

49-
from scanpipe.pipes import fetch
50-
from scanpipe.pipes import scancode
51-
5251
"""
5352
Resolve packages from manifest, lockfile, and SBOM.
5453
"""
@@ -531,26 +530,42 @@ def extract_headers(input_location, extract_fields):
531530
# Common Maven classifiers, ordered longest first so that a compound
# classifier such as "test-sources" is stripped as a whole instead of only
# its "sources" tail.
_MAVEN_COMMON_CLASSIFIERS = tuple(
    sorted(
        {
            "sources",
            "javadoc",
            "tests",
            "test",
            "test-sources",
            "src",
            "bin",
            "docs",
            "javadocs",
            "client",
            "server",
            "linux",
            "windows",
            "macos",
            "linux-x86_64",
            "windows-x86_64",
        },
        key=lambda classifier: (-len(classifier), classifier),
    )
)

# "<artifactId>-<version>" where the version starts at the first dash that is
# directly followed by a digit (the artifactId part is matched lazily).
_MAVEN_BASENAME_RE = re.compile(r"^(.*?)-(\d[\w.\-]*)$")


def parse_maven_filename(filename):
    """
    Parse a Maven's jar filename to extract artifactId and version.

    The last dot-separated extension is dropped, then at most one trailing
    ``-<classifier>`` from the common classifiers is removed, and the
    remainder is split into artifactId and version.

    Return an ``(artifact_id, version)`` tuple, or ``(None, None)`` when the
    filename does not follow the ``<artifactId>-<version>`` layout.
    """
    # Remove the .jar extension
    base = filename.rsplit(".", 1)[0]

    # Remove known classifier if present
    for classifier in _MAVEN_COMMON_CLASSIFIERS:
        suffix = f"-{classifier}"
        if base.endswith(suffix):
            base = base[: -len(suffix)]
            break

    # Match artifactId and version
    match = _MAVEN_BASENAME_RE.match(base)
    if not match:
        return None, None

    artifact_id, version = match.groups()
    return artifact_id, version
@@ -564,15 +579,21 @@ def get_pom_url_list(input_source, packages):
564579
package_ns = package.get("namespace", "")
565580
package_name = package.get("name", "")
566581
package_version = package.get("version", "")
567-
pom_url = f"https://repo1.maven.org/maven2/{package_ns.replace('.', '/')}/{package_name}/{package_version}/{package_name}-{package_version}.pom".lower()
582+
pom_url = (
583+
f"https://repo1.maven.org/maven2/{package_ns.replace('.', '/')}/"
584+
f"{package_name}/{package_version}/"
585+
f"{package_name}-{package_version}.pom".lower()
586+
)
568587
pom_url_list.append(pom_url)
569588
else:
570589
# Check what's the input source
571590
input_source_url = input_source.get("download_url", "")
572591

573592
if input_source_url and "maven.org/" in input_source_url:
574-
base_url = input_source_url.rsplit('/', 1)[0]
575-
pom_url = base_url + "/" + "-".join(base_url.rstrip("/").split("/")[-2:]) + ".pom"
593+
base_url = input_source_url.rsplit("/", 1)[0]
594+
pom_url = (
595+
base_url + "/" + "-".join(base_url.rstrip("/").split("/")[-2:]) + ".pom"
596+
)
576597
pom_url_list.append(pom_url)
577598
else:
578599
# Construct a pom_url from filename
@@ -596,12 +617,12 @@ def construct_pom_url_from_filename(artifact_id, version):
596617
pom_url_list = []
597618
group_ids = []
598619
try:
599-
response = requests.get(url)
620+
response = requests.get(url, timeout=5)
600621
response.raise_for_status()
601622
data = response.json()
602623
# Extract all 'g' fields from the docs array that represent
603624
# groupIds
604-
group_ids = [doc['g'] for doc in data['response']['docs']]
625+
group_ids = [doc["g"] for doc in data["response"]["docs"]]
605626
except requests.RequestException as e:
606627
print(f"Error fetching data: {e}")
607628
return []
@@ -610,7 +631,10 @@ def construct_pom_url_from_filename(artifact_id, version):
610631
return []
611632

612633
for group_id in group_ids:
613-
pom_url = f"https://repo1.maven.org/maven2/{group_id.replace('.', '/')}/{artifact_id}/{version}/{artifact_id}-{version}.pom".lower()
634+
pom_url = (
635+
f"https://repo1.maven.org/maven2/{group_id.replace('.', '/')}/"
636+
f"{artifact_id}/{version}/{artifact_id}-{version}.pom".lower()
637+
)
614638
if is_maven_pom_url(pom_url):
615639
pom_url_list.append(pom_url)
616640
if len(pom_url_list) > 1:
@@ -632,12 +656,12 @@ def is_maven_pom_url(url):
632656
if response.status_code != 200:
633657
return False
634658
# Check content-type
635-
content_type = response.headers.get('content-type', '').lower()
636-
is_xml = 'xml' in content_type or 'text/xml' in content_type
659+
content_type = response.headers.get("content-type", "").lower()
660+
is_xml = "xml" in content_type or "text/xml" in content_type
637661

638662
# Check content
639663
content = response.text.strip()
640-
is_pom = content.startswith('<?xml') and '<project' in content
664+
is_pom = content.startswith("<?xml") and "<project" in content
641665

642666
if is_xml and is_pom:
643667
return True
@@ -665,19 +689,19 @@ def download_and_scan_pom_file(pom_url_list):
665689
},
666690
)
667691

668-
with open(scanned_pom_output_path, 'r') as scanned_pom_file:
692+
with open(scanned_pom_output_path) as scanned_pom_file:
669693
scanned_pom_data = json.load(scanned_pom_file)
670694
scanned_packages = scanned_pom_data.get("packages", [])
671695
scanned_dependencies = scanned_pom_data.get("dependencies", [])
672696
if scanned_packages:
673697
for scanned_package in scanned_packages:
674698
# Replace the 'datafile_path' with the pom_url
675-
scanned_package['datafile_paths'] = [pom_url]
699+
scanned_package["datafile_paths"] = [pom_url]
676700
scanned_pom_packages.append(scanned_package)
677701
if scanned_dependencies:
678702
for scanned_dep in scanned_dependencies:
679703
# Replace the 'datafile_path' with empty string
680704
# See https://github.com/aboutcode-org/scancode.io/issues/1763#issuecomment-3525165830
681-
scanned_dep['datafile_path'] = ""
705+
scanned_dep["datafile_path"] = ""
682706
scanned_pom_deps.append(scanned_dep)
683707
return scanned_pom_packages, scanned_pom_deps

scanpipe/tests/pipes/test_resolve.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,3 +373,30 @@ def test_scanpipe_resolve_get_manifest_headers(self):
373373
]
374374
headers = resolve.get_manifest_headers(resource)
375375
self.assertEqual(expected, list(headers.keys()))
376+
377+
def test_scanpipe_resolve_parse_maven_filename(self):
    """
    Check the (artifact_id, version) returned by ``parse_maven_filename``.

    Each filename runs in its own ``subTest`` so that a failure reports the
    offending filename and does not prevent the remaining cases from running.
    """
    test_cases = [
        # Version containing a dash, followed by a classifier
        (
            "wisp-logging-2025.11.11.195957-97a44b0-sources.jar",
            ("wisp-logging", "2025.11.11.195957-97a44b0"),
        ),
        # Version qualifier followed by a classifier
        ("guava-33.5.0-jre-javadoc.jar", ("guava", "33.5.0-jre")),
        # Plain artifactId-version
        ("junit-4.13.2.jar", ("junit", "4.13.2")),
        # Version qualifier without classifier
        ("guava-33.5.0-jre.jar", ("guava", "33.5.0-jre")),
        # No "-<digit>" version part: nothing can be parsed
        ("not-a-maven-file.jar", (None, None)),
    ]

    for filename, expected in test_cases:
        with self.subTest(filename=filename):
            self.assertEqual(expected, resolve.parse_maven_filename(filename))

0 commit comments

Comments
 (0)