Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
b12150b
Create minecode_pipeline module and move relevant code over #660
JonoYang Aug 25, 2025
e4fdae9
Move maven crawler code into new module #660
JonoYang Aug 26, 2025
8f1f9ac
Yield packages from index, collect, and write to file #660
JonoYang Aug 28, 2025
837d0d1
Push commits at regular intervals
JonoYang Aug 28, 2025
b92a8bd
Use new branch of scio #660
JonoYang Sep 1, 2025
6b3372d
Create write_purls_to_repo #660
JonoYang Sep 1, 2025
4ff859b
Merge branch 'main' into 660-purl-next-maven
JonoYang Sep 3, 2025
44f4363
Merge branch 'main' into 660-purl-next-maven
JonoYang Sep 4, 2025
b0a3d6e
Clone repo per package #660
JonoYang Sep 4, 2025
d9c8b8d
Add methods to fetch maven index properties and increments #660
JonoYang Sep 5, 2025
ac7f6d8
Merge branch 'main' into 660-purl-next-maven
JonoYang Sep 6, 2025
1bfb2d8
Move files to minecode_pipelines #660
JonoYang Sep 6, 2025
dda2bca
Move tests over to minecode_pipelines #660
JonoYang Sep 6, 2025
74bd21c
Create purls.yml path without leading segment #660
JonoYang Sep 8, 2025
810956f
Use last incremental from index properties #660
JonoYang Sep 11, 2025
0d441f9
Move code to proper place #660
JonoYang Sep 13, 2025
e5ed41a
Merge branch 'main' into 660-purl-next-maven
JonoYang Sep 15, 2025
eb4a4fb
Use new common functions for writing and committing purls #660
JonoYang Sep 16, 2025
289c336
Handle downloading and deleting index archive using constructor/destr…
JonoYang Sep 18, 2025
88956bf
Split repo deletion into separate pipeline step #660
JonoYang Sep 18, 2025
9449ee2
Fix logic in destructor #660
JonoYang Sep 19, 2025
a35a351
Merge branch 'main' into 660-purl-next-maven
JonoYang Sep 19, 2025
81b1243
Declare mine_maven pipeline in pyproject #660
JonoYang Sep 19, 2025
a225502
Remove repo clone step #660
JonoYang Sep 19, 2025
d126ea0
Update code style #660
JonoYang Sep 19, 2025
113bce5
Revert change to scio dep version #660
JonoYang Sep 19, 2025
78045ef
Remove unnecessary logger #660
JonoYang Sep 19, 2025
644fbc0
Clean up all downloads in destructor #660
JonoYang Sep 20, 2025
77295a3
Update docstrings #660
JonoYang Sep 20, 2025
bc240f2
Simplify download tracking #660
JonoYang Sep 23, 2025
fabe276
Update MinePypi pipeline name
JonoYang Sep 23, 2025
0b36220
Update pipeline #660
JonoYang Sep 23, 2025
16adc5c
Update expected test result
JonoYang Sep 23, 2025
ac3032a
Update package batch size #660
JonoYang Sep 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions minecode_pipeline/pipelines/mine_maven.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.


from minecode_pipeline.pipes import maven
from scanpipe.pipelines.publish_to_federatedcode import PublishToFederatedCode


class MineMaven(PublishToFederatedCode):
    """
    Create DiscoveredPackages for packages found on maven:
    - input: url of maven repo
    - process index
    - collect purls, grouped by package
    - write to files
    - publish to federatedcode
    - loop
    """

    @classmethod
    def steps(cls):
        """Return the tuple of step methods that make up this pipeline."""
        return (
            cls.check_federatedcode_eligibility,
            cls.collect_packages_from_maven,
        )

    def collect_packages_from_maven(self):
        """
        Run the maven collection step by delegating to
        ``maven.collect_packages_from_maven`` with this pipeline's project and
        its ``log`` callable.
        """
        maven.collect_packages_from_maven(self.project, self.log)
30 changes: 28 additions & 2 deletions minecode_pipeline/pipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@

from pathlib import Path

from aboutcode.hashid import PURLS_FILENAME
from aboutcode import hashid
from scanpipe.pipes import federatedcode


def write_packageurls_to_file(repo, base_dir, packageurls):
purl_file_rel_path = os.path.join(base_dir, PURLS_FILENAME)
purl_file_rel_path = os.path.join(base_dir, hashid.PURLS_FILENAME)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JonoYang @AyanSinhaMahapatra I think we should avoid computing the package URL path and just use this function. ppath = hashid.get_package_purls_yml_file_path(package)

purl_file_full_path = Path(repo.working_dir) / purl_file_rel_path
write_data_to_file(path=purl_file_full_path, data=packageurls)
return purl_file_rel_path
Expand All @@ -26,3 +27,28 @@ def write_data_to_file(path, data):
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, encoding="utf-8", mode="w") as f:
f.write(saneyaml.dump(data))


def write_purls_to_repo(repo, package, packages, push_commit=False):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@AyanSinhaMahapatra I've moved your write_packageurls_to_file and write_purls_to_repo functions to this file and I see that we have overlapping functionality here, especially with the code in https://github.com/aboutcode-org/purldb/blob/main/minecode_pipeline/pipes/pypi.py#L135

I think we should use a common function to do these actions

# save purls to yaml
ppath = hashid.get_package_purls_yml_file_path(package)
purls = [p.purl for p in packages]
federatedcode.write_data_as_yaml(
base_path=repo.working_dir,
file_path=ppath,
data=purls,
)

change_type = "Add" if ppath in repo.untracked_files else "Update"
commit_message = f"""\
{change_type} list of available {package} versions
"""
federatedcode.commit_changes(
repo=repo,
files_to_commit=[ppath],
commit_message=commit_message,
)

# see if we should push
if push_commit:
federatedcode.push_changes(repo=repo)
51 changes: 51 additions & 0 deletions minecode_pipeline/pipes/java_stream.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# The MIT License (MIT)
#
# Copyright (c) 2014 Gustav Arngården
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


"""Reading from Java DataInputStream format."""

import struct


class DataInputStream:
    """
    Read big-endian primitive values from a binary stream laid out in the
    Java DataInputStream format.

    ``stream`` is any object with a ``read(n)`` method returning a bytes-like
    object. Each ``read_*`` method consumes exactly the number of bytes that
    its value occupies.
    """

    def __init__(self, stream):
        self.stream = stream

    def read(self, n=1):
        """
        Return exactly ``n`` bytes read from the underlying stream.

        Raise EOFError if the stream yields fewer than ``n`` bytes.
        """
        data = self.stream.read(n)
        if len(data) != n:
            # this is a problem but in most cases we have reached EOF
            raise EOFError(f"expected {n} byte(s) but got {len(data)}")
        return data

    def read_byte(self):
        """Return the next byte as a signed 8-bit integer."""
        return struct.unpack("b", self.read(1))[0]

    def read_long(self):
        """Return the next 8 bytes as a signed big-endian 64-bit integer."""
        return struct.unpack(">q", self.read(8))[0]

    def read_utf(self):
        """
        Return the payload of a length-prefixed string as raw bytes.

        The prefix is an unsigned big-endian 16-bit byte count. The payload is
        returned undecoded; decoding it is left to the caller.
        """
        utf_length = struct.unpack(">H", self.read(2))[0]
        return self.read(utf_length)

    def read_int(self):
        """Return the next 4 bytes as a signed big-endian 32-bit integer."""
        return struct.unpack(">i", self.read(4))[0]
14 changes: 14 additions & 0 deletions minecode_pipeline/pipes/java_stream.py.ABOUT
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
about_resource: java_stream.py
name: java_stream.py
version: 7d118ceef9746981e6bc198861125ca2bb6f920f
homepage_url: https://github.com/arngarden/python_java_datastream
owner: Gustav Arngården
copyright: Copyright (c) 2014 Gustav Arngården
download_url: https://raw.githubusercontent.com/arngarden/python_java_datastream/7d118ceef9746981e6bc198861125ca2bb6f920f/data_input_stream.py
license_expression: mit
licenses:
  - key: mit
    file: java_stream.py.LICENSE

vcs_tool: git
vcs_repo: https://github.com/arngarden/python_java_datastream
21 changes: 21 additions & 0 deletions minecode_pipeline/pipes/java_stream.py.LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2014 Gustav Arngården

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Loading
Loading