Skip to content

Commit 9f8d57d

Browse files
authored
feat: 1343 populate license tables (#1445)
populate license, rules and license_rules tables with spdx data
1 parent 3273a71 commit 9f8d57d

File tree

6 files changed

+358
-0
lines changed

6 files changed

+358
-0
lines changed

functions-python/tasks_executor/README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,3 +78,14 @@ To populate license rules:
7878
}
7979
}
8080
```
81+
82+
To populate licenses:
83+
84+
```json
85+
{
86+
"task": "populate_licenses",
87+
"payload": {
88+
"dry_run": true
89+
}
90+
}
91+
```

functions-python/tasks_executor/src/main.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,10 @@
4444
populate_license_rules_handler,
4545
)
4646

47+
from tasks.licenses.populate_licenses import (
48+
populate_licenses_handler,
49+
)
50+
4751
init_logger()
4852
LIST_COMMAND: Final[str] = "list"
4953
tasks = {
@@ -90,6 +94,10 @@
9094
"description": "Populates license rules in the database from a predefined JSON source.",
9195
"handler": populate_license_rules_handler,
9296
},
97+
"populate_licenses": {
98+
"description": "Populates licenses and license-rules in the database from a predefined JSON source.",
99+
"handler": populate_licenses_handler,
100+
},
93101
}
94102

95103

functions-python/tasks_executor/src/tasks/licenses/populate_license_rules.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,20 @@
1+
# This script defines a task to populate the 'rules' table in the database from a canonical
2+
# JSON file. It is designed to be triggered as a background task.
3+
#
4+
# The script performs the following steps:
5+
# 1. Fetches the 'rules.json' file from the MobilityData/licenses-aas GitHub repository.
6+
# 2. The JSON file categorizes rules into 'permissions', 'conditions', and 'limitations'.
7+
# 3. It processes this structure by:
8+
# a. Iterating through each category.
9+
# b. Mapping the plural category name (e.g., 'permissions') to a singular 'type'
10+
# (e.g., 'permission') to satisfy a database check constraint on the Rule model.
11+
# c. Combining all rules from all categories into a single list.
12+
# 4. For each rule in the combined list, it performs an "upsert" operation (insert or update)
13+
# into the 'rules' table using SQLAlchemy's `merge` method, with the rule's 'name'
14+
# acting as the primary key.
15+
# 5. Supports a 'dry_run' mode, which simulates the process and logs intended actions
16+
# without committing any changes to the database.
17+
# 6. Includes error handling for network issues and database transactions.
118
import logging
219

320
import requests
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
# This script defines a task to populate the 'licenses' table and the 'license_rules'
2+
# association table in the database. It is designed to be triggered as a background task.
3+
#
4+
# The script performs the following steps:
5+
# 1. Fetches a list of license definition files from the MobilityData/licenses-aas GitHub repository
6+
# using the GitHub API.
7+
# 2. Downloads the JSON content of each individual license file.
8+
# 3. For each license:
9+
# a. Parses the JSON, extracting license details primarily from the nested 'spdx' object.
10+
# This includes the license ID, name, URL, and full license text.
11+
# b. Creates a new 'License' record or updates an existing one based on the license ID.
12+
# c. Extracts the associated rule names from the 'permissions', 'conditions', and 'limitations'
13+
# lists at the top level of the JSON.
14+
# d. Queries the 'rules' table to find the corresponding Rule objects.
15+
# e. Associates the found rules with the license. The SQLAlchemy ORM automatically
16+
# manages the creation of records in the 'license_rules' join table to establish
17+
# the many-to-many relationship.
18+
# 4. Supports a 'dry_run' mode, which still downloads the license files but only logs
19+
# how many licenses would be processed, without making any changes to the database.
20+
# 5. Includes error handling for network issues and database transactions.
21+
import logging
22+
from datetime import datetime, timezone
23+
24+
import requests
25+
from shared.database.database import with_db_session
26+
from shared.database_gen.sqlacodegen_models import License, Rule
27+
28+
# GitHub "contents" API endpoint for the data/licenses directory of the
# MobilityData/licenses-aas repository. The task reads the file listing from
# this URL and then follows each entry's "download_url".
LICENSES_API_URL = (
    "https://api.github.com/repos/MobilityData/licenses-aas/contents/data/licenses"
)
31+
32+
33+
def populate_licenses_handler(payload):
    """
    Entry point of the 'populate_licenses' task.

    Extracts the dry-run flag from the payload and delegates the actual work
    to populate_licenses_task.

    Args:
        payload (dict): Incoming payload data; may carry a 'dry_run' flag.

    Returns:
        Whatever populate_licenses_task returns.
    """
    return populate_licenses_task(get_parameters(payload))
42+
43+
44+
@with_db_session
def populate_licenses_task(dry_run, db_session):
    """
    Populates licenses and their associated rules in the database.

    The license definitions are always downloaded first. In dry-run mode the
    task then only logs how many licenses would be processed; otherwise every
    definition is upserted through the given session.

    Args:
        dry_run (bool): If True, simulates the operation without making changes.
        db_session: Database session for executing queries.

    Raises:
        requests.exceptions.RequestException: If the license listing or any
            individual license file cannot be downloaded.
    """
    logging.info("Starting populate_licenses_task with dry_run=%s", dry_run)

    # Only the download can raise RequestException, so keep the try body minimal.
    try:
        licenses_data = _fetch_license_definitions()
    except requests.exceptions.RequestException as e:
        logging.error("Failed to download licenses JSON file: %s", e)
        raise

    logging.info("Loaded %d licenses.", len(licenses_data))

    if dry_run:
        logging.info("Dry run: would process %d licenses.", len(licenses_data))
        return

    for license_data in licenses_data:
        _upsert_license(db_session, license_data)

    logging.info(
        "Successfully upserted licenses into the database.",
    )


def _fetch_license_definitions():
    """Download and return the parsed JSON of every '.json' file in the license listing."""
    logging.info("Downloading license list from %s", LICENSES_API_URL)
    response = requests.get(LICENSES_API_URL, timeout=10)
    response.raise_for_status()

    licenses_data = []
    for file_info in response.json():
        file_name = file_info.get("name") or ""
        if file_info.get("type") != "file" or not file_name.endswith(".json"):
            continue
        download_url = file_info.get("download_url")
        if not download_url:
            # Without a URL there is nothing to fetch; skip instead of passing
            # None to requests.get.
            logging.warning("Skipping %s: no download_url in listing.", file_name)
            continue
        license_response = requests.get(download_url, timeout=10)
        license_response.raise_for_status()
        licenses_data.append(license_response.json())
    return licenses_data


def _first_cross_ref_url(cross_refs):
    """Return the 'url' of the first SPDX crossRef entry, or None when unavailable."""
    if (
        isinstance(cross_refs, list)
        and cross_refs
        and isinstance(cross_refs[0], dict)
    ):
        return cross_refs[0].get("url")
    return None


def _upsert_license(db_session, license_data):
    """Create or update one License (and its rule associations) from a license definition."""
    spdx_data = license_data.get("spdx")
    if not spdx_data:
        logging.warning("Skipping license record without 'spdx' data")
        return

    license_id = spdx_data.get("licenseId")
    if not license_id:
        logging.warning("Skipping record without licenseId.")
        return

    logging.info("Processing license %s", license_id)
    now = datetime.now(timezone.utc)

    license_object = db_session.get(License, license_id)
    if license_object is None:
        license_object = License(id=license_id)
        license_object.created_at = now
        # Every record reaching this point carries an 'spdx' object.
        license_object.is_spdx = True
        license_object.type = "standard"
        # Add the new license object to the session immediately.
        # This makes it "pending" and allows SQLAlchemy to track relationship changes.
        db_session.add(license_object)

    license_object.name = spdx_data.get("name")
    license_object.updated_at = now
    license_object.url = _first_cross_ref_url(spdx_data.get("crossRef"))
    license_object.content_txt = spdx_data.get("licenseText")
    license_object.content_html = spdx_data.get("licenseTextHtml")

    # Collect rule names from all categories. `or []` tolerates an explicit JSON
    # null, and dict.fromkeys drops duplicates (keeping order) so the
    # found/expected comparison below is not skewed by repeated names.
    collected_names = []
    for rule_type in ("permissions", "conditions", "limitations"):
        collected_names.extend(license_data.get(rule_type) or [])
    rule_names = list(dict.fromkeys(collected_names))

    rules = []
    if rule_names:
        rules = db_session.query(Rule).filter(Rule.name.in_(rule_names)).all()
        found_names = {rule.name for rule in rules}
        missing_names = [name for name in rule_names if name not in found_names]
        if missing_names:
            logging.warning(
                "License '%s': Found %d of %d rules in the database. Missing: %s",
                license_id,
                len(rules),
                len(rule_names),
                missing_names,
            )

    # Replace the whole collection so rules removed upstream are dropped on update.
    license_object.rules = list(rules)

    # Merge the license object into the session. This handles both creating new licenses
    # and updating existing ones (upsert), including their rule associations.
    db_session.merge(license_object)
144+
145+
146+
def get_parameters(payload):
    """
    Get parameters from the payload.

    A missing or None payload yields the default (False). String flags are
    interpreted textually so that values such as "false" or "0" are not treated
    as truthy merely because the string is non-empty; any other non-empty
    string still enables the dry run.

    Args:
        payload (dict | None): dictionary containing the payload data.
    Returns:
        bool: dry_run
    """
    dry_run = (payload or {}).get("dry_run", False)
    if isinstance(dry_run, str):
        return dry_run.strip().lower() not in {"", "false", "0", "no", "off"}
    return bool(dry_run)

functions-python/tasks_executor/tests/tasks/populate_license_rules/test_populate_license_rules.py renamed to functions-python/tasks_executor/tests/tasks/populate_licenses_and_rules/test_populate_license_rules.py

File renamed without changes.
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
import unittest
2+
from unittest.mock import MagicMock, patch
3+
4+
import requests
5+
6+
from shared.database_gen.sqlacodegen_models import Rule
7+
from tasks.licenses.populate_licenses import (
8+
LICENSES_API_URL,
9+
populate_licenses_task,
10+
)
11+
12+
# Mock data for GitHub API responses

# Directory listing served for LICENSES_API_URL: three ".json" entries plus a
# README.md entry whose name does not end in ".json".
MOCK_LICENSE_LIST = [
    {
        "name": "MIT.json",
        "type": "file",
        "download_url": "http://mockurl/MIT.json",
    },
    {
        "name": "BSD-3-Clause.json",
        "type": "file",
        "download_url": "http://mockurl/BSD-3-Clause.json",
    },
    {
        "name": "no-spdx.json",
        "type": "file",
        "download_url": "http://mockurl/no-spdx.json",
    },
    {
        "name": "README.md",
        "type": "file",
        "download_url": "http://mockurl/README.md",
    },
]

# License definition with an 'spdx' object and three rule names in total
# (two permissions, one condition, no limitations).
MOCK_LICENSE_MIT = {
    "spdx": {
        "licenseId": "MIT",
        "name": "MIT License",
        "crossRef": [{"url": "https://opensource.org/licenses/MIT"}],
        "licenseText": "MIT License text...",
        "licenseTextHtml": "<p>MIT License text...</p>",
    },
    "permissions": ["commercial-use", "distribution"],
    "conditions": ["include-copyright"],
    "limitations": [],
}

# License definition with an 'spdx' object and three rule names in total
# (one permission, no conditions, two limitations).
MOCK_LICENSE_BSD = {
    "spdx": {
        "licenseId": "BSD-3-Clause",
        "name": "BSD 3-Clause License",
        "crossRef": [{"url": "https://opensource.org/licenses/BSD-3-Clause"}],
        "licenseText": "BSD license text...",
        "licenseTextHtml": "<p>BSD license text...</p>",
    },
    "permissions": ["commercial-use"],
    "conditions": [],
    "limitations": ["liability", "warranty"],
}

# License definition that has no top-level 'spdx' key.
MOCK_LICENSE_NO_SPDX = {"licenseId": "NO-SPDX-ID", "name": "No SPDX License"}
63+
64+
65+
class TestPopulateLicenses(unittest.TestCase):
    """Tests for populate_licenses_task using mocked HTTP responses and a mocked DB session."""

    def _mock_requests_get(self, mock_get):
        """Helper to configure mock for requests.get."""
        mock_responses = {
            LICENSES_API_URL: MagicMock(json=lambda: MOCK_LICENSE_LIST),
            "http://mockurl/MIT.json": MagicMock(json=lambda: MOCK_LICENSE_MIT),
            "http://mockurl/BSD-3-Clause.json": MagicMock(
                json=lambda: MOCK_LICENSE_BSD
            ),
            "http://mockurl/no-spdx.json": MagicMock(json=lambda: MOCK_LICENSE_NO_SPDX),
        }

        def get_side_effect(url, timeout=None):
            if url in mock_responses:
                response = mock_responses[url]
                response.raise_for_status.return_value = None
                return response
            # Any URL that is not mocked (e.g. README.md) fails loudly.
            raise requests.exceptions.RequestException(f"URL not mocked: {url}")

        mock_get.side_effect = get_side_effect

    @patch("tasks.licenses.populate_licenses.requests.get")
    def test_populate_licenses_success(self, mock_get):
        """Test successful population of licenses."""
        # Arrange
        self._mock_requests_get(mock_get)
        mock_db_session = MagicMock()
        mock_db_session.get.return_value = None  # Simulate no existing licenses

        # Mock the rules query to return only the rules that are requested.
        all_mock_rules = {
            "commercial-use": Rule(name="commercial-use"),
            "distribution": Rule(name="distribution"),
            "include-copyright": Rule(name="include-copyright"),
            "liability": Rule(name="liability"),
            "warranty": Rule(name="warranty"),
        }

        def filter_side_effect(filter_condition):
            # This simulates the `Rule.name.in_(...)` filter by inspecting the
            # requested names from the filter condition's right-hand side.
            requested_names = filter_condition.right.value
            mock_query_result = [
                all_mock_rules[name]
                for name in requested_names
                if name in all_mock_rules
            ]
            mock_filter = MagicMock()
            mock_filter.all.return_value = mock_query_result
            return mock_filter

        mock_db_session.query.return_value.filter.side_effect = filter_side_effect

        # Act
        populate_licenses_task(dry_run=False, db_session=mock_db_session)

        # Assert
        # One listing request plus the three ".json" files; README.md is never fetched.
        self.assertEqual(mock_get.call_count, 4)
        self.assertEqual(mock_db_session.merge.call_count, 2)
        mock_db_session.rollback.assert_not_called()

        # Check that merge was called with correctly constructed License objects
        merged_licenses = {
            call.args[0].id: call.args[0]
            for call in mock_db_session.merge.call_args_list
        }
        # The record without 'spdx' data must have been skipped.
        self.assertEqual(set(merged_licenses), {"MIT", "BSD-3-Clause"})

        mit_license = merged_licenses["MIT"]
        self.assertEqual(mit_license.name, "MIT License")
        self.assertTrue(mit_license.is_spdx)
        self.assertEqual(mit_license.type, "standard")
        self.assertEqual(mit_license.url, "https://opensource.org/licenses/MIT")
        self.assertEqual(mit_license.content_txt, "MIT License text...")
        self.assertEqual(mit_license.content_html, "<p>MIT License text...</p>")
        self.assertEqual(len(mit_license.rules), 3)
        self.assertEqual(
            {rule.name for rule in mit_license.rules},
            {"commercial-use", "distribution", "include-copyright"},
        )

        bsd_license = merged_licenses["BSD-3-Clause"]
        self.assertEqual(bsd_license.name, "BSD 3-Clause License")
        self.assertTrue(bsd_license.is_spdx)
        self.assertEqual(
            bsd_license.url, "https://opensource.org/licenses/BSD-3-Clause"
        )
        self.assertEqual(
            {rule.name for rule in bsd_license.rules},
            {"commercial-use", "liability", "warranty"},
        )

    @patch("tasks.licenses.populate_licenses.requests.get")
    def test_populate_licenses_dry_run(self, mock_get):
        """Test that no database changes are made during a dry run."""
        # Arrange
        self._mock_requests_get(mock_get)
        mock_db_session = MagicMock()

        # Act
        populate_licenses_task(dry_run=True, db_session=mock_db_session)

        # Assert
        mock_db_session.get.assert_not_called()
        mock_db_session.add.assert_not_called()
        mock_db_session.query.assert_not_called()
        mock_db_session.merge.assert_not_called()
        mock_db_session.rollback.assert_not_called()

    @patch("tasks.licenses.populate_licenses.requests.get")
    def test_request_exception_handling(self, mock_get):
        """Test handling of a requests exception."""
        # Arrange
        mock_get.side_effect = requests.exceptions.RequestException("Network Error")
        mock_db_session = MagicMock()

        # Act & Assert
        with self.assertRaises(requests.exceptions.RequestException):
            populate_licenses_task(dry_run=False, db_session=mock_db_session)

        # The download fails before any database work starts, so the session
        # must be left untouched by the task itself.
        mock_db_session.get.assert_not_called()
        mock_db_session.merge.assert_not_called()
        mock_db_session.rollback.assert_not_called()
164+
165+
166+
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)