Skip to content

Commit 754e6f9

Browse files
authored
Merge pull request #33 from umago/okp
Add OKP module
2 parents fa72d05 + 8d29bf9 commit 754e6f9

File tree

3 files changed

+345
-0
lines changed

3 files changed

+345
-0
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ coverage.xml
5050
.hypothesis/
5151
.pytest_cache/
5252
cover/
53+
tests/test_results
5354

5455
# Translations
5556
*.mo

src/lightspeed_rag_content/okp.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
# Copyright 2025 Red Hat, Inc.
2+
# All Rights Reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License"); you may
5+
# not use this file except in compliance with the License. You may obtain
6+
# a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13+
# License for the specific language governing permissions and limitations
14+
# under the License.
15+
"""Utility functions for processing OKP files."""
16+
17+
import logging
18+
import re
19+
import tomllib
20+
from pathlib import Path
21+
from typing import Any, Generator
22+
23+
from lightspeed_rag_content.metadata_processor import MetadataProcessor
24+
25+
LOG = logging.getLogger(__name__)
26+
27+
28+
def is_file_related_to_projects(metadata: dict[str, Any], projects: list[str]) -> bool:
    """Check if the OKP file is related to specific projects.

    The product names are read from the `portal_product_names` list found
    under the `extra` table of the metadata. Matching is a case-insensitive
    substring test because product names can be more specific than the
    project name. For example, "Red Hat OpenStack" and "Red Hat OpenStack
    Platform" should both match simply "OpenStack".

    Blank (empty or whitespace-only) project names are ignored: such a string
    is a substring of practically every product name and would otherwise make
    every file match.

    Args:
        metadata (dict[str, Any]): The metadata dictionary of the OKP file.
        projects (list[str]): A list of project names to check against.

    Returns:
        bool: True if the file is related to any of the specified projects,
            False otherwise (including when the metadata lists no products
            or no usable project name was given).

    """
    product_names = [
        name.lower()
        for name in metadata.get("extra", {}).get("portal_product_names", [])
    ]
    # Drop blank entries before matching; "" in "anything" is always True.
    wanted = [project.lower() for project in projects if project.strip()]
    return any(project in name for project in wanted for name in product_names)
53+
54+
55+
def metadata_has_url_and_title(metadata: dict[str, Any]) -> bool:
    """Check if the metadata contains the URL and title.

    The URL is considered present when the `extra` table has a
    `reference_url` key. The title is considered present when the top-level
    `title` value is a string that is not empty or whitespace-only.

    Args:
        metadata (dict[str, Any]): The metadata dictionary to check.

    Returns:
        bool: True if both URL and title are present, False otherwise.
    """
    title = metadata.get("title", "")
    # A non-string title (e.g. a TOML integer) is treated as missing rather
    # than crashing on .strip(); bool() keeps the declared return type honest
    # instead of leaking the title string to callers.
    has_title = isinstance(title, str) and bool(title.strip())
    has_url = "reference_url" in metadata.get("extra", {})
    return has_url and has_title
68+
69+
70+
def yield_files_related_to_projects(
    directory: str, projects: list[str]
) -> Generator[Path, None, None]:
    """Yield the paths of OKP files in a directory that belong to given projects.

    Every `*.md` file directly inside `directory` has its metadata parsed.
    A file is yielded only when its metadata relates it to one of the
    requested projects and it carries both a URL and a title. Related files
    that lack a URL or title, and files whose metadata parsing raises
    `ValueError`, are skipped with a warning.

    The metadata is expected to have a structure similar to:
        {
            "title": "Example Title",
            "extra": {
                "reference_url": "https://example.com",
                "portal_product_names": ["Project Foo", "Project Bar"]
            }
        }

    Args:
        directory (str): The directory to scan for OKP files.
        projects (list[str]): A list of project names to filter the files.

    Yields:
        Path: The path to each OKP file related to the specified projects.
    """
    okp_dir = Path(directory)
    for candidate in okp_dir.glob("*.md"):
        try:
            file_metadata = parse_metadata(str(candidate))

            # Unrelated files are skipped silently.
            if not is_file_related_to_projects(file_metadata, projects):
                continue

            # Related but incomplete files are reported and skipped.
            if not metadata_has_url_and_title(file_metadata):
                LOG.warning(
                    "Skipping OKP file %s: does not have URL and/or title.",
                    candidate,
                )
                continue

            yield candidate
        except ValueError as e:
            LOG.warning("Skipping OKP file %s: %s", candidate, e)
110+
111+
112+
def parse_metadata(filepath: str) -> dict[str, Any]:
113+
"""Extract metadata from the OKP file.
114+
115+
This function reads the content of the OKP file, extracts the metadata block
116+
enclosed by `+++` markers, and parses it as TOML. It returns the metadata as a
117+
dictionary.
118+
119+
Args:
120+
filepath (str): The path to the OKP file.
121+
122+
Returns:
123+
dict[str, Any]: The parsed metadata from the OKP file.
124+
125+
Raises:
126+
ValueError: If no metadata block is found in the file.
127+
128+
"""
129+
with open(filepath, "rb") as f:
130+
content = f.read()
131+
132+
# Extract everything between the +++ markers
133+
match = re.search(rb"\+{3,}\s*(.*?)\s*\+{3,}", content, re.S)
134+
if not match:
135+
raise ValueError(f"No metadata found in {filepath}")
136+
137+
metadata_block = match.group(1)
138+
return tomllib.loads(metadata_block.decode("utf-8"))
139+
140+
141+
class OKPMetadataProcessor(MetadataProcessor):
    """Metadata processor that reads URL and title from OKP file metadata."""

    def url_function(self, file_path: str) -> str:
        """Return the `reference_url` stored under `extra` in the file metadata."""
        extra = parse_metadata(file_path)["extra"]
        return extra["reference_url"]

    def get_file_title(self, file_path: str) -> str:
        """Return the `title` stored in the file metadata."""
        return parse_metadata(file_path)["title"]

tests/test_okp.py

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
# Copyright 2025 Red Hat, Inc.
2+
# All Rights Reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License"); you may
5+
# not use this file except in compliance with the License. You may obtain
6+
# a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13+
# License for the specific language governing permissions and limitations
14+
# under the License.
15+
16+
import unittest
17+
from unittest import mock
18+
19+
from lightspeed_rag_content import okp
20+
21+
22+
class TestOKP(unittest.TestCase):
    """Test cases for OKP utility methods."""

    def test_metadata_has_url_and_title(self):
        """Test that the metadata has both URL and title."""
        md = {
            "title": "Example Title",
            "extra": {
                "reference_url": "https://fake.url/for/example",
            },
        }
        self.assertTrue(okp.metadata_has_url_and_title(md))

    def test_metadata_has_url_and_title_false(self):
        """Test that metadata missing the URL or the title is rejected."""
        # No URL
        md = {
            "title": "Example Title",
            "extra": {},
        }
        self.assertFalse(okp.metadata_has_url_and_title(md))

        # No title
        md = {
            "extra": {
                "reference_url": "https://fake.url/for/example",
            },
        }
        self.assertFalse(okp.metadata_has_url_and_title(md))

    def test_is_file_related_to_projects(self):
        """Test if the file is related to specific projects."""
        metadata = {
            "extra": {
                "portal_product_names": ["Project Foo", "Project Bar"],
            },
        }
        projects = ["foo", "bar"]
        self.assertTrue(okp.is_file_related_to_projects(metadata, projects))

        projects = ["spongebob"]
        self.assertFalse(okp.is_file_related_to_projects(metadata, projects))

    def test_parse_metadata(self):
        """Test parsing metadata from an OKP file."""
        content = b"""
+++
title = '''Example Title'''
path = "/errata/FAKE-1234"
template = "erratum.html"
[extra]
document_kind="errata"
original_title='''FAKE-1234 - Bugs in fake project'''
solr_index="true"
modified="2003-02-05T00:00:00Z"
issued="2003-02-06T00:00:00Z"
id="FAKE-1234"
reference_url="https://fake.url/for/example"
view_uri="/errata/FAKE-1234"
portal_advisory_type="Bug Fix Advisory"
portal_synopsis='''Bugs in fake project'''
portal_severity="None"
portal_product_names=["Product Foo","Product Bar"]
portal_product_filter=["Product Foo|Product Bar|2|ia64","Product Foo|Product Bar|2|ia64"]
+++
"""

        m = mock.mock_open(read_data=content)
        with mock.patch("builtins.open", m):
            metadata = okp.parse_metadata("fake_file.md")

        # Check if the metadata is parsed correctly
        expected_metadata = {
            "title": "Example Title",
            "path": "/errata/FAKE-1234",
            "template": "erratum.html",
            "extra": {
                "document_kind": "errata",
                "original_title": "FAKE-1234 - Bugs in fake project",
                "solr_index": "true",
                "modified": "2003-02-05T00:00:00Z",
                "issued": "2003-02-06T00:00:00Z",
                "id": "FAKE-1234",
                "reference_url": "https://fake.url/for/example",
                "view_uri": "/errata/FAKE-1234",
                "portal_advisory_type": "Bug Fix Advisory",
                "portal_synopsis": "Bugs in fake project",
                "portal_severity": "None",
                "portal_product_names": ["Product Foo", "Product Bar"],
                "portal_product_filter": [
                    "Product Foo|Product Bar|2|ia64",
                    "Product Foo|Product Bar|2|ia64",
                ],
            },
        }
        self.assertEqual(metadata, expected_metadata)

    @mock.patch("lightspeed_rag_content.okp.Path.glob")
    def test_yield_files_related_to_projects(self, mock_glob):
        """Test yielding files related to specific projects."""
        mock_glob.return_value = [
            "file1.md",
            "file2.md",
            "file3.md",  # Should be ignored, not related to the projects
        ]

        parsed_metadata = [
            {
                "title": "File 1",
                "extra": {
                    "reference_url": "https://example.com/file1",
                    "portal_product_names": ["Project Foo"],
                },
            },
            {
                "title": "File 2",
                "extra": {
                    "reference_url": "https://example.com/file2",
                    "portal_product_names": ["Project Bar"],
                },
            },
            {
                "title": "File 3",
                "extra": {
                    "portal_product_names": ["Project Baz"],
                },
            },
        ]

        projects = ["foo", "bar"]
        # Patch within a context manager so the real parse_metadata is
        # restored afterwards; assigning to okp.parse_metadata directly would
        # leak the mock into every test that runs later.
        with mock.patch.object(
            okp, "parse_metadata", side_effect=parsed_metadata
        ) as mock_parse_metadata:
            files = list(okp.yield_files_related_to_projects("/fake", projects))

        # Check that the correct files are yielded
        self.assertEqual(len(files), 2)
        self.assertIn("file1.md", files)
        self.assertIn("file2.md", files)

        # Check that parse_metadata was called with the correct file paths
        mock_parse_metadata.assert_any_call("file1.md")
        mock_parse_metadata.assert_any_call("file2.md")
164+
165+
166+
@mock.patch(
    "lightspeed_rag_content.okp.parse_metadata",
    return_value={
        "title": "Test Title",
        "extra": {
            "reference_url": "https://example.com",
        },
    },
)
class TestOKPMetadataProcessor(unittest.TestCase):
    """Tests for OKPMetadataProcessor with ``parse_metadata`` patched out."""

    def setUp(self):
        """Create the processor instance used by each test."""
        self.okp_mp = okp.OKPMetadataProcessor()

    def test_url_function(self, mock_parse_metadata):
        """The URL comes from the ``reference_url`` of the parsed metadata."""
        url = self.okp_mp.url_function("/fake/path/errata_file.md")
        self.assertEqual(url, "https://example.com")

    def test_get_file_title(self, mock_parse_metadata):
        """The title comes from the ``title`` of the parsed metadata."""
        title = self.okp_mp.get_file_title("/fake/path/errata_file.md")
        self.assertEqual(title, "Test Title")

0 commit comments

Comments
 (0)