Skip to content

Commit 98b6e69

Browse files
committed
Generalize product docs fetching from OpenShift to any product(s)
1 parent 5d79087 commit 98b6e69

File tree

7 files changed

+479
-1289
lines changed

7 files changed

+479
-1289
lines changed

scripts/html_embeddings/README.md

Lines changed: 43 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,17 +28,47 @@ Standard:
2828
```bash
2929
# Generate embeddings for OpenShift 4.18
3030
python scripts/html_embeddings/generate_embeddings.py \
31-
--version 4.18 \
31+
--doc-url-slug "openshift_container_platform" \
32+
--doc-url-version "4.18" \
3233
--output-dir ./vector_db \
3334
--model-dir ./embeddings_model
3435
```
3536

37+
Specify a full custom URL:
38+
39+
```bash
40+
# Generate embeddings for RHEL 10
41+
python scripts/html_embeddings/generate_embeddings.py \
42+
--doc-url "https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/10" \
43+
--output-dir ./vector_db \
44+
--model-dir ./embeddings_model
45+
```
46+
47+
Use a configuration file:
48+
49+
```bash
50+
# Generate embeddings for multiple products
51+
python scripts/html_embeddings/generate_embeddings.py \
52+
--config-file ./docs_config.yaml
53+
```
54+
55+
Example `docs_config.yaml`:
56+
```yaml
57+
products:
58+
- slug: "openshift_container_platform"
59+
version: "4.18"
60+
- slug: "red_hat_enterprise_linux"
61+
version: "9"
62+
- url: "https://access.redhat.com/documentation/en-us/red_hat_ansible_automation_platform/2.4/html-single/red_hat_ansible_automation_platform_installation_guide"
63+
```
64+
3665
Specify custom index name instead of the auto-generated one:
3766
3867
```bash
3968
# Generate embeddings for OpenShift 4.18
4069
python scripts/html_embeddings/generate_embeddings.py \
41-
--version 4.18 \
70+
--doc-url-slug "openshift_container_platform" \
71+
--doc-url-version "4.18" \
4272
--output-dir ./vector_db \
4373
--index ocp-4.18 \
4474
--model-dir ./embeddings_model
@@ -47,9 +77,10 @@ python scripts/html_embeddings/generate_embeddings.py \
4777
Process only a specific document and skip runbooks (good for quick testing):
4878

4979
```bash
50-
# Process only monitoring documentation
80+
# Process only observability documentation
5181
python scripts/html_embeddings/generate_embeddings.py \
52-
--version 4.18 \
82+
--doc-url-slug "openshift_container_platform" \
83+
--doc-url-version "4.18" \
5384
--specific-doc observability_overview \
5485
--output-dir ./vector_db \
5586
--model-dir ./embeddings_model \
@@ -61,7 +92,8 @@ Use cached downloads:
6192
```bash
6293
# Use previously downloaded files
6394
python scripts/html_embeddings/generate_embeddings.py \
64-
--version 4.18 \
95+
--doc-url-slug "openshift_container_platform" \
96+
--doc-url-version "4.18" \
6597
--use-cached-downloads \
6698
--output-dir ./vector_db \
6799
--model-dir ./embeddings_model
@@ -72,7 +104,8 @@ Set a custom token limit (default is the same 380 as in Markdown-based chunking)
72104
```bash
73105
# Set the token limit
74106
python generate_embeddings.py \
75-
--version 4.18 \
107+
--doc-url-slug "openshift_container_platform" \
108+
--doc-url-version "4.18" \
76109
--chunk 380 \
77110
--output-dir ./vector_db \
78111
--model-dir ./embeddings_model
@@ -82,7 +115,10 @@ python generate_embeddings.py \
82115

83116
### Main arguments
84117

85-
- `--version` - OpenShift version (required, e.g., "4.18")
118+
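- `--doc-url` - Full URL of the documentation to process, either a product/version page (e.g., "https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/10") or a single document's html-single page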
119+
- `--doc-url-slug` - Product documentation slug (e.g., "openshift_container_platform")
120+
- `--doc-url-version` - Product version, used together with `--doc-url-slug` (default: "latest")
121+
- `--config-file` - Path to a YAML or JSON configuration file listing the products to process
86122
- `--index` - Index name (optional, e.g., "ocp-4.18")
87123
- `--output-dir` - Vector DB output directory (default: "./vector_db")
88124
- `--model-dir` - Embedding model directory (default: "./embeddings_model")

scripts/html_embeddings/chunk_html.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
def chunk_html_documents(
2121
input_dir: Path,
2222
output_dir: Path,
23+
product_slug: str,
24+
product_version: str,
25+
doc_url: Optional[str] = None,
2326
max_token_limit: int = 380,
2427
count_tag_tokens: bool = True,
2528
keep_siblings_together: bool = True,
@@ -31,6 +34,9 @@ def chunk_html_documents(
3134
Args:
3235
input_dir: Directory containing stripped HTML files
3336
output_dir: Directory to save chunked content
37+
product_slug: Product URL slug
38+
product_version: Product version
39+
doc_url: The full URL to the documentation.
3440
max_token_limit: Maximum tokens per chunk
3541
count_tag_tokens: Whether to count HTML tags in token count
3642
keep_siblings_together: Keep sibling sections together when possible
@@ -75,22 +81,24 @@ def chunk_html_documents(
7581
logger.debug("Processing %s", html_file)
7682

7783
# The doc name is the parent directory of the html file.
78-
# The version is the parent of that directory.
7984
doc_name = html_file.parent.name
80-
version = html_file.parent.parent.name
8185

8286
# The main output_dir is the version dir, e.g., '.../chunks/4.18'.
8387
# We create the doc-specific subdirectory here.
8488
doc_specific_output_dir = output_dir / doc_name
8589

8690
# Construct the source URL, which will be passed to the chunker.
87-
source_url = f"https://docs.redhat.com/en/documentation/openshift_container_platform/{version}/html-single/{doc_name}/"
91+
if doc_url:
92+
source_url = doc_url
93+
else:
94+
source_url = f"https://docs.redhat.com/en/documentation/{product_slug}/{product_version}/html-single/{doc_name}/"
8895

8996
success, chunk_count = chunk_single_html_file(
9097
input_file=html_file.resolve(),
91-
output_dir=doc_specific_output_dir, # Pass the new doc-specific dir
92-
input_base_dir=base_dir_for_relative_paths.resolve(), # Pass the consistent version-level base path
98+
output_dir=doc_specific_output_dir,
99+
input_base_dir=base_dir_for_relative_paths.resolve(),
93100
source_url=source_url,
101+
product_slug=product_slug,
94102
max_token_limit=max_token_limit,
95103
count_tag_tokens=count_tag_tokens,
96104
keep_siblings_together=keep_siblings_together,
@@ -133,7 +141,8 @@ def chunk_single_html_file(
133141
input_file: Path,
134142
output_dir: Path,
135143
input_base_dir: Path,
136-
source_url: str, # Add source_url parameter
144+
source_url: str,
145+
product_slug: str,
137146
max_token_limit: int = 380,
138147
count_tag_tokens: bool = True,
139148
keep_siblings_together: bool = True,
@@ -147,6 +156,7 @@ def chunk_single_html_file(
147156
output_dir: Directory to save chunks
148157
input_base_dir: Base directory for input files (for relative path calculation)
149158
source_url: The public URL of the source document
159+
product_slug: Product URL slug
150160
max_token_limit: Maximum tokens per chunk
151161
count_tag_tokens: Whether to count HTML tags
152162
keep_siblings_together: Keep sibling sections together
@@ -179,7 +189,7 @@ def chunk_single_html_file(
179189
return True, 0
180190

181191
relative_path = input_file.relative_to(input_base_dir)
182-
base_metadata = extract_metadata_from_path(relative_path)
192+
base_metadata = extract_metadata_from_path(relative_path, product_slug)
183193

184194
chunk_count = 0
185195
for i, chunk_obj in enumerate(chunks):
@@ -221,12 +231,13 @@ def chunk_single_html_file(
221231
return False, 0
222232

223233

224-
def extract_metadata_from_path(file_path: Path) -> Dict[str, Any]:
234+
def extract_metadata_from_path(file_path: Path, product_slug: str) -> Dict[str, Any]:
225235
"""
226236
Extract metadata from file path.
227237
228238
Args:
229239
file_path: Relative path to the file
240+
product_slug: Product URL slug
230241
231242
Returns:
232243
Dictionary with extracted metadata
@@ -253,7 +264,7 @@ def extract_metadata_from_path(file_path: Path) -> Dict[str, Any]:
253264
"doc_id": doc_id,
254265
"version": version,
255266
"file_path": str(file_path),
256-
"doc_type": "openshift_documentation",
267+
"doc_type": f"{product_slug}_documentation",
257268
}
258269

259270

scripts/html_embeddings/download_docs.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121

2222
def download_documentation(
2323
version: str,
24+
product_slug: str,
25+
doc_url: Optional[str] = None,
2426
specific_doc: Optional[str] = None,
2527
output_dir: Path = Path("./downloads"),
2628
cache_existing: bool = True,
@@ -29,10 +31,12 @@ def download_documentation(
2931
fail_on_error: bool = False,
3032
) -> bool:
3133
"""
32-
Download OpenShift documentation.
34+
Download documentation.
3335
3436
Args:
35-
version: OpenShift version (e.g., "4.18")
37+
version: Product version (e.g., "4.18")
38+
product_slug: Product URL slug (e.g., "openshift_container_platform")
39+
doc_url: The full URL to the documentation page.
3640
specific_doc: Optional specific document to download
3741
output_dir: Directory to save downloaded files
3842
cache_existing: Whether to use cached downloads
@@ -45,10 +49,12 @@ def download_documentation(
4549
"""
4650
logger = logging.getLogger(__name__)
4751

48-
if specific_doc:
49-
base_url = f"https://docs.redhat.com/en/documentation/openshift_container_platform/{version}/html-single/{specific_doc}"
52+
if doc_url:
53+
base_url = doc_url
54+
elif specific_doc:
55+
base_url = f"https://docs.redhat.com/en/documentation/{product_slug}/{version}/html-single/{specific_doc}"
5056
else:
51-
base_url = f"https://docs.redhat.com/en/documentation/openshift_container_platform/{version}"
57+
base_url = f"https://docs.redhat.com/en/documentation/{product_slug}/{version}"
5258

5359
logger.info("Downloading from: %s", base_url)
5460
logger.info("Output directory: %s", output_dir)

0 commit comments

Comments
 (0)