20
20
def chunk_html_documents (
21
21
input_dir : Path ,
22
22
output_dir : Path ,
23
+ product_slug : str ,
24
+ product_version : str ,
25
+ doc_url : Optional [str ] = None ,
23
26
max_token_limit : int = 380 ,
24
27
count_tag_tokens : bool = True ,
25
28
keep_siblings_together : bool = True ,
@@ -31,6 +34,9 @@ def chunk_html_documents(
31
34
Args:
32
35
input_dir: Directory containing stripped HTML files
33
36
output_dir: Directory to save chunked content
37
+ product_slug: Product URL slug
38
+ product_version: Product version
39
+ doc_url: The full URL to the documentation.
34
40
max_token_limit: Maximum tokens per chunk
35
41
count_tag_tokens: Whether to count HTML tags in token count
36
42
keep_siblings_together: Keep sibling sections together when possible
@@ -75,22 +81,24 @@ def chunk_html_documents(
75
81
logger .debug ("Processing %s" , html_file )
76
82
77
83
# The doc name is the parent directory of the html file.
78
- # The version is the parent of that directory.
79
84
doc_name = html_file .parent .name
80
- version = html_file .parent .parent .name
81
85
82
86
# The main output_dir is the version dir, e.g., '.../chunks/4.18'.
83
87
# We create the doc-specific subdirectory here.
84
88
doc_specific_output_dir = output_dir / doc_name
85
89
86
90
# Construct the source URL, which will be passed to the chunker.
87
- source_url = f"https://docs.redhat.com/en/documentation/openshift_container_platform/{ version } /html-single/{ doc_name } /"
91
+ if doc_url :
92
+ source_url = doc_url
93
+ else :
94
+ source_url = f"https://docs.redhat.com/en/documentation/{ product_slug } /{ product_version } /html-single/{ doc_name } /"
88
95
89
96
success , chunk_count = chunk_single_html_file (
90
97
input_file = html_file .resolve (),
91
- output_dir = doc_specific_output_dir , # Pass the new doc-specific dir
92
- input_base_dir = base_dir_for_relative_paths .resolve (), # Pass the consistent version-level base path
98
+ output_dir = doc_specific_output_dir ,
99
+ input_base_dir = base_dir_for_relative_paths .resolve (),
93
100
source_url = source_url ,
101
+ product_slug = product_slug ,
94
102
max_token_limit = max_token_limit ,
95
103
count_tag_tokens = count_tag_tokens ,
96
104
keep_siblings_together = keep_siblings_together ,
@@ -133,7 +141,8 @@ def chunk_single_html_file(
133
141
input_file : Path ,
134
142
output_dir : Path ,
135
143
input_base_dir : Path ,
136
- source_url : str , # Add source_url parameter
144
+ source_url : str ,
145
+ product_slug : str ,
137
146
max_token_limit : int = 380 ,
138
147
count_tag_tokens : bool = True ,
139
148
keep_siblings_together : bool = True ,
@@ -147,6 +156,7 @@ def chunk_single_html_file(
147
156
output_dir: Directory to save chunks
148
157
input_base_dir: Base directory for input files (for relative path calculation)
149
158
source_url: The public URL of the source document
159
+ product_slug: Product URL slug
150
160
max_token_limit: Maximum tokens per chunk
151
161
count_tag_tokens: Whether to count HTML tags
152
162
keep_siblings_together: Keep sibling sections together
@@ -179,7 +189,7 @@ def chunk_single_html_file(
179
189
return True , 0
180
190
181
191
relative_path = input_file .relative_to (input_base_dir )
182
- base_metadata = extract_metadata_from_path (relative_path )
192
+ base_metadata = extract_metadata_from_path (relative_path , product_slug )
183
193
184
194
chunk_count = 0
185
195
for i , chunk_obj in enumerate (chunks ):
@@ -221,12 +231,13 @@ def chunk_single_html_file(
221
231
return False , 0
222
232
223
233
224
- def extract_metadata_from_path (file_path : Path ) -> Dict [str , Any ]:
234
+ def extract_metadata_from_path (file_path : Path , product_slug : str ) -> Dict [str , Any ]:
225
235
"""
226
236
Extract metadata from file path.
227
237
228
238
Args:
229
239
file_path: Relative path to the file
240
+ product_slug: Product URL slug
230
241
231
242
Returns:
232
243
Dictionary with extracted metadata
@@ -253,7 +264,7 @@ def extract_metadata_from_path(file_path: Path) -> Dict[str, Any]:
253
264
"doc_id" : doc_id ,
254
265
"version" : version ,
255
266
"file_path" : str (file_path ),
256
- "doc_type" : "openshift_documentation " ,
267
+ "doc_type" : f" { product_slug } _documentation " ,
257
268
}
258
269
259
270
0 commit comments