Skip to content

Commit 49e0a37

Browse files
committed
Dict -> dict; List -> list; if/elif -> match/case
1 parent 98b6e69 commit 49e0a37

File tree

13 files changed

+74
-74
lines changed

13 files changed

+74
-74
lines changed

scripts/html_chunking/chunker.py

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
This module splits HTML content into chunks based on semantic boundaries.
55
"""
66

7-
from typing import List, Dict, Any, Optional
7+
from typing import Any, Optional
88
from dataclasses import dataclass
99
from bs4 import BeautifulSoup, Tag, NavigableString
1010
import warnings
@@ -23,7 +23,7 @@ class ChunkingOptions:
2323
class Chunk:
2424
"""A dataclass to hold a chunk's text and its associated metadata."""
2525
text: str
26-
metadata: Dict[str, Any]
26+
metadata: dict[str, Any]
2727

2828

2929
def find_first_anchor(chunk_soup: BeautifulSoup) -> Optional[str]:
@@ -50,7 +50,7 @@ def chunk_html(
5050
max_token_limit: int = 500,
5151
count_tag_tokens: bool = True,
5252
**kwargs
53-
) -> List[Chunk]:
53+
) -> list[Chunk]:
5454
"""
5555
Chunks the given HTML content and generates metadata with source URLs and anchors.
5656
@@ -132,7 +132,7 @@ def chunk_html(
132132
return final_chunks
133133

134134

135-
def _split_element_by_children(element: Tag, options: ChunkingOptions) -> List[str]:
135+
def _split_element_by_children(element: Tag, options: ChunkingOptions) -> list[str]:
136136
chunks, current_chunk_elements, current_tokens = [], [], 0
137137
children = [child for child in element.children if not (isinstance(child, NavigableString) and not child.strip())]
138138

@@ -176,7 +176,7 @@ def _split_element_by_children(element: Tag, options: ChunkingOptions) -> List[s
176176
if current_chunk_elements: chunks.append("".join(current_chunk_elements))
177177
return chunks
178178

179-
def _split_element_by_children_no_grouping(element: Tag, options: ChunkingOptions) -> List[str]:
179+
def _split_element_by_children_no_grouping(element: Tag, options: ChunkingOptions) -> list[str]:
180180
chunks, current_chunk_elements, current_tokens = [], [], 0
181181
children = [child for child in element.children if not (isinstance(child, NavigableString) and not child.strip())]
182182

@@ -210,7 +210,7 @@ def _split_element_by_children_no_grouping(element: Tag, options: ChunkingOption
210210
if current_chunk_elements: chunks.append("".join(current_chunk_elements))
211211
return chunks
212212

213-
def _split_definition_list(div_element: Tag, options: ChunkingOptions) -> List[str]:
213+
def _split_definition_list(div_element: Tag, options: ChunkingOptions) -> list[str]:
214214
dl = div_element.find('dl')
215215
if not dl: return _split_element_by_children(div_element, options)
216216
chunks, current_chunk_pairs_html, current_tokens = [], [], 0
@@ -234,7 +234,7 @@ def _split_definition_list(div_element: Tag, options: ChunkingOptions) -> List[s
234234
if current_chunk_pairs_html: chunks.append(f'<div class="variablelist"><dl>{"".join(current_chunk_pairs_html)}</dl></div>')
235235
return chunks if chunks else [str(div_element)]
236236

237-
def _split_table(table: Tag, options: ChunkingOptions) -> List[str]:
237+
def _split_table(table: Tag, options: ChunkingOptions) -> list[str]:
238238
chunks, header = [], table.find('thead')
239239
rows = table.find_all('tr')
240240
header_rows_ids = set(id(r) for r in header.find_all('tr')) if header else set()
@@ -259,7 +259,7 @@ def _split_table(table: Tag, options: ChunkingOptions) -> List[str]:
259259
if current_chunk_rows: chunks.append(table_open + header_html + "".join(current_chunk_rows) + table_close)
260260
return chunks if chunks else [str(table)]
261261

262-
def _split_oversized_row(row: Tag, table_open: str, header_html: str, table_close: str, options: ChunkingOptions) -> List[str]:
262+
def _split_oversized_row(row: Tag, table_open: str, header_html: str, table_close: str, options: ChunkingOptions) -> list[str]:
263263
row_chunks, cells = [], row.find_all(['td', 'th'], recursive=False)
264264
cell_sub_chunks = [_split_element_by_children(cell, options) for cell in cells]
265265
max_len = max(len(c) for c in cell_sub_chunks) if cell_sub_chunks else 0
@@ -274,7 +274,7 @@ def _split_oversized_row(row: Tag, table_open: str, header_html: str, table_clos
274274
row_chunks.append(table_open + header_html + new_row_html + table_close)
275275
return row_chunks
276276

277-
def _split_list(list_element: Tag, options: ChunkingOptions) -> List[str]:
277+
def _split_list(list_element: Tag, options: ChunkingOptions) -> list[str]:
278278
chunks, items = [], list_element.find_all('li', recursive=False)
279279
list_attrs = " ".join([f'{k}="{v}"' for k, v in list_element.attrs.items()])
280280
list_open, list_close = f"<{list_element.name} {list_attrs}>", f"</{list_element.name}>"
@@ -299,7 +299,7 @@ def _split_list(list_element: Tag, options: ChunkingOptions) -> List[str]:
299299
if current_chunk_items: chunks.append(list_open + "".join(current_chunk_items) + list_close)
300300
return chunks if chunks else [str(list_element)]
301301

302-
def _split_code(pre_element: Tag, options: ChunkingOptions) -> List[str]:
302+
def _split_code(pre_element: Tag, options: ChunkingOptions) -> list[str]:
303303
chunks, code_text = [], pre_element.get_text()
304304
lines = code_text.split('\n')
305305
attrs = " ".join([f'{k}="{v}"' for k, v in pre_element.attrs.items()])
@@ -316,7 +316,7 @@ def _split_code(pre_element: Tag, options: ChunkingOptions) -> List[str]:
316316
if current_chunk_lines: chunks.append(open_tag + "\n".join(current_chunk_lines) + close_tag)
317317
return chunks if chunks else [str(pre_element)]
318318

319-
def _linear_split(html_content: str, options: ChunkingOptions) -> List[str]:
319+
def _linear_split(html_content: str, options: ChunkingOptions) -> list[str]:
320320
warnings.warn("Using linear character split as a fallback for an oversized, indivisible chunk.")
321321
chars_per_chunk = int(options.max_token_limit * DEFAULT_CHARS_PER_TOKEN_RATIO)
322322
return [html_content[i:i + chars_per_chunk] for i in range(0, len(html_content), chars_per_chunk)]

scripts/html_chunking/example.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,6 @@
99
import argparse
1010
import os
1111
import sys
12-
from typing import List
1312

1413
# Imports are deferred into main() to support running the script
1514
# from within its directory, which requires a sys.path modification first.
@@ -39,7 +38,7 @@ def create_argument_parser() -> argparse.ArgumentParser:
3938
)
4039
return parser
4140

42-
def generate_html_report(output_path: str, chunks: List['Chunk'], original_tokens: int, max_token_limit: int, count_html_tokens_func) -> None:
41+
def generate_html_report(output_path: str, chunks: list['Chunk'], original_tokens: int, max_token_limit: int, count_html_tokens_func) -> None:
4342
"""Generates a single HTML file containing all chunks for review."""
4443
print(f"\nSaving all chunks to a single file: {output_path}...")
4544

scripts/html_chunking/html-stripper.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
import os
1313
import sys
1414
from pathlib import Path
15-
from typing import List, Optional
15+
from typing import Optional
1616
from bs4 import BeautifulSoup, Tag
1717

1818
# Constants
@@ -168,7 +168,7 @@ def process_directory(
168168
output_dir: str,
169169
strip_mode: str,
170170
strip_links: bool,
171-
exclusion_list: Optional[List[str]] = None
171+
exclusion_list: Optional[list[str]] = None
172172
) -> None:
173173
"""
174174
Process all HTML files in a directory and its subdirectories.

scripts/html_chunking/parser.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
HTML parser module for identifying document structure.
33
"""
44

5-
from typing import List, Dict, Tuple, Optional, Union, Set, Any
5+
from typing import Tuple, Optional, Union, Set, Any
66
from bs4 import BeautifulSoup, Tag, NavigableString
77
import re
88
from dataclasses import dataclass, field
@@ -26,8 +26,8 @@ class HtmlSection:
2626
heading_tag: Optional[Tag] = None
2727
level: int = 0
2828
parent: Optional['HtmlSection'] = None
29-
content: List[Union[Tag, NavigableString, 'HtmlSection']] = field(default_factory=list)
30-
children: List['HtmlSection'] = field(default_factory=list)
29+
content: list[Union[Tag, NavigableString, 'HtmlSection']] = field(default_factory=list)
30+
children: list['HtmlSection'] = field(default_factory=list)
3131
html: str = ""
3232

3333
def add_content(self, content: Union[Tag, NavigableString, 'HtmlSection']) -> None:
@@ -196,7 +196,7 @@ def _get_element_position(soup: BeautifulSoup, element: Tag) -> int:
196196
return -1
197197

198198

199-
def _flatten_sections(section: HtmlSection) -> List[HtmlSection]:
199+
def _flatten_sections(section: HtmlSection) -> list[HtmlSection]:
200200
"""
201201
Flatten a section hierarchy into a list.
202202
@@ -212,7 +212,7 @@ def _flatten_sections(section: HtmlSection) -> List[HtmlSection]:
212212
return result
213213

214214

215-
def identify_special_sections(soup: BeautifulSoup) -> Dict[str, List[Dict]]:
215+
def identify_special_sections(soup: BeautifulSoup) -> dict[str, list[dict]]:
216216
"""
217217
Identify special sections in the HTML that need special handling during chunking.
218218
@@ -238,7 +238,7 @@ def identify_special_sections(soup: BeautifulSoup) -> Dict[str, List[Dict]]:
238238
}
239239

240240

241-
def identify_procedure_sections(soup: BeautifulSoup) -> List[Dict]:
241+
def identify_procedure_sections(soup: BeautifulSoup) -> list[dict]:
242242
"""
243243
Identify procedure sections in the HTML.
244244
@@ -413,7 +413,7 @@ def _find_closest_heading(element: Tag) -> Optional[Tag]:
413413
return None
414414

415415

416-
def identify_code_blocks(soup: BeautifulSoup) -> List[Dict]:
416+
def identify_code_blocks(soup: BeautifulSoup) -> list[dict]:
417417
"""
418418
Identify code blocks in the HTML.
419419
@@ -488,7 +488,7 @@ def identify_code_blocks(soup: BeautifulSoup) -> List[Dict]:
488488
return []
489489

490490

491-
def identify_tables(soup: BeautifulSoup) -> List[Dict]:
491+
def identify_tables(soup: BeautifulSoup) -> list[dict]:
492492
"""
493493
Identify tables in the HTML.
494494

scripts/html_chunking/tokenizer.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
Tokenizer module for HTML content.
33
"""
44

5-
from typing import Optional, List, Union, Callable
5+
from typing import Optional, Union, Callable
66
from bs4 import BeautifulSoup
77
import re
88
import sys
@@ -30,7 +30,7 @@ class TokenCounter:
3030
A class that counts tokens in text using LlamaIndex or HuggingFace tokenizers.
3131
"""
3232

33-
def __init__(self, custom_tokenizer: Optional[Callable[[str], List[str]]] = None) -> None:
33+
def __init__(self, custom_tokenizer: Optional[Callable[[str], list[str]]] = None) -> None:
3434
"""
3535
Initialize the TokenCounter.
3636
@@ -168,7 +168,7 @@ def count_html_tokens(html_text: str, count_tag_tokens: bool = True) -> int:
168168
return token_counter.count_html_tokens(html_text, count_tag_tokens)
169169

170170

171-
def set_custom_tokenizer(tokenizer_func: Callable[[str], List[str]]) -> None:
171+
def set_custom_tokenizer(tokenizer_func: Callable[[str], list[str]]) -> None:
172172
"""
173173
Set a custom tokenizer function for the global TokenCounter instance.
174174

scripts/html_embeddings/chunk_html.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
import sys
99
from bs4 import BeautifulSoup
1010
from pathlib import Path
11-
from typing import Dict, List, Any, Optional
11+
from typing import Any, Optional
1212
from urllib.parse import urlparse
1313

1414
# Import the HTML chunking library
@@ -177,7 +177,7 @@ def chunk_single_html_file(
177177
logger.warning("Empty file: %s", input_file)
178178
return True, 0
179179

180-
chunks: List[Chunk] = chunk_html(
180+
chunks: list[Chunk] = chunk_html(
181181
html_content=html_content,
182182
source_url=source_url,
183183
max_token_limit=max_token_limit,
@@ -231,7 +231,7 @@ def chunk_single_html_file(
231231
return False, 0
232232

233233

234-
def extract_metadata_from_path(file_path: Path, product_slug: str) -> Dict[str, Any]:
234+
def extract_metadata_from_path(file_path: Path, product_slug: str) -> dict[str, Any]:
235235
"""
236236
Extract metadata from file path.
237237
@@ -268,7 +268,7 @@ def extract_metadata_from_path(file_path: Path, product_slug: str) -> Dict[str,
268268
}
269269

270270

271-
def validate_chunks(output_dir: Path, max_token_limit: int) -> Dict[str, Any]:
271+
def validate_chunks(output_dir: Path, max_token_limit: int) -> dict[str, Any]:
272272
"""
273273
Validate generated chunks.
274274
@@ -344,7 +344,7 @@ def validate_chunks(output_dir: Path, max_token_limit: int) -> Dict[str, Any]:
344344
return validation_results
345345

346346

347-
def get_chunking_stats(output_dir: Path) -> Dict[str, Any]:
347+
def get_chunking_stats(output_dir: Path) -> dict[str, Any]:
348348
"""
349349
Get statistics about chunked documents.
350350

scripts/html_embeddings/download_docs.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -10,13 +10,13 @@
1010
from typing import Optional
1111

1212
portal_fetcher_path = (
13-
Path(__file__).parent.parent / "portal-fetcher" / "openshift-docs-downloader.py"
13+
Path(__file__).parent.parent / "doc_downloader" / "downloader.py"
1414
)
1515
spec = importlib.util.spec_from_file_location(
16-
"openshift_docs_downloader", portal_fetcher_path
16+
"downloader", portal_fetcher_path
1717
)
18-
openshift_docs_downloader = importlib.util.module_from_spec(spec)
19-
spec.loader.exec_module(openshift_docs_downloader)
18+
downloader = importlib.util.module_from_spec(spec)
19+
spec.loader.exec_module(downloader)
2020

2121

2222
def download_documentation(
@@ -61,7 +61,7 @@ def download_documentation(
6161

6262
try:
6363
verification_passed, toc_verification_passed, elapsed_time = asyncio.run(
64-
openshift_docs_downloader.run_downloader(
64+
downloader.run_downloader(
6565
base_url=base_url,
6666
output_dir=str(output_dir),
6767
concurrency=concurrency,

scripts/html_embeddings/generate_embeddings.py

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
import sys
1717
import time
1818
from pathlib import Path
19-
from typing import Optional, Dict, Any, List
19+
from typing import Optional, Any
2020

2121
import re
2222
import yaml
@@ -180,7 +180,7 @@ def parse_arguments() -> argparse.Namespace:
180180
return parser.parse_args()
181181

182182

183-
def setup_environment(args: argparse.Namespace, product: Dict[str, Any]) -> tuple[Dict[str, Any], Dict[str, Path]]:
183+
def setup_environment(args: argparse.Namespace, product: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Path]]:
184184
"""Setup environment and validate dependencies."""
185185
logger = setup_logging(verbose=args.verbose)
186186

@@ -237,8 +237,8 @@ def setup_environment(args: argparse.Namespace, product: Dict[str, Any]) -> tupl
237237

238238
def run_download_step(
239239
args: argparse.Namespace,
240-
paths: Dict[str, Path],
241-
product: Dict[str, Any],
240+
paths: dict[str, Path],
241+
product: dict[str, Any],
242242
logger,
243243
) -> bool:
244244
"""Run the documentation download step."""
@@ -278,7 +278,7 @@ def run_download_step(
278278
return False
279279

280280

281-
def run_strip_step(args: argparse.Namespace, paths: Dict[str, Path], logger) -> bool:
281+
def run_strip_step(args: argparse.Namespace, paths: dict[str, Path], logger) -> bool:
282282
"""Run the HTML stripping step."""
283283
downloads_dir = paths["downloads"]
284284
stripped_dir = paths["stripped"]
@@ -299,8 +299,8 @@ def run_strip_step(args: argparse.Namespace, paths: Dict[str, Path], logger) ->
299299

300300
def run_chunk_step(
301301
args: argparse.Namespace,
302-
paths: Dict[str, Path],
303-
product: Dict[str, Any],
302+
paths: dict[str, Path],
303+
product: dict[str, Any],
304304
logger,
305305
) -> bool:
306306
"""Run the HTML chunking step."""
@@ -335,7 +335,7 @@ def run_chunk_step(
335335
return False
336336

337337

338-
def run_runbooks_step(args: argparse.Namespace, paths: Dict[str, Path], logger) -> bool:
338+
def run_runbooks_step(args: argparse.Namespace, paths: dict[str, Path], logger) -> bool:
339339
"""Run the runbooks processing step."""
340340
if args.skip_runbooks:
341341
logger.info("Skipping runbooks processing")
@@ -366,7 +366,7 @@ def run_runbooks_step(args: argparse.Namespace, paths: Dict[str, Path], logger)
366366
return False
367367

368368

369-
def load_chunks_as_nodes(chunks_dir: Path, logger) -> List[TextNode]:
369+
def load_chunks_as_nodes(chunks_dir: Path, logger) -> list[TextNode]:
370370
"""Load all chunks as TextNode objects."""
371371
nodes = []
372372

@@ -399,8 +399,8 @@ def load_chunks_as_nodes(chunks_dir: Path, logger) -> List[TextNode]:
399399

400400
def run_embedding_step(
401401
args: argparse.Namespace,
402-
paths: Dict[str, Path],
403-
product: Dict[str, Any],
402+
paths: dict[str, Path],
403+
product: dict[str, Any],
404404
logger,
405405
) -> bool:
406406
"""Run the embedding generation step."""

0 commit comments

Comments
 (0)