openshift
diff --git a/scripts/portal-fetcher/openshift-docs-downloader.py b/scripts/doc_downloader/downloader.py
rename from scripts/portal-fetcher/openshift-docs-downloader.py
rename to scripts/doc_downloader/downloader.py
diff --git a/scripts/html_chunking/chunker.py b/scripts/html_chunking/chunker.py
Lines changed: 11 additions & 11 deletions
diff --git a/scripts/html_chunking/example.py b/scripts/html_chunking/example.py
Lines changed: 1 addition & 2 deletions
diff --git a/scripts/html_chunking/html-stripper.py b/scripts/html_chunking/html-stripper.py
Lines changed: 2 additions & 2 deletions
diff --git a/scripts/html_chunking/parser.py b/scripts/html_chunking/parser.py
Lines changed: 8 additions & 8 deletions
diff --git a/scripts/html_chunking/tokenizer.py b/scripts/html_chunking/tokenizer.py
Lines changed: 3 additions & 3 deletions
diff --git a/scripts/html_embeddings/chunk_html.py b/scripts/html_embeddings/chunk_html.py
Lines changed: 5 additions & 5 deletions
diff --git a/scripts/html_embeddings/download_docs.py b/scripts/html_embeddings/download_docs.py
Lines changed: 5 additions & 5 deletions
diff --git a/scripts/html_embeddings/generate_embeddings.py b/scripts/html_embeddings/generate_embeddings.py
Lines changed: 11 additions & 11 deletions
@@ -4,7 +4,7 @@
 This module splits HTML content into chunks based on semantic boundaries.
 """
 
-from typing import List, Dict, Any, Optional
+from typing import Any, Optional
 from dataclasses import dataclass
 from bs4 import BeautifulSoup, Tag, NavigableString
 import warnings
@@ -23,7 +23,7 @@ class ChunkingOptions:
 class Chunk:
     """A dataclass to hold a chunk's text and its associated metadata."""
     text: str
-    metadata: Dict[str, Any]
+    metadata: dict[str, Any]
 
 
 def find_first_anchor(chunk_soup: BeautifulSoup) -> Optional[str]:
@@ -50,7 +50,7 @@ def chunk_html(
     max_token_limit: int = 500,
     count_tag_tokens: bool = True,
     **kwargs
-) -> List[Chunk]:
+) -> list[Chunk]:
     """
     Chunks the given HTML content and generates metadata with source URLs and anchors.
 
@@ -132,7 +132,7 @@ def chunk_html(
     return final_chunks
 
 
-def _split_element_by_children(element: Tag, options: ChunkingOptions) -> List[str]:
+def _split_element_by_children(element: Tag, options: ChunkingOptions) -> list[str]:
     chunks, current_chunk_elements, current_tokens = [], [], 0
     children = [child for child in element.children if not (isinstance(child, NavigableString) and not child.strip())]
 
@@ -176,7 +176,7 @@ def _split_element_by_children(element: Tag, options: ChunkingOptions) -> List[s
     if current_chunk_elements: chunks.append("".join(current_chunk_elements))
     return chunks
 
-def _split_element_by_children_no_grouping(element: Tag, options: ChunkingOptions) -> List[str]:
+def _split_element_by_children_no_grouping(element: Tag, options: ChunkingOptions) -> list[str]:
     chunks, current_chunk_elements, current_tokens = [], [], 0
     children = [child for child in element.children if not (isinstance(child, NavigableString) and not child.strip())]
 
@@ -210,7 +210,7 @@ def _split_element_by_children_no_grouping(element: Tag, options: ChunkingOption
     if current_chunk_elements: chunks.append("".join(current_chunk_elements))
     return chunks
 
-def _split_definition_list(div_element: Tag, options: ChunkingOptions) -> List[str]:
+def _split_definition_list(div_element: Tag, options: ChunkingOptions) -> list[str]:
     dl = div_element.find('dl')
     if not dl: return _split_element_by_children(div_element, options)
     chunks, current_chunk_pairs_html, current_tokens = [], [], 0
@@ -234,7 +234,7 @@ def _split_definition_list(div_element: Tag, options: ChunkingOptions) -> List[s
     if current_chunk_pairs_html: chunks.append(f'<div class="variablelist"><dl>{"".join(current_chunk_pairs_html)}</dl></div>')
     return chunks if chunks else [str(div_element)]
 
-def _split_table(table: Tag, options: ChunkingOptions) -> List[str]:
+def _split_table(table: Tag, options: ChunkingOptions) -> list[str]:
     chunks, header = [], table.find('thead')
     rows = table.find_all('tr')
     header_rows_ids = set(id(r) for r in header.find_all('tr')) if header else set()
@@ -259,7 +259,7 @@ def _split_table(table: Tag, options: ChunkingOptions) -> List[str]:
     if current_chunk_rows: chunks.append(table_open + header_html + "".join(current_chunk_rows) + table_close)
     return chunks if chunks else [str(table)]
 
-def _split_oversized_row(row: Tag, table_open: str, header_html: str, table_close: str, options: ChunkingOptions) -> List[str]:
+def _split_oversized_row(row: Tag, table_open: str, header_html: str, table_close: str, options: ChunkingOptions) -> list[str]:
     row_chunks, cells = [], row.find_all(['td', 'th'], recursive=False)
     cell_sub_chunks = [_split_element_by_children(cell, options) for cell in cells]
     max_len = max(len(c) for c in cell_sub_chunks) if cell_sub_chunks else 0
@@ -274,7 +274,7 @@ def _split_oversized_row(row: Tag, table_open: str, header_html: str, table_clos
         row_chunks.append(table_open + header_html + new_row_html + table_close)
     return row_chunks
 
-def _split_list(list_element: Tag, options: ChunkingOptions) -> List[str]:
+def _split_list(list_element: Tag, options: ChunkingOptions) -> list[str]:
     chunks, items = [], list_element.find_all('li', recursive=False)
     list_attrs = " ".join([f'{k}="{v}"' for k, v in list_element.attrs.items()])
     list_open, list_close = f"<{list_element.name} {list_attrs}>", f"</{list_element.name}>"
@@ -299,7 +299,7 @@ def _split_list(list_element: Tag, options: ChunkingOptions) -> List[str]:
     if current_chunk_items: chunks.append(list_open + "".join(current_chunk_items) + list_close)
     return chunks if chunks else [str(list_element)]
 
-def _split_code(pre_element: Tag, options: ChunkingOptions) -> List[str]:
+def _split_code(pre_element: Tag, options: ChunkingOptions) -> list[str]:
     chunks, code_text = [], pre_element.get_text()
     lines = code_text.split('\n')
     attrs = " ".join([f'{k}="{v}"' for k, v in pre_element.attrs.items()])
@@ -316,7 +316,7 @@ def _split_code(pre_element: Tag, options: ChunkingOptions) -> List[str]:
     if current_chunk_lines: chunks.append(open_tag + "\n".join(current_chunk_lines) + close_tag)
     return chunks if chunks else [str(pre_element)]
 
-def _linear_split(html_content: str, options: ChunkingOptions) -> List[str]:
+def _linear_split(html_content: str, options: ChunkingOptions) -> list[str]:
     warnings.warn("Using linear character split as a fallback for an oversized, indivisible chunk.")
     chars_per_chunk = int(options.max_token_limit * DEFAULT_CHARS_PER_TOKEN_RATIO)
     return [html_content[i:i + chars_per_chunk] for i in range(0, len(html_content), chars_per_chunk)]
@@ -9,7 +9,6 @@
 import argparse
 import os
 import sys
-from typing import List
 
 # Imports are deferred into main() to support running the script
 # from within its directory, which requires a sys.path modification first.
@@ -39,7 +38,7 @@ def create_argument_parser() -> argparse.ArgumentParser:
     )
     return parser
 
-def generate_html_report(output_path: str, chunks: List['Chunk'], original_tokens: int, max_token_limit: int, count_html_tokens_func) -> None:
+def generate_html_report(output_path: str, chunks: list['Chunk'], original_tokens: int, max_token_limit: int, count_html_tokens_func) -> None:
     """Generates a single HTML file containing all chunks for review."""
     print(f"\nSaving all chunks to a single file: {output_path}...")
 
 
@@ -12,7 +12,7 @@
 import os
 import sys
 from pathlib import Path
-from typing import List, Optional
+from typing import Optional
 from bs4 import BeautifulSoup, Tag
 
 # Constants
@@ -168,7 +168,7 @@ def process_directory(
     output_dir: str,
     strip_mode: str,
     strip_links: bool,
-    exclusion_list: Optional[List[str]] = None
+    exclusion_list: Optional[list[str]] = None
 ) -> None:
     """
     Process all HTML files in a directory and its subdirectories.
 
@@ -2,7 +2,7 @@
 HTML parser module for identifying document structure.
 """
 
-from typing import List, Dict, Tuple, Optional, Union, Set, Any
+from typing import Tuple, Optional, Union, Set, Any
 from bs4 import BeautifulSoup, Tag, NavigableString
 import re
 from dataclasses import dataclass, field
@@ -26,8 +26,8 @@ class HtmlSection:
     heading_tag: Optional[Tag] = None
     level: int = 0
     parent: Optional['HtmlSection'] = None
-    content: List[Union[Tag, NavigableString, 'HtmlSection']] = field(default_factory=list)
-    children: List['HtmlSection'] = field(default_factory=list)
+    content: list[Union[Tag, NavigableString, 'HtmlSection']] = field(default_factory=list)
+    children: list['HtmlSection'] = field(default_factory=list)
     html: str = ""
 
     def add_content(self, content: Union[Tag, NavigableString, 'HtmlSection']) -> None:
@@ -196,7 +196,7 @@ def _get_element_position(soup: BeautifulSoup, element: Tag) -> int:
     return -1
 
 
-def _flatten_sections(section: HtmlSection) -> List[HtmlSection]:
+def _flatten_sections(section: HtmlSection) -> list[HtmlSection]:
     """
     Flatten a section hierarchy into a list.
     
@@ -212,7 +212,7 @@ def _flatten_sections(section: HtmlSection) -> List[HtmlSection]:
     return result
 
 
-def identify_special_sections(soup: BeautifulSoup) -> Dict[str, List[Dict]]:
+def identify_special_sections(soup: BeautifulSoup) -> dict[str, list[dict]]:
     """
     Identify special sections in the HTML that need special handling during chunking.
     
@@ -238,7 +238,7 @@ def identify_special_sections(soup: BeautifulSoup) -> Dict[str, List[Dict]]:
         }
 
 
-def identify_procedure_sections(soup: BeautifulSoup) -> List[Dict]:
+def identify_procedure_sections(soup: BeautifulSoup) -> list[dict]:
     """
     Identify procedure sections in the HTML.
     
@@ -413,7 +413,7 @@ def _find_closest_heading(element: Tag) -> Optional[Tag]:
     return None
 
 
-def identify_code_blocks(soup: BeautifulSoup) -> List[Dict]:
+def identify_code_blocks(soup: BeautifulSoup) -> list[dict]:
     """
     Identify code blocks in the HTML.
     
@@ -488,7 +488,7 @@ def identify_code_blocks(soup: BeautifulSoup) -> List[Dict]:
         return []
 
 
-def identify_tables(soup: BeautifulSoup) -> List[Dict]:
+def identify_tables(soup: BeautifulSoup) -> list[dict]:
     """
     Identify tables in the HTML.
     
 
@@ -2,7 +2,7 @@
 Tokenizer module for HTML content.
 """
 
-from typing import Optional, List, Union, Callable
+from typing import Optional, Union, Callable
 from bs4 import BeautifulSoup
 import re
 import sys
@@ -30,7 +30,7 @@ class TokenCounter:
     A class that counts tokens in text using LlamaIndex or HuggingFace tokenizers.
     """
 
-    def __init__(self, custom_tokenizer: Optional[Callable[[str], List[str]]] = None) -> None:
+    def __init__(self, custom_tokenizer: Optional[Callable[[str], list[str]]] = None) -> None:
         """
         Initialize the TokenCounter.
         
@@ -168,7 +168,7 @@ def count_html_tokens(html_text: str, count_tag_tokens: bool = True) -> int:
     return token_counter.count_html_tokens(html_text, count_tag_tokens)
 
 
-def set_custom_tokenizer(tokenizer_func: Callable[[str], List[str]]) -> None:
+def set_custom_tokenizer(tokenizer_func: Callable[[str], list[str]]) -> None:
     """
     Set a custom tokenizer function for the global TokenCounter instance.
     
 
@@ -8,7 +8,7 @@
 import sys
 from bs4 import BeautifulSoup
 from pathlib import Path
-from typing import Dict, List, Any, Optional
+from typing import Any, Optional
 from urllib.parse import urlparse
 
 # Import the HTML chunking library
@@ -177,7 +177,7 @@ def chunk_single_html_file(
             logger.warning("Empty file: %s", input_file)
             return True, 0
 
-        chunks: List[Chunk] = chunk_html(
+        chunks: list[Chunk] = chunk_html(
             html_content=html_content,
             source_url=source_url,
             max_token_limit=max_token_limit,
@@ -231,7 +231,7 @@ def chunk_single_html_file(
         return False, 0
 
 
-def extract_metadata_from_path(file_path: Path, product_slug: str) -> Dict[str, Any]:
+def extract_metadata_from_path(file_path: Path, product_slug: str) -> dict[str, Any]:
     """
     Extract metadata from file path.
 
@@ -268,7 +268,7 @@ def extract_metadata_from_path(file_path: Path, product_slug: str) -> Dict[str,
     }
 
 
-def validate_chunks(output_dir: Path, max_token_limit: int) -> Dict[str, Any]:
+def validate_chunks(output_dir: Path, max_token_limit: int) -> dict[str, Any]:
     """
     Validate generated chunks.
 
@@ -344,7 +344,7 @@ def validate_chunks(output_dir: Path, max_token_limit: int) -> Dict[str, Any]:
     return validation_results
 
 
-def get_chunking_stats(output_dir: Path) -> Dict[str, Any]:
+def get_chunking_stats(output_dir: Path) -> dict[str, Any]:
     """
     Get statistics about chunked documents.
 
 
@@ -10,13 +10,13 @@
 from typing import Optional
 
 portal_fetcher_path = (
-    Path(__file__).parent.parent / "portal-fetcher" / "openshift-docs-downloader.py"
+    Path(__file__).parent.parent / "doc_downloader" / "downloader.py"
 )
 spec = importlib.util.spec_from_file_location(
-    "openshift_docs_downloader", portal_fetcher_path
+    "downloader", portal_fetcher_path
 )
-openshift_docs_downloader = importlib.util.module_from_spec(spec)
-spec.loader.exec_module(openshift_docs_downloader)
+downloader = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(downloader)
 
 
 def download_documentation(
@@ -61,7 +61,7 @@ def download_documentation(
 
     try:
         verification_passed, toc_verification_passed, elapsed_time = asyncio.run(
-            openshift_docs_downloader.run_downloader(
+            downloader.run_downloader(
                 base_url=base_url,
                 output_dir=str(output_dir),
                 concurrency=concurrency,
 
@@ -16,7 +16,7 @@
 import sys
 import time
 from pathlib import Path
-from typing import Optional, Dict, Any, List
+from typing import Optional, Any
 
 import re
 import yaml
@@ -180,7 +180,7 @@ def parse_arguments() -> argparse.Namespace:
     return parser.parse_args()
 
 
-def setup_environment(args: argparse.Namespace, product: Dict[str, Any]) -> tuple[Dict[str, Any], Dict[str, Path]]:
+def setup_environment(args: argparse.Namespace, product: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Path]]:
     """Setup environment and validate dependencies."""
     logger = setup_logging(verbose=args.verbose)
 
@@ -237,8 +237,8 @@ def setup_environment(args: argparse.Namespace, product: Dict[str, Any]) -> tupl
 
 def run_download_step(
     args: argparse.Namespace,
-    paths: Dict[str, Path],
-    product: Dict[str, Any],
+    paths: dict[str, Path],
+    product: dict[str, Any],
     logger,
 ) -> bool:
     """Run the documentation download step."""
@@ -278,7 +278,7 @@ def run_download_step(
         return False
 
 
-def run_strip_step(args: argparse.Namespace, paths: Dict[str, Path], logger) -> bool:
+def run_strip_step(args: argparse.Namespace, paths: dict[str, Path], logger) -> bool:
     """Run the HTML stripping step."""
     downloads_dir = paths["downloads"]
     stripped_dir = paths["stripped"]
@@ -299,8 +299,8 @@ def run_strip_step(args: argparse.Namespace, paths: Dict[str, Path], logger) ->
 
 def run_chunk_step(
     args: argparse.Namespace,
-    paths: Dict[str, Path],
-    product: Dict[str, Any],
+    paths: dict[str, Path],
+    product: dict[str, Any],
     logger,
 ) -> bool:
     """Run the HTML chunking step."""
@@ -335,7 +335,7 @@ def run_chunk_step(
         return False
 
 
-def run_runbooks_step(args: argparse.Namespace, paths: Dict[str, Path], logger) -> bool:
+def run_runbooks_step(args: argparse.Namespace, paths: dict[str, Path], logger) -> bool:
     """Run the runbooks processing step."""
     if args.skip_runbooks:
         logger.info("Skipping runbooks processing")
@@ -366,7 +366,7 @@ def run_runbooks_step(args: argparse.Namespace, paths: Dict[str, Path], logger)
         return False
 
 
-def load_chunks_as_nodes(chunks_dir: Path, logger) -> List[TextNode]:
+def load_chunks_as_nodes(chunks_dir: Path, logger) -> list[TextNode]:
     """Load all chunks as TextNode objects."""
     nodes = []
 
@@ -399,8 +399,8 @@ def load_chunks_as_nodes(chunks_dir: Path, logger) -> List[TextNode]:
 
 def run_embedding_step(
     args: argparse.Namespace,
-    paths: Dict[str, Path],
-    product: Dict[str, Any],
+    paths: dict[str, Path],
+    product: dict[str, Any],
     logger,
 ) -> bool:
     """Run the embedding generation step."""