
Commit 205dd78

Implement feedback across all chunking codebase

1 parent 49e0a37

9 files changed (+42, -43 lines)

scripts/doc_downloader/downloader.py
Lines changed: 3 additions & 4 deletions

@@ -45,7 +45,7 @@ def get_local_path(url: str, output_dir: Path, base_url: Optional[str] = None) -
     parsed_url = urlparse(url)
     path = parsed_url.path
 
-    if base_url:
+    if base_url is not None:
         parsed_base_url = urlparse(base_url)
         base_path = parsed_base_url.path
         # If base_url is a directory-like path, ensure it ends with a slash for clean prefix removal
@@ -57,7 +57,7 @@ def get_local_path(url: str, output_dir: Path, base_url: Optional[str] = None) -
 
     path = path.lstrip("/")
 
-    if not path or path.endswith('/'):
+    if path == "" or path.endswith('/'):
         path = path + "index.html"
 
     local_path = output_dir / path
@@ -163,7 +163,7 @@ async def download_page(
             logger.error("Error downloading %s: %s", url, e)
 
             if attempt < max_retries - 1:
-                await asyncio.sleep(1)
+                await asyncio.sleep(2 ** attempt)
 
     record_download(db_path, url, str(local_path), status="failed")
     return url, False
@@ -227,7 +227,6 @@ async def run_downloader(
     concurrency: int,
     force: bool,
     max_retries: int,
-    **kwargs,  # Absorb unused arguments
 ) -> tuple[bool, bool, float]:
     """Run the complete download process."""
     output_dir_path = Path(output_dir)
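Note on the @@ -163 hunk: the fixed one-second pause between retries becomes exponential backoff, so with the zero-based attempt counter the waits grow 1 s, 2 s, 4 s, and so on. A minimal sketch of the pattern, with the download body elided and the loop names assumed from the hunk context:

    import asyncio

    async def download_with_backoff(url: str, max_retries: int = 3) -> bool:
        for attempt in range(max_retries):
            try:
                ...  # perform the actual download here
                return True
            except Exception:
                if attempt < max_retries - 1:
                    await asyncio.sleep(2 ** attempt)  # 1, 2, 4, ... seconds
        return False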

scripts/html_chunking/chunker.py
Lines changed: 5 additions & 5 deletions

@@ -41,7 +41,7 @@ def find_first_anchor(chunk_soup: BeautifulSoup) -> Optional[str]:
 def get_document_title(soup: BeautifulSoup) -> str:
     """Extracts the document title from the <title> tag."""
     title_tag = soup.find('title')
-    return title_tag.get_text(strip=True) if title_tag else "Untitled"
+    return title_tag.get_text(strip=True) if title_tag is not None else "Untitled"
 
 
 def chunk_html(
@@ -212,7 +212,7 @@ def _split_element_by_children_no_grouping(element: Tag, options: ChunkingOption
 
 def _split_definition_list(div_element: Tag, options: ChunkingOptions) -> list[str]:
     dl = div_element.find('dl')
-    if not dl: return _split_element_by_children(div_element, options)
+    if dl is None: return _split_element_by_children(div_element, options)
     chunks, current_chunk_pairs_html, current_tokens = [], [], 0
     pairs, children, i = [], list(dl.children), 0
     while i < len(children):
@@ -237,11 +237,11 @@ def _split_definition_list(div_element: Tag, options: ChunkingOptions) -> list[s
 def _split_table(table: Tag, options: ChunkingOptions) -> list[str]:
     chunks, header = [], table.find('thead')
     rows = table.find_all('tr')
-    header_rows_ids = set(id(r) for r in header.find_all('tr')) if header else set()
+    header_rows_ids = set(id(r) for r in header.find_all('tr')) if header is not None else set()
     body_rows = [row for row in rows if id(row) not in header_rows_ids]
     table_attrs = " ".join([f'{k}="{v}"' for k, v in table.attrs.items()])
     table_open, table_close = f"<table {table_attrs}>", "</table>"
-    header_html = str(header) if header else ""
+    header_html = str(header) if header is not None else ""
     base_tokens = count_html_tokens(table_open + header_html + table_close, options.count_tag_tokens)
     current_chunk_rows, current_tokens = [], base_tokens
     for row in body_rows:
@@ -285,7 +285,7 @@ def _split_list(list_element: Tag, options: ChunkingOptions) -> list[str]:
         if item_tokens + base_tokens > options.max_token_limit:
             if current_chunk_items: chunks.append(list_open + "".join(current_chunk_items) + list_close)
             item_soup = BeautifulSoup(item_html, 'html.parser').li
-            if item_soup:
+            if item_soup is not None:
                 sub_chunks = _split_element_by_children(item_soup, options)
                 for sub_chunk in sub_chunks: chunks.append(list_open + f"<li>{sub_chunk}</li>" + list_close)
             else: chunks.append(list_open + item_html + list_close)
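These chunker edits follow the commit-wide pattern: BeautifulSoup's find() returns None when nothing matches, and comparing against None directly is the unambiguous test for that, as PEP 8 recommends. A plain-Python illustration of why the two spellings can disagree:

    # Values where "if x:" and "if x is not None:" diverge:
    for x in (None, "", [], 0):
        print(repr(x), bool(x), x is not None)
    # None  False  False
    # ''    False  True
    # []    False  True
    # 0     False  True

Only None fails the is-not-None test, so the rewritten guards no longer conflate a missing value with an empty-but-present one.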

scripts/html_chunking/html-stripper.py
Lines changed: 1 addition & 1 deletion

@@ -48,7 +48,7 @@ def _aggressively_strip_tags_and_attributes(soup: BeautifulSoup, strip_links: bo
         state = rh_alert.get('state', 'note')
         content_div = rh_alert.find('div', slot=None) or rh_alert.find('p')
 
-        if content_div:
+        if content_div is not None:
             new_div = soup.new_tag('div')
             new_div['class'] = f'alert alert-{state}'
             new_div.extend(content_div.contents)

scripts/html_chunking/parser.py
Lines changed: 23 additions & 23 deletions

@@ -41,7 +41,7 @@ def add_child(self, child: 'HtmlSection') -> None:
 
     def get_heading_text(self) -> str:
         """Get the text of the heading for this section."""
-        if self.heading_tag:
+        if self.heading_tag is not None:
            try:
                return self.heading_tag.get_text(strip=True)
            except Exception:
@@ -56,7 +56,7 @@ def get_html(self) -> str:
 
         try:
             result = []
-            if self.heading_tag:
+            if self.heading_tag is not None:
                 result.append(str(self.heading_tag))
 
             for item in self.content:
@@ -69,7 +69,7 @@ def get_html(self) -> str:
             return self.html
         except Exception as e:
             # Fallback in case of error
-            if self.heading_tag:
+            if self.heading_tag is not None:
                 return str(self.heading_tag)
             return ""
 
@@ -97,7 +97,7 @@ def parse_html(html_content: str) -> Tuple[BeautifulSoup, HtmlSection]:
     # First pass: identify all headings
     all_headings = []
     for element in body.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
-        if element.name and re.match(r'h[1-6]$', element.name):
+        if element.name is not None and re.match(r'h[1-6]$', element.name):
             level = int(element.name[1])
             all_headings.append((element, level))
 
@@ -144,16 +144,16 @@ def parse_html(html_content: str) -> Tuple[BeautifulSoup, HtmlSection]:
         current_section = root_section
 
         for element in body.children:
-            if not element or (isinstance(element, str) and not element.strip()):
+            if element is None or (isinstance(element, str) and not element.strip()):
                 continue
 
             is_section_start = False
             new_level = None
 
-            if isinstance(element, Tag) and element.name and re.match(r'h[1-6]$', element.name):
+            if isinstance(element, Tag) and element.name is not None and re.match(r'h[1-6]$', element.name):
                 level = int(element.name[1])
                 for section in _flatten_sections(root_section):
-                    if section.heading_tag and section.heading_tag == element:
+                    if section.heading_tag is not None and section.heading_tag == element:
                         is_section_start = True
                         new_level = level
                         current_section = section
@@ -163,7 +163,7 @@ def parse_html(html_content: str) -> Tuple[BeautifulSoup, HtmlSection]:
                 current_section.add_content(element)
     else:
         for element in body.children:
-            if element:
+            if element is not None:
                 root_section.add_content(element)
 
     return soup, root_section
@@ -173,7 +173,7 @@ def parse_html(html_content: str) -> Tuple[BeautifulSoup, HtmlSection]:
         root_section = HtmlSection()
 
         for element in soup.children:
-            if element:
+            if element is not None:
                 root_section.add_content(element)
 
         return soup, root_section
@@ -261,15 +261,15 @@ def identify_procedure_sections(soup: BeautifulSoup) -> list[dict]:
     # Multiple ways to identify procedures
     procedure_markers = []
     for element in soup.find_all(string=lambda text: text and "Procedure" in text):
-        if element.parent and element.parent.name not in ('script', 'style'):
+        if element.parent is not None and element.parent.name not in ('script', 'style'):
             procedure_markers.append(element)
 
     ordered_lists = soup.find_all('ol')
 
     processed_lists = set()
 
     for marker in procedure_markers:
-        if not marker or not marker.parent:
+        if marker is None or marker.parent is None:
             continue
 
         ol = None
@@ -283,24 +283,24 @@ def identify_procedure_sections(soup: BeautifulSoup) -> list[dict]:
                 break
 
             next_sibling = current.find_next_sibling()
-            if next_sibling and next_sibling.name == 'ol':
+            if next_sibling is not None and next_sibling.name == 'ol':
                 ol = next_sibling
                 break
 
             ol_in_children = current.find('ol')
-            if ol_in_children:
+            if ol_in_children is not None:
                 ol = ol_in_children
                 break
 
             current = current.find_next()
 
-        if not ol or id(ol) in processed_lists:
+        if ol is None or id(ol) in processed_lists:
            continue
 
        heading = _find_closest_heading(marker.parent)
 
        intro = []
-        if heading:
+        if heading is not None:
            current = heading.find_next()
            while current and current != marker.parent and current != ol:
                if current.name not in ('script', 'style'):
@@ -342,7 +342,7 @@ def identify_procedure_sections(soup: BeautifulSoup) -> list[dict]:
 
         # Find introduction elements
         intro = []
-        if heading:
+        if heading is not None:
             current = heading.find_next()
             while current and current != ol:
                 if current.name not in ('script', 'style'):
@@ -364,7 +364,7 @@ def identify_procedure_sections(soup: BeautifulSoup) -> list[dict]:
                 break
 
         # Add to procedures if it looks like a procedure
-        if heading or marker or prerequisites:
+        if heading is not None or marker is not None or prerequisites is not None:
             procedures.append({
                 'heading': heading,
                 'intro': intro,
@@ -392,7 +392,7 @@ def _find_closest_heading(element: Tag) -> Optional[Tag]:
     Returns:
         The closest heading, or None if not found.
     """
-    if not element:
+    if element is None:
         return None
 
     # Check previous siblings
@@ -407,7 +407,7 @@ def _find_closest_heading(element: Tag) -> Optional[Tag]:
             return current
 
     # Check parent's previous siblings
-    if element.parent:
+    if element.parent is not None:
         return _find_closest_heading(element.parent)
 
     return None
@@ -443,7 +443,7 @@ def identify_code_blocks(soup: BeautifulSoup) -> list[dict]:
         processed_tags.add(id(pre))
 
         # Skip if this pre tag is inside a code tag that we'll process later
-        if pre.parent and pre.parent.name == 'code' and pre.parent in code_tags:
+        if pre.parent is not None and pre.parent.name == 'code' and pre.parent in code_tags:
             continue
 
         # Find the previous paragraph for context
@@ -467,7 +467,7 @@ def identify_code_blocks(soup: BeautifulSoup) -> list[dict]:
         processed_tags.add(id(code))
 
         # Skip if this code tag is inside a pre tag that we've already processed
-        if code.parent and code.parent.name == 'pre' and id(code.parent) in processed_tags:
+        if code.parent is not None and code.parent.name == 'pre' and id(code.parent) in processed_tags:
             continue
 
         # Find the previous paragraph for context
@@ -510,7 +510,7 @@ def identify_tables(soup: BeautifulSoup) -> list[dict]:
         if tag.name == 'rh-table':
             # Look for nested table
             nested_table = tag.find('table')
-            if nested_table:
+            if nested_table is not None:
                 expanded_tables.append(nested_table)
         else:
             expanded_tables.append(tag)
@@ -534,7 +534,7 @@ def identify_tables(soup: BeautifulSoup) -> list[dict]:
         rows = []
         try:
             # Get rows not in header
-            if header:
+            if header is not None:
                 header_rows = set(id(row) for row in header.find_all('tr'))
                 all_rows = table.find_all('tr', limit=MAX_TABLE_ROWS)
                 rows = [row for row in all_rows if id(row) not in header_rows]
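One behavioral nuance in the @@ -364 hunk: assuming prerequisites is a list, as the surrounding code suggests, the old truthy test rejected an empty list while the new comparison accepts one, so the rewritten condition can classify more sections as procedures:

    prerequisites = []             # hypothetical: a procedure with no prerequisites found
    bool(prerequisites)            # False -> the old truthy test skipped this
    prerequisites is not None      # True  -> the new test accepts it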

scripts/html_chunking/tokenizer.py
Lines changed: 1 addition & 1 deletion

@@ -93,7 +93,7 @@ def count_tokens(self, text: str) -> int:
             tokens = self.tokenizer(text)
             return len(tokens)
         except Exception as e:
-            if "sequence length is longer than the specified maximum" in str(e) and self.hf_tokenizer:
+            if "sequence length is longer than the specified maximum" in str(e) and self.hf_tokenizer is not None:
                 warnings.warn(f"Token counting using full text failed: {e}. Using manual chunking approach.")
                 words = re.findall(r'\b\w+\b|[^\w\s]', text)
                 chunks = [' '.join(words[i:i+WORDS_PER_BATCH]) for i in range(0, len(words), WORDS_PER_BATCH)]
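The guarded fallback splits the text into word batches before tokenizing, so no single call exceeds the model's maximum sequence length. A rough sketch of the idea, with a hypothetical WORDS_PER_BATCH value and a generic tokenize callable (the real counting loop continues past the end of this hunk):

    import re

    WORDS_PER_BATCH = 300  # hypothetical; the real constant is defined in tokenizer.py

    def count_tokens_batched(text: str, tokenize) -> int:
        # Tokenize in word-sized batches and sum the per-batch counts.
        words = re.findall(r'\b\w+\b|[^\w\s]', text)
        return sum(
            len(tokenize(' '.join(words[i:i + WORDS_PER_BATCH])))
            for i in range(0, len(words), WORDS_PER_BATCH)
        )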

scripts/html_embeddings/chunk_html.py
Lines changed: 1 addition & 1 deletion

@@ -88,7 +88,7 @@ def chunk_html_documents(
         doc_specific_output_dir = output_dir / doc_name
 
         # Construct the source URL, which will be passed to the chunker.
-        if doc_url:
+        if doc_url is not None:
             source_url = doc_url
         else:
             source_url = f"https://docs.redhat.com/en/documentation/{product_slug}/{product_version}/html-single/{doc_name}/"

scripts/html_embeddings/download_docs.py
Lines changed: 1 addition & 1 deletion

@@ -49,7 +49,7 @@ def download_documentation(
     """
     logger = logging.getLogger(__name__)
 
-    if doc_url:
+    if doc_url is not None:
         base_url = doc_url
     elif specific_doc:
         base_url = f"https://docs.redhat.com/en/documentation/{product_slug}/{version}/html-single/{specific_doc}"

scripts/html_embeddings/strip_html.py
Lines changed: 1 addition & 1 deletion

@@ -47,7 +47,7 @@ def strip_html_content(
     # This logic is now more direct to avoid path ambiguities.
     # It iterates through found files and constructs a precise output path.
     for input_file in html_files:
-        if exclusion_list and str(input_file) in exclusion_list:
+        if exclusion_list is not None and str(input_file) in exclusion_list:
             logger.debug("Skipping excluded file: %s", input_file)
             continue
 

scripts/html_embeddings/utils.py
Lines changed: 6 additions & 6 deletions

@@ -89,18 +89,18 @@ def create_directory_structure(
         product_version = ""
 
     product['slug'] = product_slug
-    product['version'] = product_version if product_version else "latest"
+    product['version'] = product_version if product_version is not None else "latest"
 
     cache_path = Path(cache_dir)
     output_path = Path(output_dir)
 
-    base_cache_path = cache_path / product_slug / product_version if product_version else cache_path / product_slug
+    base_cache_path = cache_path / product_slug / product_version if product_version is not None else cache_path / product_slug
 
     downloads_dir = base_cache_path / "downloads"
     stripped_dir = base_cache_path / "stripped"
     chunks_dir = base_cache_path / "chunks"
 
-    if specific_doc:
+    if specific_doc is not None:
         downloads_dir = downloads_dir / specific_doc
         stripped_dir = stripped_dir / specific_doc
 
@@ -169,19 +169,19 @@ def get_cache_info(
 ) -> dict[str, int]:
     """Get information about cached files."""
     base_path = cache_dir / "downloads" / version
-    if specific_doc:
+    if specific_doc is not None:
         base_path = base_path / specific_doc
 
     downloads_count = get_file_count(base_path, "*.html")
 
     base_path = cache_dir / "stripped" / version
-    if specific_doc:
+    if specific_doc is not None:
         base_path = base_path / specific_doc
 
     stripped_count = get_file_count(base_path, "*.html")
 
     base_path = cache_dir / "chunks" / version
-    if specific_doc:
+    if specific_doc is not None:
         base_path = base_path / specific_doc
 
     chunks_count = get_file_count(base_path, "*.json")
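Worth double-checking in the @@ -89 hunk: the context line above the first change sets product_version = "" on at least one path, and an empty string is not None, so the rewritten conditionals treat it as a real version and product['version'] becomes "" rather than "latest". In isolation:

    product_version = ""
    old = product_version if product_version else "latest"              # 'latest'
    new = product_version if product_version is not None else "latest"  # ''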
