
Commit 205dd78

Implement feedback across all chunking codebase

1 parent 49e0a37

9 files changed (+42, -43 lines)

scripts/doc_downloader/downloader.py
Lines changed: 3 additions & 4 deletions

@@ -45,7 +45,7 @@ def get_local_path(url: str, output_dir: Path, base_url: Optional[str] = None) -
     parsed_url = urlparse(url)
     path = parsed_url.path
 
-    if base_url:
+    if base_url is not None:
         parsed_base_url = urlparse(base_url)
         base_path = parsed_base_url.path
         # If base_url is a directory-like path, ensure it ends with a slash for clean prefix removal
@@ -57,7 +57,7 @@ def get_local_path(url: str, output_dir: Path, base_url: Optional[str] = None) -
 
     path = path.lstrip("/")
 
-    if not path or path.endswith('/'):
+    if path == "" or path.endswith('/'):
         path = path + "index.html"
 
     local_path = output_dir / path
@@ -163,7 +163,7 @@ async def download_page(
             logger.error("Error downloading %s: %s", url, e)
 
             if attempt < max_retries - 1:
-                await asyncio.sleep(1)
+                await asyncio.sleep(2 ** attempt)
 
     record_download(db_path, url, str(local_path), status="failed")
     return url, False
@@ -227,7 +227,6 @@ async def run_downloader(
     concurrency: int,
     force: bool,
     max_retries: int,
-    **kwargs,  # Absorb unused arguments
 ) -> tuple[bool, bool, float]:
     """Run the complete download process."""
     output_dir_path = Path(output_dir)
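Note on the @@ -163 hunk: the fixed one-second pause between retries becomes exponential backoff, so with the zero-based attempt counter the waits grow 1 s, 2 s, 4 s, and so on. A minimal sketch of the pattern, with the download body elided and the loop names assumed from the hunk context:

    import asyncio

    async def download_with_backoff(url: str, max_retries: int = 3) -> bool:
        for attempt in range(max_retries):
            try:
                ...  # perform the actual download here
                return True
            except Exception:
                if attempt < max_retries - 1:
                    await asyncio.sleep(2 ** attempt)  # 1, 2, 4, ... seconds
        return False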

scripts/html_chunking/chunker.py
Lines changed: 5 additions & 5 deletions

@@ -41,7 +41,7 @@ def find_first_anchor(chunk_soup: BeautifulSoup) -> Optional[str]:
 def get_document_title(soup: BeautifulSoup) -> str:
     """Extracts the document title from the <title> tag."""
     title_tag = soup.find('title')
-    return title_tag.get_text(strip=True) if title_tag else "Untitled"
+    return title_tag.get_text(strip=True) if title_tag is not None else "Untitled"
 
 
 def chunk_html(
@@ -212,7 +212,7 @@ def _split_element_by_children_no_grouping(element: Tag, options: ChunkingOption
 
 def _split_definition_list(div_element: Tag, options: ChunkingOptions) -> list[str]:
     dl = div_element.find('dl')
-    if not dl: return _split_element_by_children(div_element, options)
+    if dl is None: return _split_element_by_children(div_element, options)
     chunks, current_chunk_pairs_html, current_tokens = [], [], 0
     pairs, children, i = [], list(dl.children), 0
     while i < len(children):
@@ -237,11 +237,11 @@ def _split_definition_list(div_element: Tag, options: ChunkingOptions) -> list[s
 def _split_table(table: Tag, options: ChunkingOptions) -> list[str]:
     chunks, header = [], table.find('thead')
     rows = table.find_all('tr')
-    header_rows_ids = set(id(r) for r in header.find_all('tr')) if header else set()
+    header_rows_ids = set(id(r) for r in header.find_all('tr')) if header is not None else set()
     body_rows = [row for row in rows if id(row) not in header_rows_ids]
     table_attrs = " ".join([f'{k}="{v}"' for k, v in table.attrs.items()])
     table_open, table_close = f"<table {table_attrs}>", "</table>"
-    header_html = str(header) if header else ""
+    header_html = str(header) if header is not None else ""
     base_tokens = count_html_tokens(table_open + header_html + table_close, options.count_tag_tokens)
     current_chunk_rows, current_tokens = [], base_tokens
     for row in body_rows:
@@ -285,7 +285,7 @@ def _split_list(list_element: Tag, options: ChunkingOptions) -> list[str]:
         if item_tokens + base_tokens > options.max_token_limit:
             if current_chunk_items: chunks.append(list_open + "".join(current_chunk_items) + list_close)
             item_soup = BeautifulSoup(item_html, 'html.parser').li
-            if item_soup:
+            if item_soup is not None:
                 sub_chunks = _split_element_by_children(item_soup, options)
                 for sub_chunk in sub_chunks: chunks.append(list_open + f"<li>{sub_chunk}</li>" + list_close)
             else: chunks.append(list_open + item_html + list_close)
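These chunker edits follow the commit-wide pattern: BeautifulSoup's find() returns None when nothing matches, and comparing against None directly is the unambiguous test for that, as PEP 8 recommends. A plain-Python illustration of why the two spellings can disagree:

    # Values where "if x:" and "if x is not None:" diverge:
    for x in (None, "", [], 0):
        print(repr(x), bool(x), x is not None)
    # None  False  False
    # ''    False  True
    # []    False  True
    # 0     False  True

Only None fails the is-not-None test, so the rewritten guards no longer conflate a missing value with an empty-but-present one.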

scripts/html_chunking/html-stripper.py
Lines changed: 1 addition & 1 deletion

@@ -48,7 +48,7 @@ def _aggressively_strip_tags_and_attributes(soup: BeautifulSoup, strip_links: bo
         state = rh_alert.get('state', 'note')
         content_div = rh_alert.find('div', slot=None) or rh_alert.find('p')
 
-        if content_div:
+        if content_div is not None:
             new_div = soup.new_tag('div')
             new_div['class'] = f'alert alert-{state}'
             new_div.extend(content_div.contents)

scripts/html_chunking/parser.py
Lines changed: 23 additions & 23 deletions

@@ -41,7 +41,7 @@ def add_child(self, child: 'HtmlSection') -> None:
 
     def get_heading_text(self) -> str:
         """Get the text of the heading for this section."""
-        if self.heading_tag:
+        if self.heading_tag is not None:
            try:
                return self.heading_tag.get_text(strip=True)
            except Exception:
@@ -56,7 +56,7 @@ def get_html(self) -> str:
 
         try:
             result = []
-            if self.heading_tag:
+            if self.heading_tag is not None:
                 result.append(str(self.heading_tag))
 
             for item in self.content:
@@ -69,7 +69,7 @@ def get_html(self) -> str:
             return self.html
         except Exception as e:
             # Fallback in case of error
-            if self.heading_tag:
+            if self.heading_tag is not None:
                 return str(self.heading_tag)
             return ""
 
@@ -97,7 +97,7 @@ def parse_html(html_content: str) -> Tuple[BeautifulSoup, HtmlSection]:
     # First pass: identify all headings
     all_headings = []
     for element in body.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
-        if element.name and re.match(r'h[1-6]$', element.name):
+        if element.name is not None and re.match(r'h[1-6]$', element.name):
             level = int(element.name[1])
             all_headings.append((element, level))
 
@@ -144,16 +144,16 @@ def parse_html(html_content: str) -> Tuple[BeautifulSoup, HtmlSection]:
         current_section = root_section
 
         for element in body.children:
-            if not element or (isinstance(element, str) and not element.strip()):
+            if element is None or (isinstance(element, str) and not element.strip()):
                 continue
 
             is_section_start = False
             new_level = None
 
-            if isinstance(element, Tag) and element.name and re.match(r'h[1-6]$', element.name):
+            if isinstance(element, Tag) and element.name is not None and re.match(r'h[1-6]$', element.name):
                 level = int(element.name[1])
                 for section in _flatten_sections(root_section):
-                    if section.heading_tag and section.heading_tag == element:
+                    if section.heading_tag is not None and section.heading_tag == element:
                         is_section_start = True
                         new_level = level
                         current_section = section
@@ -163,7 +163,7 @@ def parse_html(html_content: str) -> Tuple[BeautifulSoup, HtmlSection]:
                 current_section.add_content(element)
     else:
         for element in body.children:
-            if element:
+            if element is not None:
                 root_section.add_content(element)
 
     return soup, root_section
@@ -173,7 +173,7 @@ def parse_html(html_content: str) -> Tuple[BeautifulSoup, HtmlSection]:
         root_section = HtmlSection()
 
         for element in soup.children:
-            if element:
+            if element is not None:
                 root_section.add_content(element)
 
         return soup, root_section
@@ -261,15 +261,15 @@ def identify_procedure_sections(soup: BeautifulSoup) -> list[dict]:
     # Multiple ways to identify procedures
     procedure_markers = []
     for element in soup.find_all(string=lambda text: text and "Procedure" in text):
-        if element.parent and element.parent.name not in ('script', 'style'):
+        if element.parent is not None and element.parent.name not in ('script', 'style'):
             procedure_markers.append(element)
 
     ordered_lists = soup.find_all('ol')
 
     processed_lists = set()
 
     for marker in procedure_markers:
-        if not marker or not marker.parent:
+        if marker is None or marker.parent is None:
             continue
 
         ol = None
@@ -283,24 +283,24 @@ def identify_procedure_sections(soup: BeautifulSoup) -> list[dict]:
                 break
 
             next_sibling = current.find_next_sibling()
-            if next_sibling and next_sibling.name == 'ol':
+            if next_sibling is not None and next_sibling.name == 'ol':
                 ol = next_sibling
                 break
 
             ol_in_children = current.find('ol')
-            if ol_in_children:
+            if ol_in_children is not None:
                 ol = ol_in_children
                 break
 
             current = current.find_next()
 
-        if not ol or id(ol) in processed_lists:
+        if ol is None or id(ol) in processed_lists:
            continue
 
        heading = _find_closest_heading(marker.parent)
 
        intro = []
-        if heading:
+        if heading is not None:
            current = heading.find_next()
            while current and current != marker.parent and current != ol:
                if current.name not in ('script', 'style'):
@@ -342,7 +342,7 @@ def identify_procedure_sections(soup: BeautifulSoup) -> list[dict]:
 
         # Find introduction elements
         intro = []
-        if heading:
+        if heading is not None:
             current = heading.find_next()
             while current and current != ol:
                 if current.name not in ('script', 'style'):
@@ -364,7 +364,7 @@ def identify_procedure_sections(soup: BeautifulSoup) -> list[dict]:
                 break
 
         # Add to procedures if it looks like a procedure
-        if heading or marker or prerequisites:
+        if heading is not None or marker is not None or prerequisites is not None:
             procedures.append({
                 'heading': heading,
                 'intro': intro,
@@ -392,7 +392,7 @@ def _find_closest_heading(element: Tag) -> Optional[Tag]:
     Returns:
         The closest heading, or None if not found.
     """
-    if not element:
+    if element is None:
         return None
 
     # Check previous siblings
@@ -407,7 +407,7 @@ def _find_closest_heading(element: Tag) -> Optional[Tag]:
             return current
 
     # Check parent's previous siblings
-    if element.parent:
+    if element.parent is not None:
         return _find_closest_heading(element.parent)
 
     return None
@@ -443,7 +443,7 @@ def identify_code_blocks(soup: BeautifulSoup) -> list[dict]:
         processed_tags.add(id(pre))
 
         # Skip if this pre tag is inside a code tag that we'll process later
-        if pre.parent and pre.parent.name == 'code' and pre.parent in code_tags:
+        if pre.parent is not None and pre.parent.name == 'code' and pre.parent in code_tags:
             continue
 
         # Find the previous paragraph for context
@@ -467,7 +467,7 @@ def identify_code_blocks(soup: BeautifulSoup) -> list[dict]:
         processed_tags.add(id(code))
 
         # Skip if this code tag is inside a pre tag that we've already processed
-        if code.parent and code.parent.name == 'pre' and id(code.parent) in processed_tags:
+        if code.parent is not None and code.parent.name == 'pre' and id(code.parent) in processed_tags:
             continue
 
         # Find the previous paragraph for context
@@ -510,7 +510,7 @@ def identify_tables(soup: BeautifulSoup) -> list[dict]:
         if tag.name == 'rh-table':
             # Look for nested table
             nested_table = tag.find('table')
-            if nested_table:
+            if nested_table is not None:
                 expanded_tables.append(nested_table)
         else:
             expanded_tables.append(tag)
@@ -534,7 +534,7 @@ def identify_tables(soup: BeautifulSoup) -> list[dict]:
         rows = []
         try:
             # Get rows not in header
-            if header:
+            if header is not None:
                 header_rows = set(id(row) for row in header.find_all('tr'))
                 all_rows = table.find_all('tr', limit=MAX_TABLE_ROWS)
                 rows = [row for row in all_rows if id(row) not in header_rows]
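One behavioral nuance in the @@ -364 hunk: assuming prerequisites is a list, as the surrounding code suggests, the old truthy test rejected an empty list while the new comparison accepts one, so the rewritten condition can classify more sections as procedures:

    prerequisites = []             # hypothetical: a procedure with no prerequisites found
    bool(prerequisites)            # False -> the old truthy test skipped this
    prerequisites is not None      # True  -> the new test accepts it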

scripts/html_chunking/tokenizer.py
Lines changed: 1 addition & 1 deletion

@@ -93,7 +93,7 @@ def count_tokens(self, text: str) -> int:
             tokens = self.tokenizer(text)
             return len(tokens)
         except Exception as e:
-            if "sequence length is longer than the specified maximum" in str(e) and self.hf_tokenizer:
+            if "sequence length is longer than the specified maximum" in str(e) and self.hf_tokenizer is not None:
                 warnings.warn(f"Token counting using full text failed: {e}. Using manual chunking approach.")
                 words = re.findall(r'\b\w+\b|[^\w\s]', text)
                 chunks = [' '.join(words[i:i+WORDS_PER_BATCH]) for i in range(0, len(words), WORDS_PER_BATCH)]
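The guarded fallback splits the text into word batches before tokenizing, so no single call exceeds the model's maximum sequence length. A rough sketch of the idea, with a hypothetical WORDS_PER_BATCH value and a generic tokenize callable (the real counting loop continues past the end of this hunk):

    import re

    WORDS_PER_BATCH = 300  # hypothetical; the real constant is defined in tokenizer.py

    def count_tokens_batched(text: str, tokenize) -> int:
        # Tokenize in word-sized batches and sum the per-batch counts.
        words = re.findall(r'\b\w+\b|[^\w\s]', text)
        return sum(
            len(tokenize(' '.join(words[i:i + WORDS_PER_BATCH])))
            for i in range(0, len(words), WORDS_PER_BATCH)
        )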

scripts/html_embeddings/chunk_html.py
Lines changed: 1 addition & 1 deletion

@@ -88,7 +88,7 @@ def chunk_html_documents(
         doc_specific_output_dir = output_dir / doc_name
 
         # Construct the source URL, which will be passed to the chunker.
-        if doc_url:
+        if doc_url is not None:
             source_url = doc_url
         else:
             source_url = f"https://docs.redhat.com/en/documentation/{product_slug}/{product_version}/html-single/{doc_name}/"

scripts/html_embeddings/download_docs.py
Lines changed: 1 addition & 1 deletion

@@ -49,7 +49,7 @@ def download_documentation(
     """
     logger = logging.getLogger(__name__)
 
-    if doc_url:
+    if doc_url is not None:
         base_url = doc_url
     elif specific_doc:
         base_url = f"https://docs.redhat.com/en/documentation/{product_slug}/{version}/html-single/{specific_doc}"

scripts/html_embeddings/strip_html.py
Lines changed: 1 addition & 1 deletion

@@ -47,7 +47,7 @@ def strip_html_content(
     # This logic is now more direct to avoid path ambiguities.
     # It iterates through found files and constructs a precise output path.
     for input_file in html_files:
-        if exclusion_list and str(input_file) in exclusion_list:
+        if exclusion_list is not None and str(input_file) in exclusion_list:
             logger.debug("Skipping excluded file: %s", input_file)
             continue
 

scripts/html_embeddings/utils.py
Lines changed: 6 additions & 6 deletions

@@ -89,18 +89,18 @@ def create_directory_structure(
         product_version = ""
 
     product['slug'] = product_slug
-    product['version'] = product_version if product_version else "latest"
+    product['version'] = product_version if product_version is not None else "latest"
 
     cache_path = Path(cache_dir)
     output_path = Path(output_dir)
 
-    base_cache_path = cache_path / product_slug / product_version if product_version else cache_path / product_slug
+    base_cache_path = cache_path / product_slug / product_version if product_version is not None else cache_path / product_slug
 
     downloads_dir = base_cache_path / "downloads"
     stripped_dir = base_cache_path / "stripped"
     chunks_dir = base_cache_path / "chunks"
 
-    if specific_doc:
+    if specific_doc is not None:
         downloads_dir = downloads_dir / specific_doc
         stripped_dir = stripped_dir / specific_doc
 
@@ -169,19 +169,19 @@ def get_cache_info(
 ) -> dict[str, int]:
     """Get information about cached files."""
     base_path = cache_dir / "downloads" / version
-    if specific_doc:
+    if specific_doc is not None:
         base_path = base_path / specific_doc
 
     downloads_count = get_file_count(base_path, "*.html")
 
     base_path = cache_dir / "stripped" / version
-    if specific_doc:
+    if specific_doc is not None:
         base_path = base_path / specific_doc
 
     stripped_count = get_file_count(base_path, "*.html")
 
     base_path = cache_dir / "chunks" / version
-    if specific_doc:
+    if specific_doc is not None:
         base_path = base_path / specific_doc
 
     chunks_count = get_file_count(base_path, "*.json")
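Worth double-checking in the @@ -89 hunk: the context line above the first change sets product_version = "" on at least one path, and an empty string is not None, so the rewritten conditionals treat it as a real version and product['version'] becomes "" rather than "latest". In isolation:

    product_version = ""
    old = product_version if product_version else "latest"              # 'latest'
    new = product_version if product_version is not None else "latest"  # ''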
