(text-splitters): Small Fix in _process_html for HTMLSemanticPreservingSplitter to properly extract the metadata. (#29215)

keenborder786 · web-flow · commit 288613d36180 · 2025-01-15T10:18:06.000-05:00
- **Description:** Include `main` in the list of elements whose child elements need to be processed when splitting the HTML.
- **Issue:** #29184
diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py
@@ -696,7 +696,7 @@ def _process_element(
             placeholder_count: int,
         ) -> Tuple[List[Document], Dict[str, str], List[str], Dict[str, str], int]:
             for elem in element:
-                if elem.name.lower() in ["html", "body", "div"]:
+                if elem.name.lower() in ["html", "body", "div", "main"]:
                     children = elem.find_all(recursive=False)
                     (
                         documents,

Original file line number	Diff line number	Diff line change
`@@ -696,7 +696,7 @@ def _process_element(`
`696`	`696`	`placeholder_count: int,`
`697`	`697`	`) -> Tuple[List[Document], Dict[str, str], List[str], Dict[str, str], int]:`
`698`	`698`	`for elem in element:`
`699`		`- if elem.name.lower() in ["html", "body", "div"]:`
	`699`	`+ if elem.name.lower() in ["html", "body", "div", "main"]:`
`700`	`700`	`children = elem.find_all(recursive=False)`
`701`	`701`	`(`
`702`	`702`	`documents,`