test(text-splitters): capture beta warnings (#33113)

mdrxy · web-flow · commit e3efd1e891b8 · 2025-09-25T01:30:20.000-04:00
diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
@@ -9,6 +9,7 @@
 
 import pytest
 from bs4 import Tag
+from langchain_core._api import suppress_langchain_beta_warning
 from langchain_core.documents import Document
 
 from langchain_text_splitters import (
@@ -3280,11 +3281,12 @@ def test_html_splitter_with_custom_extractor() -> None:
     <p>This is an iframe:</p>
     <iframe src="http://example.com"></iframe>
     """
-    splitter = HTMLSemanticPreservingSplitter(
-        headers_to_split_on=[("h1", "Header 1")],
-        custom_handlers={"iframe": custom_iframe_extractor},
-        max_chunk_size=1000,
-    )
+    with suppress_langchain_beta_warning():
+        splitter = HTMLSemanticPreservingSplitter(
+            headers_to_split_on=[("h1", "Header 1")],
+            custom_handlers={"iframe": custom_iframe_extractor},
+            max_chunk_size=1000,
+        )
     documents = splitter.split_text(html_content)
 
     expected = [
@@ -3305,11 +3307,12 @@ def test_html_splitter_with_href_links() -> None:
     <h1>Section 1</h1>
     <p>This is a link to <a href="http://example.com">example.com</a></p>
     """
-    splitter = HTMLSemanticPreservingSplitter(
-        headers_to_split_on=[("h1", "Header 1")],
-        preserve_links=True,
-        max_chunk_size=1000,
-    )
+    with suppress_langchain_beta_warning():
+        splitter = HTMLSemanticPreservingSplitter(
+            headers_to_split_on=[("h1", "Header 1")],
+            preserve_links=True,
+            max_chunk_size=1000,
+        )
     documents = splitter.split_text(html_content)
 
     expected = [
@@ -3334,9 +3337,10 @@ def test_html_splitter_with_nested_elements() -> None:
         </div>
     </div>
     """
-    splitter = HTMLSemanticPreservingSplitter(
-        headers_to_split_on=[("h1", "Header 1")], max_chunk_size=1000
-    )
+    with suppress_langchain_beta_warning():
+        splitter = HTMLSemanticPreservingSplitter(
+            headers_to_split_on=[("h1", "Header 1")], max_chunk_size=1000
+        )
     documents = splitter.split_text(html_content)
 
     expected = [
@@ -3367,11 +3371,12 @@ def test_html_splitter_with_preserved_elements() -> None:
         <li>Item 2</li>
     </ul>
     """
-    splitter = HTMLSemanticPreservingSplitter(
-        headers_to_split_on=[("h1", "Header 1")],
-        elements_to_preserve=["table", "ul"],
-        max_chunk_size=50,  # Deliberately low to test preservation
-    )
+    with suppress_langchain_beta_warning():
+        splitter = HTMLSemanticPreservingSplitter(
+            headers_to_split_on=[("h1", "Header 1")],
+            elements_to_preserve=["table", "ul"],
+            max_chunk_size=50,  # Deliberately low to test preservation
+        )
     documents = splitter.split_text(html_content)
 
     expected = [
@@ -3393,9 +3398,10 @@ def test_html_splitter_with_no_further_splits() -> None:
     <h1>Section 2</h1>
     <p>More content here.</p>
     """
-    splitter = HTMLSemanticPreservingSplitter(
-        headers_to_split_on=[("h1", "Header 1")], max_chunk_size=1000
-    )
+    with suppress_langchain_beta_warning():
+        splitter = HTMLSemanticPreservingSplitter(
+            headers_to_split_on=[("h1", "Header 1")], max_chunk_size=1000
+        )
     documents = splitter.split_text(html_content)
 
     expected = [
@@ -3414,9 +3420,10 @@ def test_html_splitter_with_small_chunk_size() -> None:
     <p>This is some long text that should be split into multiple chunks due to the
     small chunk size.</p>
     """
-    splitter = HTMLSemanticPreservingSplitter(
-        headers_to_split_on=[("h1", "Header 1")], max_chunk_size=20, chunk_overlap=5
-    )
+    with suppress_langchain_beta_warning():
+        splitter = HTMLSemanticPreservingSplitter(
+            headers_to_split_on=[("h1", "Header 1")], max_chunk_size=20, chunk_overlap=5
+        )
     documents = splitter.split_text(html_content)
 
     expected = [
@@ -3441,11 +3448,12 @@ def test_html_splitter_with_denylist_tags() -> None:
     <p>This paragraph should be kept.</p>
     <span>This span should be removed.</span>
     """
-    splitter = HTMLSemanticPreservingSplitter(
-        headers_to_split_on=[("h1", "Header 1")],
-        denylist_tags=["span"],
-        max_chunk_size=1000,
-    )
+    with suppress_langchain_beta_warning():
+        splitter = HTMLSemanticPreservingSplitter(
+            headers_to_split_on=[("h1", "Header 1")],
+            denylist_tags=["span"],
+            max_chunk_size=1000,
+        )
     documents = splitter.split_text(html_content)
 
     expected = [
@@ -3465,11 +3473,12 @@ def test_html_splitter_with_external_metadata() -> None:
     <h1>Section 1</h1>
     <p>This is some content.</p>
     """
-    splitter = HTMLSemanticPreservingSplitter(
-        headers_to_split_on=[("h1", "Header 1")],
-        external_metadata={"source": "example.com"},
-        max_chunk_size=1000,
-    )
+    with suppress_langchain_beta_warning():
+        splitter = HTMLSemanticPreservingSplitter(
+            headers_to_split_on=[("h1", "Header 1")],
+            external_metadata={"source": "example.com"},
+            max_chunk_size=1000,
+        )
     documents = splitter.split_text(html_content)
 
     expected = [
@@ -3489,11 +3498,12 @@ def test_html_splitter_with_text_normalization() -> None:
     <h1>Section 1</h1>
     <p>This is some TEXT that should be normalized!</p>
     """
-    splitter = HTMLSemanticPreservingSplitter(
-        headers_to_split_on=[("h1", "Header 1")],
-        normalize_text=True,
-        max_chunk_size=1000,
-    )
+    with suppress_langchain_beta_warning():
+        splitter = HTMLSemanticPreservingSplitter(
+            headers_to_split_on=[("h1", "Header 1")],
+            normalize_text=True,
+            max_chunk_size=1000,
+        )
     documents = splitter.split_text(html_content)
 
     expected = [
@@ -3515,11 +3525,12 @@ def test_html_splitter_with_allowlist_tags() -> None:
     <span>This span should be kept.</span>
     <div>This div should be removed.</div>
     """
-    splitter = HTMLSemanticPreservingSplitter(
-        headers_to_split_on=[("h1", "Header 1")],
-        allowlist_tags=["p", "span"],
-        max_chunk_size=1000,
-    )
+    with suppress_langchain_beta_warning():
+        splitter = HTMLSemanticPreservingSplitter(
+            headers_to_split_on=[("h1", "Header 1")],
+            allowlist_tags=["p", "span"],
+            max_chunk_size=1000,
+        )
     documents = splitter.split_text(html_content)
 
     expected = [
@@ -3548,12 +3559,13 @@ def test_html_splitter_with_mixed_preserve_and_filter() -> None:
     <p>This paragraph should be kept.</p>
     <span>This span should be removed.</span>
     """
-    splitter = HTMLSemanticPreservingSplitter(
-        headers_to_split_on=[("h1", "Header 1")],
-        elements_to_preserve=["table"],
-        denylist_tags=["span"],
-        max_chunk_size=1000,
-    )
+    with suppress_langchain_beta_warning():
+        splitter = HTMLSemanticPreservingSplitter(
+            headers_to_split_on=[("h1", "Header 1")],
+            elements_to_preserve=["table"],
+            denylist_tags=["span"],
+            max_chunk_size=1000,
+        )
     documents = splitter.split_text(html_content)
 
     expected = [
@@ -3574,10 +3586,11 @@ def test_html_splitter_with_no_headers() -> None:
     <p>This is content without any headers.</p>
     <p>It should still produce a valid document.</p>
     """
-    splitter = HTMLSemanticPreservingSplitter(
-        headers_to_split_on=[],
-        max_chunk_size=1000,
-    )
+    with suppress_langchain_beta_warning():
+        splitter = HTMLSemanticPreservingSplitter(
+            headers_to_split_on=[],
+            max_chunk_size=1000,
+        )
     documents = splitter.split_text(html_content)
 
     expected = [
@@ -3607,13 +3620,14 @@ def test_html_splitter_with_media_preservation() -> None:
     <p>This is audio:</p>
     <audio src="http://example.com/audio.mp3"></audio>
     """
-    splitter = HTMLSemanticPreservingSplitter(
-        headers_to_split_on=[("h1", "Header 1")],
-        preserve_images=True,
-        preserve_videos=True,
-        preserve_audio=True,
-        max_chunk_size=1000,
-    )
+    with suppress_langchain_beta_warning():
+        splitter = HTMLSemanticPreservingSplitter(
+            headers_to_split_on=[("h1", "Header 1")],
+            preserve_images=True,
+            preserve_videos=True,
+            preserve_audio=True,
+            max_chunk_size=1000,
+        )
     documents = splitter.split_text(html_content)
 
     expected = [
@@ -3638,12 +3652,13 @@ def test_html_splitter_keep_separator_true() -> None:
     <h1>Section 1</h1>
     <p>This is some text. This is some other text.</p>
     """
-    splitter = HTMLSemanticPreservingSplitter(
-        headers_to_split_on=[("h1", "Header 1")],
-        max_chunk_size=10,
-        separators=[". "],
-        keep_separator=True,
-    )
+    with suppress_langchain_beta_warning():
+        splitter = HTMLSemanticPreservingSplitter(
+            headers_to_split_on=[("h1", "Header 1")],
+            max_chunk_size=10,
+            separators=[". "],
+            keep_separator=True,
+        )
     documents = splitter.split_text(html_content)
 
     expected = [
@@ -3667,12 +3682,13 @@ def test_html_splitter_keep_separator_false() -> None:
     <h1>Section 1</h1>
     <p>This is some text. This is some other text.</p>
     """
-    splitter = HTMLSemanticPreservingSplitter(
-        headers_to_split_on=[("h1", "Header 1")],
-        max_chunk_size=10,
-        separators=[". "],
-        keep_separator=False,
-    )
+    with suppress_langchain_beta_warning():
+        splitter = HTMLSemanticPreservingSplitter(
+            headers_to_split_on=[("h1", "Header 1")],
+            max_chunk_size=10,
+            separators=[". "],
+            keep_separator=False,
+        )
     documents = splitter.split_text(html_content)
 
     expected = [
@@ -3696,12 +3712,13 @@ def test_html_splitter_keep_separator_start() -> None:
     <h1>Section 1</h1>
     <p>This is some text. This is some other text.</p>
     """
-    splitter = HTMLSemanticPreservingSplitter(
-        headers_to_split_on=[("h1", "Header 1")],
-        max_chunk_size=10,
-        separators=[". "],
-        keep_separator="start",
-    )
+    with suppress_langchain_beta_warning():
+        splitter = HTMLSemanticPreservingSplitter(
+            headers_to_split_on=[("h1", "Header 1")],
+            max_chunk_size=10,
+            separators=[". "],
+            keep_separator="start",
+        )
     documents = splitter.split_text(html_content)
 
     expected = [
@@ -3725,12 +3742,13 @@ def test_html_splitter_keep_separator_end() -> None:
     <h1>Section 1</h1>
     <p>This is some text. This is some other text.</p>
     """
-    splitter = HTMLSemanticPreservingSplitter(
-        headers_to_split_on=[("h1", "Header 1")],
-        max_chunk_size=10,
-        separators=[". "],
-        keep_separator="end",
-    )
+    with suppress_langchain_beta_warning():
+        splitter = HTMLSemanticPreservingSplitter(
+            headers_to_split_on=[("h1", "Header 1")],
+            max_chunk_size=10,
+            separators=[". "],
+            keep_separator="end",
+        )
     documents = splitter.split_text(html_content)
 
     expected = [
@@ -3754,9 +3772,12 @@ def test_html_splitter_keep_separator_default() -> None:
     <h1>Section 1</h1>
     <p>This is some text. This is some other text.</p>
     """
-    splitter = HTMLSemanticPreservingSplitter(
-        headers_to_split_on=[("h1", "Header 1")], max_chunk_size=10, separators=[". "]
-    )
+    with suppress_langchain_beta_warning():
+        splitter = HTMLSemanticPreservingSplitter(
+            headers_to_split_on=[("h1", "Header 1")],
+            max_chunk_size=10,
+            separators=[". "],
+        )
     documents = splitter.split_text(html_content)
 
     expected = [