Skip to content

Commit e3efd1e

Browse files
authored
test(text-splitters): capture beta warnings (#33113)
1 parent d6769cf commit e3efd1e

File tree

1 file changed: 109 additions and 88 deletions.

libs/text-splitters/tests/unit_tests/test_text_splitters.py

Lines changed: 109 additions & 88 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99

1010
import pytest
1111
from bs4 import Tag
12+
from langchain_core._api import suppress_langchain_beta_warning
1213
from langchain_core.documents import Document
1314

1415
from langchain_text_splitters import (
@@ -3280,11 +3281,12 @@ def test_html_splitter_with_custom_extractor() -> None:
32803281
<p>This is an iframe:</p>
32813282
<iframe src="http://example.com"></iframe>
32823283
"""
3283-
splitter = HTMLSemanticPreservingSplitter(
3284-
headers_to_split_on=[("h1", "Header 1")],
3285-
custom_handlers={"iframe": custom_iframe_extractor},
3286-
max_chunk_size=1000,
3287-
)
3284+
with suppress_langchain_beta_warning():
3285+
splitter = HTMLSemanticPreservingSplitter(
3286+
headers_to_split_on=[("h1", "Header 1")],
3287+
custom_handlers={"iframe": custom_iframe_extractor},
3288+
max_chunk_size=1000,
3289+
)
32883290
documents = splitter.split_text(html_content)
32893291

32903292
expected = [
@@ -3305,11 +3307,12 @@ def test_html_splitter_with_href_links() -> None:
33053307
<h1>Section 1</h1>
33063308
<p>This is a link to <a href="http://example.com">example.com</a></p>
33073309
"""
3308-
splitter = HTMLSemanticPreservingSplitter(
3309-
headers_to_split_on=[("h1", "Header 1")],
3310-
preserve_links=True,
3311-
max_chunk_size=1000,
3312-
)
3310+
with suppress_langchain_beta_warning():
3311+
splitter = HTMLSemanticPreservingSplitter(
3312+
headers_to_split_on=[("h1", "Header 1")],
3313+
preserve_links=True,
3314+
max_chunk_size=1000,
3315+
)
33133316
documents = splitter.split_text(html_content)
33143317

33153318
expected = [
@@ -3334,9 +3337,10 @@ def test_html_splitter_with_nested_elements() -> None:
33343337
</div>
33353338
</div>
33363339
"""
3337-
splitter = HTMLSemanticPreservingSplitter(
3338-
headers_to_split_on=[("h1", "Header 1")], max_chunk_size=1000
3339-
)
3340+
with suppress_langchain_beta_warning():
3341+
splitter = HTMLSemanticPreservingSplitter(
3342+
headers_to_split_on=[("h1", "Header 1")], max_chunk_size=1000
3343+
)
33403344
documents = splitter.split_text(html_content)
33413345

33423346
expected = [
@@ -3367,11 +3371,12 @@ def test_html_splitter_with_preserved_elements() -> None:
33673371
<li>Item 2</li>
33683372
</ul>
33693373
"""
3370-
splitter = HTMLSemanticPreservingSplitter(
3371-
headers_to_split_on=[("h1", "Header 1")],
3372-
elements_to_preserve=["table", "ul"],
3373-
max_chunk_size=50, # Deliberately low to test preservation
3374-
)
3374+
with suppress_langchain_beta_warning():
3375+
splitter = HTMLSemanticPreservingSplitter(
3376+
headers_to_split_on=[("h1", "Header 1")],
3377+
elements_to_preserve=["table", "ul"],
3378+
max_chunk_size=50, # Deliberately low to test preservation
3379+
)
33753380
documents = splitter.split_text(html_content)
33763381

33773382
expected = [
@@ -3393,9 +3398,10 @@ def test_html_splitter_with_no_further_splits() -> None:
33933398
<h1>Section 2</h1>
33943399
<p>More content here.</p>
33953400
"""
3396-
splitter = HTMLSemanticPreservingSplitter(
3397-
headers_to_split_on=[("h1", "Header 1")], max_chunk_size=1000
3398-
)
3401+
with suppress_langchain_beta_warning():
3402+
splitter = HTMLSemanticPreservingSplitter(
3403+
headers_to_split_on=[("h1", "Header 1")], max_chunk_size=1000
3404+
)
33993405
documents = splitter.split_text(html_content)
34003406

34013407
expected = [
@@ -3414,9 +3420,10 @@ def test_html_splitter_with_small_chunk_size() -> None:
34143420
<p>This is some long text that should be split into multiple chunks due to the
34153421
small chunk size.</p>
34163422
"""
3417-
splitter = HTMLSemanticPreservingSplitter(
3418-
headers_to_split_on=[("h1", "Header 1")], max_chunk_size=20, chunk_overlap=5
3419-
)
3423+
with suppress_langchain_beta_warning():
3424+
splitter = HTMLSemanticPreservingSplitter(
3425+
headers_to_split_on=[("h1", "Header 1")], max_chunk_size=20, chunk_overlap=5
3426+
)
34203427
documents = splitter.split_text(html_content)
34213428

34223429
expected = [
@@ -3441,11 +3448,12 @@ def test_html_splitter_with_denylist_tags() -> None:
34413448
<p>This paragraph should be kept.</p>
34423449
<span>This span should be removed.</span>
34433450
"""
3444-
splitter = HTMLSemanticPreservingSplitter(
3445-
headers_to_split_on=[("h1", "Header 1")],
3446-
denylist_tags=["span"],
3447-
max_chunk_size=1000,
3448-
)
3451+
with suppress_langchain_beta_warning():
3452+
splitter = HTMLSemanticPreservingSplitter(
3453+
headers_to_split_on=[("h1", "Header 1")],
3454+
denylist_tags=["span"],
3455+
max_chunk_size=1000,
3456+
)
34493457
documents = splitter.split_text(html_content)
34503458

34513459
expected = [
@@ -3465,11 +3473,12 @@ def test_html_splitter_with_external_metadata() -> None:
34653473
<h1>Section 1</h1>
34663474
<p>This is some content.</p>
34673475
"""
3468-
splitter = HTMLSemanticPreservingSplitter(
3469-
headers_to_split_on=[("h1", "Header 1")],
3470-
external_metadata={"source": "example.com"},
3471-
max_chunk_size=1000,
3472-
)
3476+
with suppress_langchain_beta_warning():
3477+
splitter = HTMLSemanticPreservingSplitter(
3478+
headers_to_split_on=[("h1", "Header 1")],
3479+
external_metadata={"source": "example.com"},
3480+
max_chunk_size=1000,
3481+
)
34733482
documents = splitter.split_text(html_content)
34743483

34753484
expected = [
@@ -3489,11 +3498,12 @@ def test_html_splitter_with_text_normalization() -> None:
34893498
<h1>Section 1</h1>
34903499
<p>This is some TEXT that should be normalized!</p>
34913500
"""
3492-
splitter = HTMLSemanticPreservingSplitter(
3493-
headers_to_split_on=[("h1", "Header 1")],
3494-
normalize_text=True,
3495-
max_chunk_size=1000,
3496-
)
3501+
with suppress_langchain_beta_warning():
3502+
splitter = HTMLSemanticPreservingSplitter(
3503+
headers_to_split_on=[("h1", "Header 1")],
3504+
normalize_text=True,
3505+
max_chunk_size=1000,
3506+
)
34973507
documents = splitter.split_text(html_content)
34983508

34993509
expected = [
@@ -3515,11 +3525,12 @@ def test_html_splitter_with_allowlist_tags() -> None:
35153525
<span>This span should be kept.</span>
35163526
<div>This div should be removed.</div>
35173527
"""
3518-
splitter = HTMLSemanticPreservingSplitter(
3519-
headers_to_split_on=[("h1", "Header 1")],
3520-
allowlist_tags=["p", "span"],
3521-
max_chunk_size=1000,
3522-
)
3528+
with suppress_langchain_beta_warning():
3529+
splitter = HTMLSemanticPreservingSplitter(
3530+
headers_to_split_on=[("h1", "Header 1")],
3531+
allowlist_tags=["p", "span"],
3532+
max_chunk_size=1000,
3533+
)
35233534
documents = splitter.split_text(html_content)
35243535

35253536
expected = [
@@ -3548,12 +3559,13 @@ def test_html_splitter_with_mixed_preserve_and_filter() -> None:
35483559
<p>This paragraph should be kept.</p>
35493560
<span>This span should be removed.</span>
35503561
"""
3551-
splitter = HTMLSemanticPreservingSplitter(
3552-
headers_to_split_on=[("h1", "Header 1")],
3553-
elements_to_preserve=["table"],
3554-
denylist_tags=["span"],
3555-
max_chunk_size=1000,
3556-
)
3562+
with suppress_langchain_beta_warning():
3563+
splitter = HTMLSemanticPreservingSplitter(
3564+
headers_to_split_on=[("h1", "Header 1")],
3565+
elements_to_preserve=["table"],
3566+
denylist_tags=["span"],
3567+
max_chunk_size=1000,
3568+
)
35573569
documents = splitter.split_text(html_content)
35583570

35593571
expected = [
@@ -3574,10 +3586,11 @@ def test_html_splitter_with_no_headers() -> None:
35743586
<p>This is content without any headers.</p>
35753587
<p>It should still produce a valid document.</p>
35763588
"""
3577-
splitter = HTMLSemanticPreservingSplitter(
3578-
headers_to_split_on=[],
3579-
max_chunk_size=1000,
3580-
)
3589+
with suppress_langchain_beta_warning():
3590+
splitter = HTMLSemanticPreservingSplitter(
3591+
headers_to_split_on=[],
3592+
max_chunk_size=1000,
3593+
)
35813594
documents = splitter.split_text(html_content)
35823595

35833596
expected = [
@@ -3607,13 +3620,14 @@ def test_html_splitter_with_media_preservation() -> None:
36073620
<p>This is audio:</p>
36083621
<audio src="http://example.com/audio.mp3"></audio>
36093622
"""
3610-
splitter = HTMLSemanticPreservingSplitter(
3611-
headers_to_split_on=[("h1", "Header 1")],
3612-
preserve_images=True,
3613-
preserve_videos=True,
3614-
preserve_audio=True,
3615-
max_chunk_size=1000,
3616-
)
3623+
with suppress_langchain_beta_warning():
3624+
splitter = HTMLSemanticPreservingSplitter(
3625+
headers_to_split_on=[("h1", "Header 1")],
3626+
preserve_images=True,
3627+
preserve_videos=True,
3628+
preserve_audio=True,
3629+
max_chunk_size=1000,
3630+
)
36173631
documents = splitter.split_text(html_content)
36183632

36193633
expected = [
@@ -3638,12 +3652,13 @@ def test_html_splitter_keep_separator_true() -> None:
36383652
<h1>Section 1</h1>
36393653
<p>This is some text. This is some other text.</p>
36403654
"""
3641-
splitter = HTMLSemanticPreservingSplitter(
3642-
headers_to_split_on=[("h1", "Header 1")],
3643-
max_chunk_size=10,
3644-
separators=[". "],
3645-
keep_separator=True,
3646-
)
3655+
with suppress_langchain_beta_warning():
3656+
splitter = HTMLSemanticPreservingSplitter(
3657+
headers_to_split_on=[("h1", "Header 1")],
3658+
max_chunk_size=10,
3659+
separators=[". "],
3660+
keep_separator=True,
3661+
)
36473662
documents = splitter.split_text(html_content)
36483663

36493664
expected = [
@@ -3667,12 +3682,13 @@ def test_html_splitter_keep_separator_false() -> None:
36673682
<h1>Section 1</h1>
36683683
<p>This is some text. This is some other text.</p>
36693684
"""
3670-
splitter = HTMLSemanticPreservingSplitter(
3671-
headers_to_split_on=[("h1", "Header 1")],
3672-
max_chunk_size=10,
3673-
separators=[". "],
3674-
keep_separator=False,
3675-
)
3685+
with suppress_langchain_beta_warning():
3686+
splitter = HTMLSemanticPreservingSplitter(
3687+
headers_to_split_on=[("h1", "Header 1")],
3688+
max_chunk_size=10,
3689+
separators=[". "],
3690+
keep_separator=False,
3691+
)
36763692
documents = splitter.split_text(html_content)
36773693

36783694
expected = [
@@ -3696,12 +3712,13 @@ def test_html_splitter_keep_separator_start() -> None:
36963712
<h1>Section 1</h1>
36973713
<p>This is some text. This is some other text.</p>
36983714
"""
3699-
splitter = HTMLSemanticPreservingSplitter(
3700-
headers_to_split_on=[("h1", "Header 1")],
3701-
max_chunk_size=10,
3702-
separators=[". "],
3703-
keep_separator="start",
3704-
)
3715+
with suppress_langchain_beta_warning():
3716+
splitter = HTMLSemanticPreservingSplitter(
3717+
headers_to_split_on=[("h1", "Header 1")],
3718+
max_chunk_size=10,
3719+
separators=[". "],
3720+
keep_separator="start",
3721+
)
37053722
documents = splitter.split_text(html_content)
37063723

37073724
expected = [
@@ -3725,12 +3742,13 @@ def test_html_splitter_keep_separator_end() -> None:
37253742
<h1>Section 1</h1>
37263743
<p>This is some text. This is some other text.</p>
37273744
"""
3728-
splitter = HTMLSemanticPreservingSplitter(
3729-
headers_to_split_on=[("h1", "Header 1")],
3730-
max_chunk_size=10,
3731-
separators=[". "],
3732-
keep_separator="end",
3733-
)
3745+
with suppress_langchain_beta_warning():
3746+
splitter = HTMLSemanticPreservingSplitter(
3747+
headers_to_split_on=[("h1", "Header 1")],
3748+
max_chunk_size=10,
3749+
separators=[". "],
3750+
keep_separator="end",
3751+
)
37343752
documents = splitter.split_text(html_content)
37353753

37363754
expected = [
@@ -3754,9 +3772,12 @@ def test_html_splitter_keep_separator_default() -> None:
37543772
<h1>Section 1</h1>
37553773
<p>This is some text. This is some other text.</p>
37563774
"""
3757-
splitter = HTMLSemanticPreservingSplitter(
3758-
headers_to_split_on=[("h1", "Header 1")], max_chunk_size=10, separators=[". "]
3759-
)
3775+
with suppress_langchain_beta_warning():
3776+
splitter = HTMLSemanticPreservingSplitter(
3777+
headers_to_split_on=[("h1", "Header 1")],
3778+
max_chunk_size=10,
3779+
separators=[". "],
3780+
)
37603781
documents = splitter.split_text(html_content)
37613782

37623783
expected = [

0 commit comments

Comments
 (0)