9
9
10
10
import pytest
11
11
from bs4 import Tag
12
+ from langchain_core ._api import suppress_langchain_beta_warning
12
13
from langchain_core .documents import Document
13
14
14
15
from langchain_text_splitters import (
@@ -3280,11 +3281,12 @@ def test_html_splitter_with_custom_extractor() -> None:
3280
3281
<p>This is an iframe:</p>
3281
3282
<iframe src="http://example.com"></iframe>
3282
3283
"""
3283
- splitter = HTMLSemanticPreservingSplitter (
3284
- headers_to_split_on = [("h1" , "Header 1" )],
3285
- custom_handlers = {"iframe" : custom_iframe_extractor },
3286
- max_chunk_size = 1000 ,
3287
- )
3284
+ with suppress_langchain_beta_warning ():
3285
+ splitter = HTMLSemanticPreservingSplitter (
3286
+ headers_to_split_on = [("h1" , "Header 1" )],
3287
+ custom_handlers = {"iframe" : custom_iframe_extractor },
3288
+ max_chunk_size = 1000 ,
3289
+ )
3288
3290
documents = splitter .split_text (html_content )
3289
3291
3290
3292
expected = [
@@ -3305,11 +3307,12 @@ def test_html_splitter_with_href_links() -> None:
3305
3307
<h1>Section 1</h1>
3306
3308
<p>This is a link to <a href="http://example.com">example.com</a></p>
3307
3309
"""
3308
- splitter = HTMLSemanticPreservingSplitter (
3309
- headers_to_split_on = [("h1" , "Header 1" )],
3310
- preserve_links = True ,
3311
- max_chunk_size = 1000 ,
3312
- )
3310
+ with suppress_langchain_beta_warning ():
3311
+ splitter = HTMLSemanticPreservingSplitter (
3312
+ headers_to_split_on = [("h1" , "Header 1" )],
3313
+ preserve_links = True ,
3314
+ max_chunk_size = 1000 ,
3315
+ )
3313
3316
documents = splitter .split_text (html_content )
3314
3317
3315
3318
expected = [
@@ -3334,9 +3337,10 @@ def test_html_splitter_with_nested_elements() -> None:
3334
3337
</div>
3335
3338
</div>
3336
3339
"""
3337
- splitter = HTMLSemanticPreservingSplitter (
3338
- headers_to_split_on = [("h1" , "Header 1" )], max_chunk_size = 1000
3339
- )
3340
+ with suppress_langchain_beta_warning ():
3341
+ splitter = HTMLSemanticPreservingSplitter (
3342
+ headers_to_split_on = [("h1" , "Header 1" )], max_chunk_size = 1000
3343
+ )
3340
3344
documents = splitter .split_text (html_content )
3341
3345
3342
3346
expected = [
@@ -3367,11 +3371,12 @@ def test_html_splitter_with_preserved_elements() -> None:
3367
3371
<li>Item 2</li>
3368
3372
</ul>
3369
3373
"""
3370
- splitter = HTMLSemanticPreservingSplitter (
3371
- headers_to_split_on = [("h1" , "Header 1" )],
3372
- elements_to_preserve = ["table" , "ul" ],
3373
- max_chunk_size = 50 , # Deliberately low to test preservation
3374
- )
3374
+ with suppress_langchain_beta_warning ():
3375
+ splitter = HTMLSemanticPreservingSplitter (
3376
+ headers_to_split_on = [("h1" , "Header 1" )],
3377
+ elements_to_preserve = ["table" , "ul" ],
3378
+ max_chunk_size = 50 , # Deliberately low to test preservation
3379
+ )
3375
3380
documents = splitter .split_text (html_content )
3376
3381
3377
3382
expected = [
@@ -3393,9 +3398,10 @@ def test_html_splitter_with_no_further_splits() -> None:
3393
3398
<h1>Section 2</h1>
3394
3399
<p>More content here.</p>
3395
3400
"""
3396
- splitter = HTMLSemanticPreservingSplitter (
3397
- headers_to_split_on = [("h1" , "Header 1" )], max_chunk_size = 1000
3398
- )
3401
+ with suppress_langchain_beta_warning ():
3402
+ splitter = HTMLSemanticPreservingSplitter (
3403
+ headers_to_split_on = [("h1" , "Header 1" )], max_chunk_size = 1000
3404
+ )
3399
3405
documents = splitter .split_text (html_content )
3400
3406
3401
3407
expected = [
@@ -3414,9 +3420,10 @@ def test_html_splitter_with_small_chunk_size() -> None:
3414
3420
<p>This is some long text that should be split into multiple chunks due to the
3415
3421
small chunk size.</p>
3416
3422
"""
3417
- splitter = HTMLSemanticPreservingSplitter (
3418
- headers_to_split_on = [("h1" , "Header 1" )], max_chunk_size = 20 , chunk_overlap = 5
3419
- )
3423
+ with suppress_langchain_beta_warning ():
3424
+ splitter = HTMLSemanticPreservingSplitter (
3425
+ headers_to_split_on = [("h1" , "Header 1" )], max_chunk_size = 20 , chunk_overlap = 5
3426
+ )
3420
3427
documents = splitter .split_text (html_content )
3421
3428
3422
3429
expected = [
@@ -3441,11 +3448,12 @@ def test_html_splitter_with_denylist_tags() -> None:
3441
3448
<p>This paragraph should be kept.</p>
3442
3449
<span>This span should be removed.</span>
3443
3450
"""
3444
- splitter = HTMLSemanticPreservingSplitter (
3445
- headers_to_split_on = [("h1" , "Header 1" )],
3446
- denylist_tags = ["span" ],
3447
- max_chunk_size = 1000 ,
3448
- )
3451
+ with suppress_langchain_beta_warning ():
3452
+ splitter = HTMLSemanticPreservingSplitter (
3453
+ headers_to_split_on = [("h1" , "Header 1" )],
3454
+ denylist_tags = ["span" ],
3455
+ max_chunk_size = 1000 ,
3456
+ )
3449
3457
documents = splitter .split_text (html_content )
3450
3458
3451
3459
expected = [
@@ -3465,11 +3473,12 @@ def test_html_splitter_with_external_metadata() -> None:
3465
3473
<h1>Section 1</h1>
3466
3474
<p>This is some content.</p>
3467
3475
"""
3468
- splitter = HTMLSemanticPreservingSplitter (
3469
- headers_to_split_on = [("h1" , "Header 1" )],
3470
- external_metadata = {"source" : "example.com" },
3471
- max_chunk_size = 1000 ,
3472
- )
3476
+ with suppress_langchain_beta_warning ():
3477
+ splitter = HTMLSemanticPreservingSplitter (
3478
+ headers_to_split_on = [("h1" , "Header 1" )],
3479
+ external_metadata = {"source" : "example.com" },
3480
+ max_chunk_size = 1000 ,
3481
+ )
3473
3482
documents = splitter .split_text (html_content )
3474
3483
3475
3484
expected = [
@@ -3489,11 +3498,12 @@ def test_html_splitter_with_text_normalization() -> None:
3489
3498
<h1>Section 1</h1>
3490
3499
<p>This is some TEXT that should be normalized!</p>
3491
3500
"""
3492
- splitter = HTMLSemanticPreservingSplitter (
3493
- headers_to_split_on = [("h1" , "Header 1" )],
3494
- normalize_text = True ,
3495
- max_chunk_size = 1000 ,
3496
- )
3501
+ with suppress_langchain_beta_warning ():
3502
+ splitter = HTMLSemanticPreservingSplitter (
3503
+ headers_to_split_on = [("h1" , "Header 1" )],
3504
+ normalize_text = True ,
3505
+ max_chunk_size = 1000 ,
3506
+ )
3497
3507
documents = splitter .split_text (html_content )
3498
3508
3499
3509
expected = [
@@ -3515,11 +3525,12 @@ def test_html_splitter_with_allowlist_tags() -> None:
3515
3525
<span>This span should be kept.</span>
3516
3526
<div>This div should be removed.</div>
3517
3527
"""
3518
- splitter = HTMLSemanticPreservingSplitter (
3519
- headers_to_split_on = [("h1" , "Header 1" )],
3520
- allowlist_tags = ["p" , "span" ],
3521
- max_chunk_size = 1000 ,
3522
- )
3528
+ with suppress_langchain_beta_warning ():
3529
+ splitter = HTMLSemanticPreservingSplitter (
3530
+ headers_to_split_on = [("h1" , "Header 1" )],
3531
+ allowlist_tags = ["p" , "span" ],
3532
+ max_chunk_size = 1000 ,
3533
+ )
3523
3534
documents = splitter .split_text (html_content )
3524
3535
3525
3536
expected = [
@@ -3548,12 +3559,13 @@ def test_html_splitter_with_mixed_preserve_and_filter() -> None:
3548
3559
<p>This paragraph should be kept.</p>
3549
3560
<span>This span should be removed.</span>
3550
3561
"""
3551
- splitter = HTMLSemanticPreservingSplitter (
3552
- headers_to_split_on = [("h1" , "Header 1" )],
3553
- elements_to_preserve = ["table" ],
3554
- denylist_tags = ["span" ],
3555
- max_chunk_size = 1000 ,
3556
- )
3562
+ with suppress_langchain_beta_warning ():
3563
+ splitter = HTMLSemanticPreservingSplitter (
3564
+ headers_to_split_on = [("h1" , "Header 1" )],
3565
+ elements_to_preserve = ["table" ],
3566
+ denylist_tags = ["span" ],
3567
+ max_chunk_size = 1000 ,
3568
+ )
3557
3569
documents = splitter .split_text (html_content )
3558
3570
3559
3571
expected = [
@@ -3574,10 +3586,11 @@ def test_html_splitter_with_no_headers() -> None:
3574
3586
<p>This is content without any headers.</p>
3575
3587
<p>It should still produce a valid document.</p>
3576
3588
"""
3577
- splitter = HTMLSemanticPreservingSplitter (
3578
- headers_to_split_on = [],
3579
- max_chunk_size = 1000 ,
3580
- )
3589
+ with suppress_langchain_beta_warning ():
3590
+ splitter = HTMLSemanticPreservingSplitter (
3591
+ headers_to_split_on = [],
3592
+ max_chunk_size = 1000 ,
3593
+ )
3581
3594
documents = splitter .split_text (html_content )
3582
3595
3583
3596
expected = [
@@ -3607,13 +3620,14 @@ def test_html_splitter_with_media_preservation() -> None:
3607
3620
<p>This is audio:</p>
3608
3621
<audio src="http://example.com/audio.mp3"></audio>
3609
3622
"""
3610
- splitter = HTMLSemanticPreservingSplitter (
3611
- headers_to_split_on = [("h1" , "Header 1" )],
3612
- preserve_images = True ,
3613
- preserve_videos = True ,
3614
- preserve_audio = True ,
3615
- max_chunk_size = 1000 ,
3616
- )
3623
+ with suppress_langchain_beta_warning ():
3624
+ splitter = HTMLSemanticPreservingSplitter (
3625
+ headers_to_split_on = [("h1" , "Header 1" )],
3626
+ preserve_images = True ,
3627
+ preserve_videos = True ,
3628
+ preserve_audio = True ,
3629
+ max_chunk_size = 1000 ,
3630
+ )
3617
3631
documents = splitter .split_text (html_content )
3618
3632
3619
3633
expected = [
@@ -3638,12 +3652,13 @@ def test_html_splitter_keep_separator_true() -> None:
3638
3652
<h1>Section 1</h1>
3639
3653
<p>This is some text. This is some other text.</p>
3640
3654
"""
3641
- splitter = HTMLSemanticPreservingSplitter (
3642
- headers_to_split_on = [("h1" , "Header 1" )],
3643
- max_chunk_size = 10 ,
3644
- separators = [". " ],
3645
- keep_separator = True ,
3646
- )
3655
+ with suppress_langchain_beta_warning ():
3656
+ splitter = HTMLSemanticPreservingSplitter (
3657
+ headers_to_split_on = [("h1" , "Header 1" )],
3658
+ max_chunk_size = 10 ,
3659
+ separators = [". " ],
3660
+ keep_separator = True ,
3661
+ )
3647
3662
documents = splitter .split_text (html_content )
3648
3663
3649
3664
expected = [
@@ -3667,12 +3682,13 @@ def test_html_splitter_keep_separator_false() -> None:
3667
3682
<h1>Section 1</h1>
3668
3683
<p>This is some text. This is some other text.</p>
3669
3684
"""
3670
- splitter = HTMLSemanticPreservingSplitter (
3671
- headers_to_split_on = [("h1" , "Header 1" )],
3672
- max_chunk_size = 10 ,
3673
- separators = [". " ],
3674
- keep_separator = False ,
3675
- )
3685
+ with suppress_langchain_beta_warning ():
3686
+ splitter = HTMLSemanticPreservingSplitter (
3687
+ headers_to_split_on = [("h1" , "Header 1" )],
3688
+ max_chunk_size = 10 ,
3689
+ separators = [". " ],
3690
+ keep_separator = False ,
3691
+ )
3676
3692
documents = splitter .split_text (html_content )
3677
3693
3678
3694
expected = [
@@ -3696,12 +3712,13 @@ def test_html_splitter_keep_separator_start() -> None:
3696
3712
<h1>Section 1</h1>
3697
3713
<p>This is some text. This is some other text.</p>
3698
3714
"""
3699
- splitter = HTMLSemanticPreservingSplitter (
3700
- headers_to_split_on = [("h1" , "Header 1" )],
3701
- max_chunk_size = 10 ,
3702
- separators = [". " ],
3703
- keep_separator = "start" ,
3704
- )
3715
+ with suppress_langchain_beta_warning ():
3716
+ splitter = HTMLSemanticPreservingSplitter (
3717
+ headers_to_split_on = [("h1" , "Header 1" )],
3718
+ max_chunk_size = 10 ,
3719
+ separators = [". " ],
3720
+ keep_separator = "start" ,
3721
+ )
3705
3722
documents = splitter .split_text (html_content )
3706
3723
3707
3724
expected = [
@@ -3725,12 +3742,13 @@ def test_html_splitter_keep_separator_end() -> None:
3725
3742
<h1>Section 1</h1>
3726
3743
<p>This is some text. This is some other text.</p>
3727
3744
"""
3728
- splitter = HTMLSemanticPreservingSplitter (
3729
- headers_to_split_on = [("h1" , "Header 1" )],
3730
- max_chunk_size = 10 ,
3731
- separators = [". " ],
3732
- keep_separator = "end" ,
3733
- )
3745
+ with suppress_langchain_beta_warning ():
3746
+ splitter = HTMLSemanticPreservingSplitter (
3747
+ headers_to_split_on = [("h1" , "Header 1" )],
3748
+ max_chunk_size = 10 ,
3749
+ separators = [". " ],
3750
+ keep_separator = "end" ,
3751
+ )
3734
3752
documents = splitter .split_text (html_content )
3735
3753
3736
3754
expected = [
@@ -3754,9 +3772,12 @@ def test_html_splitter_keep_separator_default() -> None:
3754
3772
<h1>Section 1</h1>
3755
3773
<p>This is some text. This is some other text.</p>
3756
3774
"""
3757
- splitter = HTMLSemanticPreservingSplitter (
3758
- headers_to_split_on = [("h1" , "Header 1" )], max_chunk_size = 10 , separators = [". " ]
3759
- )
3775
+ with suppress_langchain_beta_warning ():
3776
+ splitter = HTMLSemanticPreservingSplitter (
3777
+ headers_to_split_on = [("h1" , "Header 1" )],
3778
+ max_chunk_size = 10 ,
3779
+ separators = [". " ],
3780
+ )
3760
3781
documents = splitter .split_text (html_content )
3761
3782
3762
3783
expected = [
0 commit comments