|
1 | 1 | # frozen_string_literal: true |
2 | 2 |
|
3 | 3 | describe DiscourseAi::Translation::ContentSplitter do |
4 | | - let(:original_limit) { 4000 } |
5 | | - |
6 | | - after { described_class.const_set(:CHUNK_SIZE, original_limit) } |
7 | | - |
8 | | - def set_limit(value) |
9 | | - described_class.const_set(:CHUNK_SIZE, value) |
10 | | - end |
11 | | - |
12 | 4 | it "returns empty array for empty input" do |
13 | | - expect(described_class.split("")).to eq([""]) |
| 5 | + expect(described_class.split(content: "")).to eq([""]) |
14 | 6 | end |
15 | 7 |
|
16 | 8 | it "handles content with only spaces" do |
17 | | - expect(described_class.split(" ")).to eq([" "]) |
18 | | - expect(described_class.split(" ")).to eq([" "]) |
| 9 | + expect(described_class.split(content: " ")).to eq([" "]) |
| 10 | + expect(described_class.split(content: " ")).to eq([" "]) |
19 | 11 | end |
20 | 12 |
|
21 | 13 | it "handles nil input" do |
22 | | - expect(described_class.split(nil)).to eq([]) |
| 14 | + expect(described_class.split(content: nil)).to eq([]) |
23 | 15 | end |
24 | 16 |
|
25 | 17 | it "doesn't split content under limit" do |
26 | | - text = "hello world" |
27 | | - expect(described_class.split(text)).to eq([text]) |
| 18 | + content = "hello world" |
| 19 | + expect(described_class.split(content:, chunk_size: 20)).to eq([content]) |
| 20 | + end |
| 21 | + |
| 22 | + it "splits to max chunk size if unsplittable" do |
| 23 | + content = "a" * 100 |
| 24 | + expect(described_class.split(content:, chunk_size: 10)).to eq(["a" * 10] * 10) |
28 | 25 | end |
29 | 26 |
|
30 | 27 | it "preserves HTML tags" do |
31 | | - set_limit(10) |
32 | | - text = "<p>hello</p><p>meow</p>" |
33 | | - expect(described_class.split(text)).to eq(%w[<p>hello</p> <p>meow</p>]) |
| 28 | + content = "<p>hello</p><p>meow</p>" |
| 29 | + expect(described_class.split(content:, chunk_size: 15)).to eq(%w[<p>hello</p> <p>meow</p>]) |
34 | 30 |
|
35 | | - set_limit(35) |
36 | | - text = "<div>hello</div> <div>jurassic</div> <p>world</p>" |
37 | | - expect(described_class.split(text)).to eq( |
38 | | - ["<div>hello</div> <div>jurassic</div>", " <p>world</p>"], |
| 31 | + content = "<div>hello</div> <div>jurassic</div> <p>world</p>" |
| 32 | + expect(described_class.split(content:, chunk_size: 40)).to eq( |
| 33 | + ["<div>hello</div> <div>jurassic</div> ", "<p>world</p>"], |
39 | 34 | ) |
40 | 35 | end |
41 | 36 |
|
42 | 37 | it "preserves BBCode tags" do |
43 | | - set_limit(20) |
44 | | - text = "[quote]hello[/quote][details]world[/details]" |
45 | | - expect(described_class.split(text)).to eq(["[quote]hello[/quote]", "[details]world[/details]"]) |
| 38 | + content = "[quote]hello[/quote][details]world[/details]" |
| 39 | + expect(described_class.split(content:, chunk_size: 25)).to eq( |
| 40 | + ["[quote]hello[/quote]", "[details]world[/details]"], |
| 41 | + ) |
46 | 42 | end |
47 | 43 |
|
48 | 44 | it "doesn't split in middle of words" do |
49 | | - set_limit(10) |
50 | | - text = "my kitty best in the world" |
51 | | - expect(described_class.split(text)).to eq(["my kitty ", "best in ", "the world"]) |
| 45 | + content = "my kitty best in the world" |
| 46 | + expect(described_class.split(content:, chunk_size: 10)).to eq( |
| 47 | + ["my kitty ", "best in ", "the world"], |
| 48 | + ) |
52 | 49 | end |
53 | 50 |
|
54 | 51 | it "handles nested tags properly" do |
55 | | - set_limit(25) |
56 | | - text = "<div>hello<p>cat</p>world</div><p>meow</p>" |
57 | | - expect(described_class.split(text)).to eq(%w[<div>hello<p>cat</p>world</div> <p>meow</p>]) |
| 52 | + content = "<div>hello<p>cat</p>world</div><p>meow</p>" |
| 53 | + expect(described_class.split(content:, chunk_size: 35)).to eq( |
| 54 | + %w[<div>hello<p>cat</p>world</div> <p>meow</p>], |
| 55 | + ) |
58 | 56 | end |
59 | 57 |
|
60 | 58 | it "handles mixed HTML and BBCode" do |
61 | | - set_limit(15) |
62 | | - text = "<div>hello</div>[quote]world[/quote]<p>beautiful</p>" |
63 | | - expect(described_class.split(text)).to eq( |
| 59 | + content = "<div>hello</div>[quote]world[/quote]<p>beautiful</p>" |
| 60 | + expect(described_class.split(content:, chunk_size: 20)).to eq( |
64 | 61 | ["<div>hello</div>", "[quote]world[/quote]", "<p>beautiful</p>"], |
65 | 62 | ) |
66 | 63 | end |
67 | 64 |
|
68 | 65 | it "preserves newlines in sensible places" do |
69 | | - set_limit(10) |
70 | | - text = "hello\nbeautiful\nworld\n" |
71 | | - expect(described_class.split(text)).to eq(["hello\n", "beautiful\n", "world\n"]) |
| 66 | + content = "hello\nbeautiful\nworld\n" |
| 67 | + expect(described_class.split(content:, chunk_size: 10)).to eq( |
| 68 | + ["hello\n", "beautiful\n", "world\n"], |
| 69 | + ) |
72 | 70 | end |
73 | 71 |
|
74 | 72 | it "handles email content properly" do |
75 | | - set_limit(20) |
76 | | - text = "From: [email protected]\nTo: [email protected]\nSubject: Hello\n\nContent here" |
77 | | - expect(described_class.split(text)).to eq( |
| 73 | + content = "From: [email protected]\nTo: [email protected]\nSubject: Hello\n\nContent here" |
| 74 | + expect(described_class.split(content:, chunk_size: 20)).to eq( |
78 | 75 | ["From: [email protected]\n", "To: [email protected]\n", "Subject: Hello\n\n", "Content here"], |
79 | 76 | ) |
80 | 77 | end |
81 | 78 |
|
82 | 79 | it "keeps code blocks intact" do |
83 | | - set_limit(30) |
84 | | - text = "Text\n```\ncode block\nhere\n```\nmore text" |
85 | | - expect(described_class.split(text)).to eq(["Text\n```\ncode block\nhere\n```\n", "more text"]) |
| 80 | + content = "Text\n```\ncode block\nhere\n```\nmore text" |
| 81 | + expect(described_class.split(content:, chunk_size: 30)).to eq( |
| 82 | + ["Text\n```\ncode block\nhere\n```\n", "more text"], |
| 83 | + ) |
86 | 84 | end |
87 | 85 |
|
88 | 86 | context "with multiple details tags" do |
89 | 87 | it "splits correctly between details tags" do |
90 | | - set_limit(30) |
91 | | - text = "<details>first content</details><details>second content</details>" |
92 | | - expect(described_class.split(text)).to eq( |
| 88 | + content = "<details>first content</details><details>second content</details>" |
| 89 | + expect(described_class.split(content:, chunk_size: 35)).to eq( |
93 | 90 | ["<details>first content</details>", "<details>second content</details>"], |
94 | 91 | ) |
95 | 92 | end |
|
0 commit comments