Skip to content

Commit 77c158f

Browse files
committed
feat(e2e): add keyword/content assertions, new fixtures, remove redundant tests
- Add KeywordAssertion and content_not_empty to all 11 generators
- Add has_keywords support across all language generators
- Add contract, embeddings, keywords, token_reduction fixtures
- Add post-generation formatting for Go, Java, C#, PHP, Elixir, Ruby
- Regenerate all e2e tests with new assertions
- Remove 58 redundant integration tests from bindings (~33k lines)
- Fix collapsible_if clippy warnings in core extractors
1 parent 865c45a commit 77c158f

File tree

163 files changed

+5272
-33105
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

163 files changed

+5272
-33105
lines changed

crates/kreuzberg/src/extractors/jupyter.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -323,11 +323,11 @@ impl JupyterExtractor {
323323
}
324324

325325
// Include JSON output as structured data
326-
if let Some(json_content) = data.get("application/json") {
327-
if let Ok(formatted) = serde_json::to_string_pretty(json_content) {
328-
content.push_str(&formatted);
329-
content.push('\n');
330-
}
326+
if let Some(json_content) = data.get("application/json")
327+
&& let Ok(formatted) = serde_json::to_string_pretty(json_content)
328+
{
329+
content.push_str(&formatted);
330+
content.push('\n');
331331
}
332332
}
333333

crates/kreuzberg/src/extractors/markdown.rs

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -85,12 +85,13 @@ impl MarkdownExtractor {
8585
link_url = Some(dest_url.to_string());
8686
}
8787
Event::End(TagEnd::Link) => {
88-
if let Some(url) = link_url.take() {
89-
if !url.is_empty() && !url.starts_with('#') {
90-
text.push_str(" (");
91-
text.push_str(&url);
92-
text.push(')');
93-
}
88+
if let Some(url) = link_url.take()
89+
&& !url.is_empty()
90+
&& !url.starts_with('#')
91+
{
92+
text.push_str(" (");
93+
text.push_str(&url);
94+
text.push(')');
9495
}
9596
}
9697
Event::Start(Tag::Image { dest_url, .. }) => {
@@ -101,10 +102,10 @@ impl MarkdownExtractor {
101102
}
102103
text.push(']');
103104
// Extract image from data URIs
104-
if dest_url.starts_with("data:image/") {
105-
if let Some(image) = Self::decode_data_uri_image(dest_url, images.len()) {
106-
images.push(image);
107-
}
105+
if dest_url.starts_with("data:image/")
106+
&& let Some(image) = Self::decode_data_uri_image(dest_url, images.len())
107+
{
108+
images.push(image);
108109
}
109110
}
110111
Event::Start(Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(lang))) => {

crates/kreuzberg/src/paddle_ocr/model_manager.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -442,12 +442,12 @@ impl ModelManager {
442442
for entry in fs::read_dir(&self.cache_dir)? {
443443
let entry = entry?;
444444
let path = entry.path();
445-
if path.is_dir() {
446-
if let Ok(size) = Self::dir_size(&path) {
447-
total_size += size;
448-
if let Ok(entries) = fs::read_dir(&path) {
449-
model_count += entries.count();
450-
}
445+
if path.is_dir()
446+
&& let Ok(size) = Self::dir_size(&path)
447+
{
448+
total_size += size;
449+
if let Ok(entries) = fs::read_dir(&path) {
450+
model_count += entries.count();
451451
}
452452
}
453453
}

e2e/csharp/ArchiveTests.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
using Xunit;
88
using Kreuzberg.E2E;
99

10-
namespace Kreuzberg.E2E.Archive {
10+
namespace Kreuzberg.E2E.Archive
11+
{
1112
public class ArchiveTests
1213
{
1314
[SkippableFact]

e2e/csharp/ContractTests.cs

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
using Xunit;
88
using Kreuzberg.E2E;
99

10-
namespace Kreuzberg.E2E.Contract {
10+
namespace Kreuzberg.E2E.Contract
11+
{
1112
public class ContractTests
1213
{
1314
[SkippableFact]
@@ -200,6 +201,20 @@ public void ConfigForceOcr()
200201
TestHelpers.AssertMinContentLength(result, 5);
201202
}
202203

204+
[SkippableFact]
205+
public void ConfigHtmlOptions()
206+
{
207+
TestHelpers.SkipIfLegacyOfficeDisabled("html/complex_table.html");
208+
TestHelpers.SkipIfOfficeTestOnWindows("html/complex_table.html");
209+
var documentPath = TestHelpers.EnsureDocument("html/complex_table.html", true);
210+
var config = TestHelpers.BuildConfig("{\"html_options\":{\"include_links\":true}}");
211+
212+
var result = KreuzbergClient.ExtractFileSync(documentPath, config);
213+
TestHelpers.AssertExpectedMime(result, new[] { "text/html" });
214+
TestHelpers.AssertMinContentLength(result, 10);
215+
TestHelpers.AssertContentNotEmpty(result);
216+
}
217+
203218
[SkippableFact]
204219
public void ConfigImages()
205220
{
@@ -255,6 +270,20 @@ public void ConfigPages()
255270
TestHelpers.AssertMinContentLength(result, 10);
256271
}
257272

273+
[SkippableFact]
274+
public void ConfigQualityDisabled()
275+
{
276+
TestHelpers.SkipIfLegacyOfficeDisabled("pdf/fake_memo.pdf");
277+
TestHelpers.SkipIfOfficeTestOnWindows("pdf/fake_memo.pdf");
278+
var documentPath = TestHelpers.EnsureDocument("pdf/fake_memo.pdf", true);
279+
var config = TestHelpers.BuildConfig("{\"enable_quality_processing\":false}");
280+
281+
var result = KreuzbergClient.ExtractFileSync(documentPath, config);
282+
TestHelpers.AssertExpectedMime(result, new[] { "application/pdf" });
283+
TestHelpers.AssertMinContentLength(result, 10);
284+
TestHelpers.AssertContentNotEmpty(result);
285+
}
286+
258287
[SkippableFact]
259288
public void ConfigUseCacheFalse()
260289
{
@@ -268,6 +297,21 @@ public void ConfigUseCacheFalse()
268297
TestHelpers.AssertMinContentLength(result, 10);
269298
}
270299

300+
[SkippableFact]
301+
public void OutputFormatBytesMarkdown()
302+
{
303+
TestHelpers.SkipIfLegacyOfficeDisabled("pdf/fake_memo.pdf");
304+
TestHelpers.SkipIfOfficeTestOnWindows("pdf/fake_memo.pdf");
305+
var documentPath = TestHelpers.EnsureDocument("pdf/fake_memo.pdf", true);
306+
var config = TestHelpers.BuildConfig("{\"output_format\":\"markdown\"}");
307+
308+
var fileBytes = File.ReadAllBytes(documentPath);
309+
var mimeType = KreuzbergClient.DetectMimeType(fileBytes);
310+
var result = KreuzbergClient.ExtractBytesSync(fileBytes, mimeType, config);
311+
TestHelpers.AssertExpectedMime(result, new[] { "application/pdf" });
312+
TestHelpers.AssertMinContentLength(result, 10);
313+
}
314+
271315
[SkippableFact]
272316
public void OutputFormatDjot()
273317
{

e2e/csharp/EmailTests.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
using Xunit;
88
using Kreuzberg.E2E;
99

10-
namespace Kreuzberg.E2E.Email {
10+
namespace Kreuzberg.E2E.Email
11+
{
1112
public class EmailTests
1213
{
1314
[SkippableFact]

e2e/csharp/EmbeddingsTests.cs

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
// Code generated by kreuzberg-e2e-generator. DO NOT EDIT.
2+
using System;
3+
using System.Collections.Generic;
4+
using System.IO;
5+
using System.Threading.Tasks;
6+
using Kreuzberg;
7+
using Xunit;
8+
using Kreuzberg.E2E;
9+
10+
namespace Kreuzberg.E2E.Embeddings
11+
{
12+
public class EmbeddingsTests
13+
{
14+
[SkippableFact]
15+
public async Task EmbeddingAsync()
16+
{
17+
TestHelpers.SkipIfLegacyOfficeDisabled("pdf/fake_memo.pdf");
18+
TestHelpers.SkipIfOfficeTestOnWindows("pdf/fake_memo.pdf");
19+
var documentPath = TestHelpers.EnsureDocument("pdf/fake_memo.pdf", true);
20+
var config = TestHelpers.BuildConfig("{\"chunking\":{\"embedding\":{\"model\":{\"preset\":\"balanced\"},\"normalize\":true},\"max_chars\":500,\"max_overlap\":50}}");
21+
22+
var result = await KreuzbergClient.ExtractFileAsync(documentPath, config);
23+
TestHelpers.AssertExpectedMime(result, new[] { "application/pdf" });
24+
TestHelpers.AssertMinContentLength(result, 10);
25+
TestHelpers.AssertChunks(result, 1, null, true, true);
26+
}
27+
28+
[SkippableFact]
29+
public void EmbeddingBalancedPreset()
30+
{
31+
TestHelpers.SkipIfLegacyOfficeDisabled("pdf/fake_memo.pdf");
32+
TestHelpers.SkipIfOfficeTestOnWindows("pdf/fake_memo.pdf");
33+
var documentPath = TestHelpers.EnsureDocument("pdf/fake_memo.pdf", true);
34+
var config = TestHelpers.BuildConfig("{\"chunking\":{\"embedding\":{\"model\":{\"preset\":\"balanced\"},\"normalize\":true},\"max_chars\":500,\"max_overlap\":50}}");
35+
36+
var result = KreuzbergClient.ExtractFileSync(documentPath, config);
37+
TestHelpers.AssertExpectedMime(result, new[] { "application/pdf" });
38+
TestHelpers.AssertMinContentLength(result, 10);
39+
TestHelpers.AssertChunks(result, 1, null, true, true);
40+
}
41+
42+
[SkippableFact]
43+
public void EmbeddingDisabled()
44+
{
45+
TestHelpers.SkipIfLegacyOfficeDisabled("pdf/fake_memo.pdf");
46+
TestHelpers.SkipIfOfficeTestOnWindows("pdf/fake_memo.pdf");
47+
var documentPath = TestHelpers.EnsureDocument("pdf/fake_memo.pdf", true);
48+
var config = TestHelpers.BuildConfig("{\"chunking\":{\"max_chars\":500,\"max_overlap\":50}}");
49+
50+
var result = KreuzbergClient.ExtractFileSync(documentPath, config);
51+
TestHelpers.AssertExpectedMime(result, new[] { "application/pdf" });
52+
TestHelpers.AssertMinContentLength(result, 10);
53+
TestHelpers.AssertChunks(result, 1, null, true, false);
54+
}
55+
56+
[SkippableFact]
57+
public void EmbeddingFastPreset()
58+
{
59+
TestHelpers.SkipIfLegacyOfficeDisabled("pdf/fake_memo.pdf");
60+
TestHelpers.SkipIfOfficeTestOnWindows("pdf/fake_memo.pdf");
61+
var documentPath = TestHelpers.EnsureDocument("pdf/fake_memo.pdf", true);
62+
var config = TestHelpers.BuildConfig("{\"chunking\":{\"embedding\":{\"model\":{\"preset\":\"fast\"},\"normalize\":true},\"max_chars\":500,\"max_overlap\":50}}");
63+
64+
var result = KreuzbergClient.ExtractFileSync(documentPath, config);
65+
TestHelpers.AssertExpectedMime(result, new[] { "application/pdf" });
66+
TestHelpers.AssertMinContentLength(result, 10);
67+
TestHelpers.AssertChunks(result, 1, null, true, true);
68+
}
69+
70+
}
71+
}

e2e/csharp/Helpers.cs

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -668,4 +668,48 @@ public static void AssertDocument(
668668
}
669669
}
670670
}
671+
672+
public static void AssertKeywords(
673+
ExtractionResult result,
674+
bool? hasKeywords,
675+
int? minCount,
676+
int? maxCount)
677+
{
678+
var keywords = result.Keywords;
679+
if (hasKeywords == true)
680+
{
681+
if (keywords is null || keywords.Count == 0)
682+
{
683+
throw new XunitException("Expected keywords but got null or empty");
684+
}
685+
}
686+
if (hasKeywords == false)
687+
{
688+
if (keywords is not null && keywords.Count > 0)
689+
{
690+
throw new XunitException("Expected keywords to be null or empty");
691+
}
692+
return;
693+
}
694+
if (keywords is not null)
695+
{
696+
var count = keywords.Count;
697+
if (minCount.HasValue && count < minCount.Value)
698+
{
699+
throw new XunitException($"Expected at least {minCount.Value} keywords, found {count}");
700+
}
701+
if (maxCount.HasValue && count > maxCount.Value)
702+
{
703+
throw new XunitException($"Expected at most {maxCount.Value} keywords, found {count}");
704+
}
705+
}
706+
}
707+
708+
public static void AssertContentNotEmpty(ExtractionResult result)
709+
{
710+
if (string.IsNullOrEmpty(result.Content))
711+
{
712+
throw new XunitException("Expected content to be non-empty, but it is empty");
713+
}
714+
}
671715
}

e2e/csharp/HtmlTests.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
using Xunit;
88
using Kreuzberg.E2E;
99

10-
namespace Kreuzberg.E2E.Html {
10+
namespace Kreuzberg.E2E.Html
11+
{
1112
public class HtmlTests
1213
{
1314
[SkippableFact]

e2e/csharp/ImageTests.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
using Xunit;
88
using Kreuzberg.E2E;
99

10-
namespace Kreuzberg.E2E.Image {
10+
namespace Kreuzberg.E2E.Image
11+
{
1112
public class ImageTests
1213
{
1314
[SkippableFact]

0 commit comments

Comments
 (0)