Skip to content

Commit 88fbf91

Browse files
committed
fix(benchmark): restore supported formats, fix C# MIME mappings, bump version
Remove only pbm/pgm/ppm/pnm (truly unsupported); restore jp2/jpx/jpm/mj2, typst/typ, and djot, which are supported by kreuzberg core. Add the missing MIME type mappings to the C# GuessMimeType for the jp2, jpx, jpm, mj2, typst, typ, djot, and markdown extensions.
1 parent 25e7448 commit 88fbf91

File tree

8 files changed

+38
-26
lines changed

8 files changed

+38
-26
lines changed

Cargo.lock

Lines changed: 16 additions & 16 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ num_cpus = "1.17.0"
6060
once_cell = "1.21.3"
6161
parking_lot = "0.12.5"
6262
html-to-markdown-rs = { version = "2.24.5", default-features = false }
63-
reqwest = { version = "0.13.1", default-features = false }
63+
reqwest = { version = "0.13.2", default-features = false }
6464
image = { version = "0.25.9", default-features = false }
6565
clap = { version = "4.5", features = ["derive", "color", "suggestions"] }
6666
toml = "0.9.11"

crates/kreuzberg/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ simdutf8 = { version = "0.1", optional = true }
126126
hex = { workspace = true }
127127
lazy_static = "1.5.0"
128128
libc = { workspace = true }
129-
memchr = "2.7.6"
129+
memchr = "2.8.0"
130130
num_cpus = { workspace = true }
131131
once_cell = { workspace = true }
132132
parking_lot = { workspace = true }

packages/csharp/Benchmark/Program.cs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ static string GuessMimeType(string path)
220220
".html" or ".htm" => "text/html",
221221
".xml" => "application/xml",
222222
".txt" => "text/plain",
223-
".md" or ".commonmark" => "text/markdown",
223+
".md" or ".markdown" or ".commonmark" => "text/markdown",
224224
".csv" => "text/csv",
225225
".tsv" => "text/tab-separated-values",
226226
".rtf" => "application/rtf",
@@ -240,11 +240,17 @@ static string GuessMimeType(string path)
240240
".gif" => "image/gif",
241241
".bmp" => "image/bmp",
242242
".webp" => "image/webp",
243+
".jp2" => "image/jp2",
244+
".jpx" => "image/jpx",
245+
".jpm" => "image/jpm",
246+
".mj2" => "video/mj2",
243247
".svg" => "image/svg+xml",
244248
".zip" => "application/zip",
245249
".tar" => "application/x-tar",
246250
".gz" or ".tgz" => "application/gzip",
247251
".7z" => "application/x-7z-compressed",
252+
".typst" or ".typ" => "application/x-typst",
253+
".djot" => "text/djot",
248254
_ => "application/octet-stream",
249255
};
250256
}

tools/benchmark-harness/src/adapters/external.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ fn get_supported_formats(framework_name: &str) -> Vec<String> {
3737
"pdf", // E-books
3838
"epub", // Vector/text
3939
"svg", "txt", // Images (for OCR) - gif and webp NOT supported by PyMuPDF
40-
"png", "jpg", "jpeg", "bmp", "tiff", "tif", "pnm", "pgm", "pbm", "ppm",
40+
"png", "jpg", "jpeg", "bmp", "tiff", "tif",
4141
]
4242
.into_iter()
4343
.map(|s| s.to_string())
@@ -68,8 +68,7 @@ fn get_supported_formats(framework_name: &str) -> Vec<String> {
6868
"csv", "tsv", "json", "yaml", "yml", "toml", // Email
6969
"eml", "msg", // Scientific/technical (typst not supported - too new)
7070
"tex", "latex", "bib", "rst", "org", "ipynb", // Images (metadata + OCR)
71-
"png", "jpg", "jpeg", "gif", "bmp", "tiff", "tif", "webp", "jp2", "pnm", "pgm", "pbm", "ppm",
72-
// Archives
71+
"png", "jpg", "jpeg", "gif", "bmp", "tiff", "tif", "webp", "jp2", // Archives
7372
"zip", "tar", "gz", "7z",
7473
]
7574
.into_iter()

tools/benchmark-harness/src/adapters/kreuzberg.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,14 +72,21 @@ fn get_kreuzberg_supported_formats() -> Vec<String> {
7272
"tiff",
7373
"tif",
7474
"webp",
75+
"jp2",
76+
"jpx",
77+
"jpm",
78+
"mj2",
7579
// Academic/Publishing
7680
"epub",
7781
"bib",
7882
"ipynb",
7983
"tex",
8084
"latex",
85+
"typst",
86+
"typ",
8187
// Other
8288
"svg",
89+
"djot",
8390
]
8491
.into_iter()
8592
.map(|s| s.to_string())

tools/benchmark-harness/src/adapters/native.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ impl FrameworkAdapter for NativeAdapter {
110110
"zip" | "tar" | "gz" | "tgz" | "7z" |
111111
// Images (OCR supported)
112112
"bmp" | "gif" | "jpg" | "jpeg" | "png" | "tiff" | "tif" | "webp" |
113-
"jp2" | "jpx" | "jpm" | "mj2" | "pnm" | "pbm" | "pgm" | "ppm" |
113+
"jp2" | "jpx" | "jpm" | "mj2" |
114114
// Academic/Publishing
115115
"epub" | "bib" | "ipynb" | "tex" | "latex" | "typst" | "typ" |
116116
// Other

uv.lock

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)