Skip to content

Commit 6cade9f

Browse files
authored
Merge pull request #313 from kreuzberg-dev/fix-issue-309
fix(elixir): resolve DOCX keyword extraction FunctionClauseError
2 parents deca129 + efa9cc4 commit 6cade9f

File tree

5 files changed

+243
-2
lines changed

5 files changed

+243
-2
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -93,6 +93,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Total 32 public types now properly exported for IDE autocomplete and type checking
 - Resolves import failures where types were defined but not accessible

+#### Elixir
+- **DOCX keyword extraction**: Fixed `FunctionClauseError` when extracting DOCX files with keywords metadata ([#309](https://github.com/kreuzberg-dev/kreuzberg/issues/309))
+  - DOCX extractor now parses comma-separated keyword strings into `Vec<String>` and stores in typed `Metadata.keywords` field
+  - Added defensive string handling to `normalize_keywords/1` in Elixir binding
+  - Resolves crash when extracting DOCX files containing keywords in `cp:keywords` or `dc:subject` metadata fields
+  - Added comprehensive unit tests for keyword string parsing in both Rust and Elixir
+
 ---

 ## [4.0.8] - 2026-01-17

crates/kreuzberg/src/extractors/docx.rs

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -182,6 +182,7 @@ impl DocumentExtractor for DocxExtractor {
         };

         let mut metadata_map = std::collections::HashMap::new();
+        let mut parsed_keywords: Option<Vec<String>> = None;

         if let Ok(core) = office_metadata::extract_core_properties(&mut archive) {
             if let Some(title) = core.title {
@@ -198,7 +199,14 @@ impl DocumentExtractor for DocxExtractor {
                 metadata_map.insert("subject".to_string(), serde_json::Value::String(subject));
             }
             if let Some(keywords) = core.keywords {
-                metadata_map.insert("keywords".to_string(), serde_json::Value::String(keywords));
+                // Parse comma-separated keywords into Vec<String>
+                parsed_keywords = Some(
+                    keywords
+                        .split(',')
+                        .map(|s| s.trim().to_string())
+                        .filter(|s| !s.is_empty())
+                        .collect(),
+                );
             }
             if let Some(description) = core.description {
                 metadata_map.insert("description".to_string(), serde_json::Value::String(description));
@@ -296,6 +304,7 @@ impl DocumentExtractor for DocxExtractor {
             mime_type: mime_type.to_string(),
             metadata: Metadata {
                 pages: page_structure,
+                keywords: parsed_keywords,
                 additional: metadata_map,
                 ..Default::default()
             },

crates/kreuzberg/tests/docx_metadata_extraction_test.rs

Lines changed: 130 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -120,3 +120,133 @@ async fn test_docx_minimal_metadata_extraction() {

     println!("✅ DOCX minimal metadata extraction test passed!");
 }
+
+#[tokio::test]
+async fn test_docx_keywords_extraction() {
+    // This test verifies that DOCX keywords metadata is properly parsed
+    // from comma-separated strings into Vec<String> in Metadata.keywords
+    //
+    // Addresses GitHub issue #309: DOCX keyword extraction was returning
+    // strings instead of parsed keyword lists, causing FunctionClauseError
+    // in the Elixir binding.
+
+    use std::io::Write;
+    use tempfile::NamedTempFile;
+    use zip::CompressionMethod;
+    use zip::write::{FileOptions, ZipWriter};
+
+    // Create a minimal DOCX with keywords metadata
+    let mut temp_file = NamedTempFile::new().expect("Failed to create temp file");
+
+    {
+        let mut zip = ZipWriter::new(&mut temp_file);
+        let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
+
+        // Add [Content_Types].xml
+        zip.start_file("[Content_Types].xml", options).unwrap();
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+  <Default Extension="xml" ContentType="application/xml"/>
+  <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
+  <Override PartName="/docProps/core.xml" ContentType="application/vnd.openxmlformats-package.core-properties+xml"/>
+</Types>"#).unwrap();
+
+        // Add _rels/.rels
+        zip.start_file("_rels/.rels", options).unwrap();
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
+  <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" Target="docProps/core.xml"/>
+</Relationships>"#).unwrap();
+
+        // Add word/document.xml with simple content
+        zip.start_file("word/document.xml", options).unwrap();
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+  <w:body>
+    <w:p>
+      <w:r>
+        <w:t>Test document for keyword extraction</w:t>
+      </w:r>
+    </w:p>
+  </w:body>
+</w:document>"#,
+        )
+        .unwrap();
+
+        // Add docProps/core.xml with keywords (comma-separated string)
+        zip.start_file("docProps/core.xml", options).unwrap();
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
+    xmlns:dc="http://purl.org/dc/elements/1.1/"
+    xmlns:dcterms="http://purl.org/dc/terms/">
+  <dc:title>Test Document</dc:title>
+  <dc:creator>Test Author</dc:creator>
+  <cp:keywords>rust, docx, extraction, metadata, test</cp:keywords>
+  <dc:subject>Testing keyword extraction</dc:subject>
+</cp:coreProperties>"#,
+        )
+        .unwrap();
+
+        zip.finish().unwrap();
+    }
+
+    // Extract the DOCX file
+    let result = extract_file(
+        temp_file.path(),
+        Some("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
+        &ExtractionConfig::default(),
+    )
+    .await
+    .expect("Should extract DOCX with keywords successfully");
+
+    // Verify content was extracted
+    assert!(!result.content.is_empty(), "Content should not be empty");
+    assert!(
+        result.content.contains("Test document for keyword extraction"),
+        "Content should match document text"
+    );
+
+    // Verify keywords were parsed into Vec<String> in Metadata.keywords
+    assert!(
+        result.metadata.keywords.is_some(),
+        "Keywords should be present in metadata.keywords"
+    );
+
+    let keywords = result.metadata.keywords.as_ref().unwrap();
+    assert_eq!(
+        keywords.len(),
+        5,
+        "Should have 5 keywords parsed from comma-separated string"
+    );
+
+    // Verify individual keywords were trimmed and parsed correctly
+    assert_eq!(keywords[0], "rust", "First keyword should be 'rust'");
+    assert_eq!(keywords[1], "docx", "Second keyword should be 'docx'");
+    assert_eq!(keywords[2], "extraction", "Third keyword should be 'extraction'");
+    assert_eq!(keywords[3], "metadata", "Fourth keyword should be 'metadata'");
+    assert_eq!(keywords[4], "test", "Fifth keyword should be 'test'");
+
+    // Verify other metadata was also extracted
+    assert_eq!(
+        result.metadata.additional.get("created_by").and_then(|v| v.as_str()),
+        Some("Test Author"),
+        "Should have correct creator"
+    );
+    assert_eq!(
+        result.metadata.additional.get("title").and_then(|v| v.as_str()),
+        Some("Test Document"),
+        "Should have correct title"
+    );
+    assert_eq!(
+        result.metadata.additional.get("subject").and_then(|v| v.as_str()),
+        Some("Testing keyword extraction"),
+        "Should have correct subject"
+    );
+
+    println!("✅ DOCX keywords extraction test passed!");
+    println!("   Extracted keywords: {:?}", keywords);
+}

packages/elixir/lib/kreuzberg/result.ex

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -262,11 +262,19 @@ defmodule Kreuzberg.ExtractionResult do
   end

   @doc false
-  @spec normalize_keywords(list() | nil) :: list(map()) | nil
+  @spec normalize_keywords(list() | String.t() | nil) :: list(map()) | nil
   defp normalize_keywords(nil), do: nil
   defp normalize_keywords([]), do: []
   defp normalize_keywords(keywords) when is_list(keywords), do: keywords

+  defp normalize_keywords(keywords) when is_binary(keywords) do
+    keywords
+    |> String.split(",")
+    |> Enum.map(&String.trim/1)
+    |> Enum.reject(&(&1 == ""))
+    |> Enum.map(fn text -> %{"text" => text, "score" => 1.0} end)
+  end
+
   @doc false
   @spec normalize_elements(list() | nil) :: list(Kreuzberg.Element.t()) | nil
   defp normalize_elements(nil), do: nil

packages/elixir/test/unit/extraction_result_test.exs

Lines changed: 87 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -407,4 +407,91 @@ defmodule KreuzbergTest.Unit.ExtractionResultTest do
       assert Enum.uniq(contents) == contents
     end
   end
+
+  describe "normalize_keywords/1 - keyword string parsing (GitHub issue #309)" do
+    test "parses comma-separated keyword string from DOCX metadata" do
+      # This test addresses GitHub issue #309: DOCX files return keywords
+      # as comma-separated strings from metadata, which caused FunctionClauseError
+      # before the fix was implemented.
+      keywords_string = "calibre, docs, ebook, conversion"
+
+      result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: keywords_string)
+
+      assert is_list(result.keywords)
+      assert length(result.keywords) == 4
+
+      assert Enum.at(result.keywords, 0) == %{"text" => "calibre", "score" => 1.0}
+      assert Enum.at(result.keywords, 1) == %{"text" => "docs", "score" => 1.0}
+      assert Enum.at(result.keywords, 2) == %{"text" => "ebook", "score" => 1.0}
+      assert Enum.at(result.keywords, 3) == %{"text" => "conversion", "score" => 1.0}
+    end
+
+    test "parses keyword string with extra whitespace" do
+      keywords_string = "  keyword1  ,  keyword2  ,  keyword3  "
+
+      result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: keywords_string)
+
+      assert is_list(result.keywords)
+      assert length(result.keywords) == 3
+
+      # Verify whitespace is properly trimmed
+      assert Enum.at(result.keywords, 0) == %{"text" => "keyword1", "score" => 1.0}
+      assert Enum.at(result.keywords, 1) == %{"text" => "keyword2", "score" => 1.0}
+      assert Enum.at(result.keywords, 2) == %{"text" => "keyword3", "score" => 1.0}
+    end
+
+    test "handles keyword string with trailing/leading commas" do
+      keywords_string = ",keyword1,keyword2,"
+
+      result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: keywords_string)
+
+      assert is_list(result.keywords)
+      # Empty strings from leading/trailing commas should be filtered out
+      assert length(result.keywords) == 2
+      assert Enum.at(result.keywords, 0) == %{"text" => "keyword1", "score" => 1.0}
+      assert Enum.at(result.keywords, 1) == %{"text" => "keyword2", "score" => 1.0}
+    end
+
+    test "handles empty keyword string" do
+      result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: "")
+
+      assert result.keywords == []
+    end
+
+    test "handles keyword string with only whitespace" do
+      result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: "   ")
+
+      assert result.keywords == []
+    end
+
+    test "handles single keyword in string" do
+      result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: "single")
+
+      assert is_list(result.keywords)
+      assert length(result.keywords) == 1
+      assert Enum.at(result.keywords, 0) == %{"text" => "single", "score" => 1.0}
+    end
+
+    test "assigns default score of 1.0 to parsed keywords" do
+      # Keywords from DOCX metadata don't have scores, so we assign default 1.0
+      keywords_string = "keyword1, keyword2"
+
+      result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: keywords_string)
+
+      Enum.each(result.keywords, fn keyword ->
+        assert keyword["score"] == 1.0
+      end)
+    end
+
+    test "preserves keyword order from string" do
+      keywords_string = "first, second, third, fourth"
+
+      result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: keywords_string)
+
+      assert Enum.at(result.keywords, 0)["text"] == "first"
+      assert Enum.at(result.keywords, 1)["text"] == "second"
+      assert Enum.at(result.keywords, 2)["text"] == "third"
+      assert Enum.at(result.keywords, 3)["text"] == "fourth"
+    end
+  end
 end

0 commit comments

Comments (0)