Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Total 32 public types now properly exported for IDE autocomplete and type checking
- Resolves import failures where types were defined but not accessible

#### Elixir
- **DOCX keyword extraction**: Fixed `FunctionClauseError` when extracting DOCX files with keywords metadata ([#309](https://github.com/kreuzberg-dev/kreuzberg/issues/309))
- DOCX extractor now parses comma-separated keyword strings into `Vec<String>` and stores them in the typed `Metadata.keywords` field
- Added defensive string handling to `normalize_keywords/1` in Elixir binding
- Resolves crash when extracting DOCX files containing keywords in `cp:keywords` or `dc:subject` metadata fields
- Added comprehensive unit tests for keyword string parsing in both Rust and Elixir

---

## [4.0.8] - 2026-01-17
Expand Down
11 changes: 10 additions & 1 deletion crates/kreuzberg/src/extractors/docx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ impl DocumentExtractor for DocxExtractor {
};

let mut metadata_map = std::collections::HashMap::new();
let mut parsed_keywords: Option<Vec<String>> = None;

if let Ok(core) = office_metadata::extract_core_properties(&mut archive) {
if let Some(title) = core.title {
Expand All @@ -198,7 +199,14 @@ impl DocumentExtractor for DocxExtractor {
metadata_map.insert("subject".to_string(), serde_json::Value::String(subject));
}
if let Some(keywords) = core.keywords {
metadata_map.insert("keywords".to_string(), serde_json::Value::String(keywords));
// Parse comma-separated keywords into Vec<String>
parsed_keywords = Some(
keywords
.split(',')
.map(|s| s.trim().to_string())
.filter(|s| !s.is_empty())
.collect(),
);
}
if let Some(description) = core.description {
metadata_map.insert("description".to_string(), serde_json::Value::String(description));
Expand Down Expand Up @@ -296,6 +304,7 @@ impl DocumentExtractor for DocxExtractor {
mime_type: mime_type.to_string(),
metadata: Metadata {
pages: page_structure,
keywords: parsed_keywords,
additional: metadata_map,
..Default::default()
},
Expand Down
130 changes: 130 additions & 0 deletions crates/kreuzberg/tests/docx_metadata_extraction_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,133 @@ async fn test_docx_minimal_metadata_extraction() {

println!("✅ DOCX minimal metadata extraction test passed!");
}

/// Regression test for GitHub issue #309.
///
/// Builds a minimal DOCX archive in a temporary file whose `docProps/core.xml`
/// carries a comma-separated `cp:keywords` value, runs it through
/// `extract_file`, and checks that:
///
/// * the document text is extracted,
/// * `Metadata.keywords` holds the individual, trimmed keywords in order,
/// * title, creator and subject are still reported via `metadata.additional`.
#[tokio::test]
async fn test_docx_keywords_extraction() {
    use std::io::Write;
    use tempfile::NamedTempFile;
    use zip::CompressionMethod;
    use zip::write::{FileOptions, ZipWriter};

    let mut temp_file = NamedTempFile::new().expect("Failed to create temp file");

    // Scope the writer so its mutable borrow of `temp_file` ends before the
    // file path is handed to the extractor.
    {
        let mut zip = ZipWriter::new(&mut temp_file);
        let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);

        // Package manifest: declares the content types of the parts below.
        zip.start_file("[Content_Types].xml", options).unwrap();
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
<Override PartName="/docProps/core.xml" ContentType="application/vnd.openxmlformats-package.core-properties+xml"/>
</Types>"#).unwrap();

        // Root relationships: point at the main document and the core properties.
        zip.start_file("_rels/.rels", options).unwrap();
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" Target="docProps/core.xml"/>
</Relationships>"#).unwrap();

        // Main document part with a single paragraph of text.
        zip.start_file("word/document.xml", options).unwrap();
        zip.write_all(
            br#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r>
<w:t>Test document for keyword extraction</w:t>
</w:r>
</w:p>
</w:body>
</w:document>"#,
        )
        .unwrap();

        // Core properties part; `cp:keywords` is the comma-separated value under test.
        zip.start_file("docProps/core.xml", options).unwrap();
        zip.write_all(
            br#"<?xml version="1.0" encoding="UTF-8"?>
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dcterms="http://purl.org/dc/terms/">
<dc:title>Test Document</dc:title>
<dc:creator>Test Author</dc:creator>
<cp:keywords>rust, docx, extraction, metadata, test</cp:keywords>
<dc:subject>Testing keyword extraction</dc:subject>
</cp:coreProperties>"#,
        )
        .unwrap();

        zip.finish().unwrap();
    }

    let result = extract_file(
        temp_file.path(),
        Some("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
        &ExtractionConfig::default(),
    )
    .await
    .expect("Should extract DOCX with keywords successfully");

    // The body text must come through untouched by the metadata handling.
    assert!(!result.content.is_empty(), "Content should not be empty");
    assert!(
        result.content.contains("Test document for keyword extraction"),
        "Content should match document text"
    );

    // `expect` replaces the former `assert!(is_some())` + `unwrap()` pair, and a
    // single slice comparison replaces the length check plus per-index asserts:
    // a failure now prints the complete actual list and cannot panic on an
    // out-of-range index.
    let keywords = result
        .metadata
        .keywords
        .as_deref()
        .expect("Keywords should be present in metadata.keywords");
    assert_eq!(
        keywords,
        ["rust", "docx", "extraction", "metadata", "test"],
        "Should have 5 trimmed keywords parsed from comma-separated string, in order"
    );

    // The remaining core properties are still exposed through `additional`.
    assert_eq!(
        result.metadata.additional.get("created_by").and_then(|v| v.as_str()),
        Some("Test Author"),
        "Should have correct creator"
    );
    assert_eq!(
        result.metadata.additional.get("title").and_then(|v| v.as_str()),
        Some("Test Document"),
        "Should have correct title"
    );
    assert_eq!(
        result.metadata.additional.get("subject").and_then(|v| v.as_str()),
        Some("Testing keyword extraction"),
        "Should have correct subject"
    );

    println!("✅ DOCX keywords extraction test passed!");
    println!(" Extracted keywords: {:?}", keywords);
}
10 changes: 9 additions & 1 deletion packages/elixir/lib/kreuzberg/result.ex
Original file line number Diff line number Diff line change
Expand Up @@ -262,11 +262,19 @@ defmodule Kreuzberg.ExtractionResult do
end

# Normalizes the `keywords` value into a list (or `nil`).
#
#   * `nil`    -> `nil`
#   * a list   -> returned unchanged (this also covers `[]`)
#   * a binary -> split on ",", each piece trimmed, empty pieces dropped, and
#                 every remaining piece wrapped as
#                 `%{"text" => piece, "score" => 1.0}`
#
# The binary clause exists because keyword metadata could arrive as one
# comma-separated string, which previously matched no clause and raised
# `FunctionClauseError` (GitHub issue #309).
#
# NOTE(review): list elements are passed through as-is, so a list of plain
# strings stays a list of strings even though the spec promises `list(map())`;
# confirm that every producer of list input already emits maps.
@doc false
@spec normalize_keywords(list() | String.t() | nil) :: list(map()) | nil
defp normalize_keywords(nil), do: nil
defp normalize_keywords(keywords) when is_list(keywords), do: keywords

defp normalize_keywords(keywords) when is_binary(keywords) do
  keywords
  |> String.split(",")
  |> Enum.map(&String.trim/1)
  # Leading/trailing/double commas and whitespace-only input produce empty pieces.
  |> Enum.reject(&(&1 == ""))
  # String keywords carry no score, so a default of 1.0 is assigned.
  |> Enum.map(fn text -> %{"text" => text, "score" => 1.0} end)
end

@doc false
@spec normalize_elements(list() | nil) :: list(Kreuzberg.Element.t()) | nil
defp normalize_elements(nil), do: nil
Expand Down
87 changes: 87 additions & 0 deletions packages/elixir/test/unit/extraction_result_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -407,4 +407,91 @@ defmodule KreuzbergTest.Unit.ExtractionResultTest do
assert Enum.uniq(contents) == contents
end
end

describe "normalize_keywords/1 - keyword string parsing (GitHub issue #309)" do
  test "parses comma-separated keyword string from DOCX metadata" do
    # Regression for GitHub issue #309: keywords arriving as a single
    # comma-separated string used to raise FunctionClauseError.
    result =
      ExtractionResult.new("content", "text/plain", %{}, [],
        keywords: "calibre, docs, ebook, conversion"
      )

    assert result.keywords == [
             %{"text" => "calibre", "score" => 1.0},
             %{"text" => "docs", "score" => 1.0},
             %{"text" => "ebook", "score" => 1.0},
             %{"text" => "conversion", "score" => 1.0}
           ]
  end

  test "parses keyword string with extra whitespace" do
    result =
      ExtractionResult.new("content", "text/plain", %{}, [],
        keywords: " keyword1 , keyword2 , keyword3 "
      )

    # Surrounding whitespace must be stripped from every entry.
    assert result.keywords == [
             %{"text" => "keyword1", "score" => 1.0},
             %{"text" => "keyword2", "score" => 1.0},
             %{"text" => "keyword3", "score" => 1.0}
           ]
  end

  test "handles keyword string with trailing/leading commas" do
    result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: ",keyword1,keyword2,")

    # The empty pieces produced by the outer commas must not become keywords.
    assert result.keywords == [
             %{"text" => "keyword1", "score" => 1.0},
             %{"text" => "keyword2", "score" => 1.0}
           ]
  end

  test "handles empty keyword string" do
    result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: "")

    assert [] == result.keywords
  end

  test "handles keyword string with only whitespace" do
    result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: " ")

    assert [] == result.keywords
  end

  test "handles single keyword in string" do
    result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: "single")

    assert result.keywords == [%{"text" => "single", "score" => 1.0}]
  end

  test "assigns default score of 1.0 to parsed keywords" do
    # String keywords carry no score of their own, so each one gets 1.0.
    result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: "keyword1, keyword2")

    for keyword <- result.keywords do
      assert keyword["score"] == 1.0
    end
  end

  test "preserves keyword order from string" do
    result =
      ExtractionResult.new("content", "text/plain", %{}, [],
        keywords: "first, second, third, fourth"
      )

    leading_texts =
      result.keywords
      |> Enum.take(4)
      |> Enum.map(fn keyword -> keyword["text"] end)

    assert leading_texts == ["first", "second", "third", "fourth"]
  end
end
end
Loading