diff --git a/CHANGELOG.md b/CHANGELOG.md index 396de1c8c..f742d9eaa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -93,6 +93,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Total 32 public types now properly exported for IDE autocomplete and type checking - Resolves import failures where types were defined but not accessible +#### Elixir +- **DOCX keyword extraction**: Fixed `FunctionClauseError` when extracting DOCX files with keywords metadata ([#309](https://github.com/kreuzberg-dev/kreuzberg/issues/309)) + - DOCX extractor now parses comma-separated keyword strings into `Vec<String>` and stores in typed `Metadata.keywords` field + - Added defensive string handling to `normalize_keywords/1` in Elixir binding + - Resolves crash when extracting DOCX files containing keywords in `cp:keywords` or `dc:subject` metadata fields + - Added comprehensive unit tests for keyword string parsing in both Rust and Elixir + --- ## [4.0.8] - 2026-01-17 diff --git a/crates/kreuzberg/src/extractors/docx.rs b/crates/kreuzberg/src/extractors/docx.rs index a7a8f56a4..95cf23fd7 100644 --- a/crates/kreuzberg/src/extractors/docx.rs +++ b/crates/kreuzberg/src/extractors/docx.rs @@ -182,6 +182,7 @@ impl DocumentExtractor for DocxExtractor { }; let mut metadata_map = std::collections::HashMap::new(); + let mut parsed_keywords: Option<Vec<String>> = None; if let Ok(core) = office_metadata::extract_core_properties(&mut archive) { if let Some(title) = core.title { @@ -198,7 +199,14 @@ impl DocumentExtractor for DocxExtractor { metadata_map.insert("subject".to_string(), serde_json::Value::String(subject)); } if let Some(keywords) = core.keywords { - metadata_map.insert("keywords".to_string(), serde_json::Value::String(keywords)); + // Parse comma-separated keywords into Vec<String> + parsed_keywords = Some( + keywords + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect(), + ); } if let Some(description) = core.description { 
metadata_map.insert("description".to_string(), serde_json::Value::String(description)); @@ -296,6 +304,7 @@ impl DocumentExtractor for DocxExtractor { mime_type: mime_type.to_string(), metadata: Metadata { pages: page_structure, + keywords: parsed_keywords, additional: metadata_map, ..Default::default() }, diff --git a/crates/kreuzberg/tests/docx_metadata_extraction_test.rs b/crates/kreuzberg/tests/docx_metadata_extraction_test.rs index 6fb59b5f5..2eb57e715 100644 --- a/crates/kreuzberg/tests/docx_metadata_extraction_test.rs +++ b/crates/kreuzberg/tests/docx_metadata_extraction_test.rs @@ -120,3 +120,133 @@ async fn test_docx_minimal_metadata_extraction() { println!("✅ DOCX minimal metadata extraction test passed!"); } + +#[tokio::test] +async fn test_docx_keywords_extraction() { + // This test verifies that DOCX keywords metadata is properly parsed + // from comma-separated strings into Vec<String> in Metadata.keywords + // + // Addresses GitHub issue #309: DOCX keyword extraction was returning + // strings instead of parsed keyword lists, causing FunctionClauseError + // in the Elixir binding. 
+ + use std::io::Write; + use tempfile::NamedTempFile; + use zip::CompressionMethod; + use zip::write::{FileOptions, ZipWriter}; + + // Create a minimal DOCX with keywords metadata + let mut temp_file = NamedTempFile::new().expect("Failed to create temp file"); + + { + let mut zip = ZipWriter::new(&mut temp_file); + let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored); + + // Add [Content_Types].xml + zip.start_file("[Content_Types].xml", options).unwrap(); + zip.write_all(br#" + + + + + +"#).unwrap(); + + // Add _rels/.rels + zip.start_file("_rels/.rels", options).unwrap(); + zip.write_all(br#" + + + +"#).unwrap(); + + // Add word/document.xml with simple content + zip.start_file("word/document.xml", options).unwrap(); + zip.write_all( + br#" + + + + + Test document for keyword extraction + + + +"#, + ) + .unwrap(); + + // Add docProps/core.xml with keywords (comma-separated string) + zip.start_file("docProps/core.xml", options).unwrap(); + zip.write_all( + br#" + + Test Document + Test Author + rust, docx, extraction, metadata, test + Testing keyword extraction +"#, + ) + .unwrap(); + + zip.finish().unwrap(); + } + + // Extract the DOCX file + let result = extract_file( + temp_file.path(), + Some("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), + &ExtractionConfig::default(), + ) + .await + .expect("Should extract DOCX with keywords successfully"); + + // Verify content was extracted + assert!(!result.content.is_empty(), "Content should not be empty"); + assert!( + result.content.contains("Test document for keyword extraction"), + "Content should match document text" + ); + + // Verify keywords were parsed into Vec in Metadata.keywords + assert!( + result.metadata.keywords.is_some(), + "Keywords should be present in metadata.keywords" + ); + + let keywords = result.metadata.keywords.as_ref().unwrap(); + assert_eq!( + keywords.len(), + 5, + "Should have 5 keywords parsed from comma-separated 
string" + ); + + // Verify individual keywords were trimmed and parsed correctly + assert_eq!(keywords[0], "rust", "First keyword should be 'rust'"); + assert_eq!(keywords[1], "docx", "Second keyword should be 'docx'"); + assert_eq!(keywords[2], "extraction", "Third keyword should be 'extraction'"); + assert_eq!(keywords[3], "metadata", "Fourth keyword should be 'metadata'"); + assert_eq!(keywords[4], "test", "Fifth keyword should be 'test'"); + + // Verify other metadata was also extracted + assert_eq!( + result.metadata.additional.get("created_by").and_then(|v| v.as_str()), + Some("Test Author"), + "Should have correct creator" + ); + assert_eq!( + result.metadata.additional.get("title").and_then(|v| v.as_str()), + Some("Test Document"), + "Should have correct title" + ); + assert_eq!( + result.metadata.additional.get("subject").and_then(|v| v.as_str()), + Some("Testing keyword extraction"), + "Should have correct subject" + ); + + println!("✅ DOCX keywords extraction test passed!"); + println!(" Extracted keywords: {:?}", keywords); +} diff --git a/packages/elixir/lib/kreuzberg/result.ex b/packages/elixir/lib/kreuzberg/result.ex index 49f82a9a9..ec12f737d 100644 --- a/packages/elixir/lib/kreuzberg/result.ex +++ b/packages/elixir/lib/kreuzberg/result.ex @@ -262,11 +262,19 @@ defmodule Kreuzberg.ExtractionResult do end @doc false - @spec normalize_keywords(list() | nil) :: list(map()) | nil + @spec normalize_keywords(list() | String.t() | nil) :: list(map()) | nil defp normalize_keywords(nil), do: nil defp normalize_keywords([]), do: [] defp normalize_keywords(keywords) when is_list(keywords), do: keywords + defp normalize_keywords(keywords) when is_binary(keywords) do + keywords + |> String.split(",") + |> Enum.map(&String.trim/1) + |> Enum.reject(&(&1 == "")) + |> Enum.map(fn text -> %{"text" => text, "score" => 1.0} end) + end + @doc false @spec normalize_elements(list() | nil) :: list(Kreuzberg.Element.t()) | nil defp normalize_elements(nil), do: nil diff 
--git a/packages/elixir/test/unit/extraction_result_test.exs b/packages/elixir/test/unit/extraction_result_test.exs index fb2c2faea..96ed88876 100644 --- a/packages/elixir/test/unit/extraction_result_test.exs +++ b/packages/elixir/test/unit/extraction_result_test.exs @@ -407,4 +407,91 @@ defmodule KreuzbergTest.Unit.ExtractionResultTest do assert Enum.uniq(contents) == contents end end + + describe "normalize_keywords/1 - keyword string parsing (GitHub issue #309)" do + test "parses comma-separated keyword string from DOCX metadata" do + # This test addresses GitHub issue #309: DOCX files return keywords + # as comma-separated strings from metadata, which caused FunctionClauseError + # before the fix was implemented. + keywords_string = "calibre, docs, ebook, conversion" + + result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: keywords_string) + + assert is_list(result.keywords) + assert length(result.keywords) == 4 + + assert Enum.at(result.keywords, 0) == %{"text" => "calibre", "score" => 1.0} + assert Enum.at(result.keywords, 1) == %{"text" => "docs", "score" => 1.0} + assert Enum.at(result.keywords, 2) == %{"text" => "ebook", "score" => 1.0} + assert Enum.at(result.keywords, 3) == %{"text" => "conversion", "score" => 1.0} + end + + test "parses keyword string with extra whitespace" do + keywords_string = " keyword1 , keyword2 , keyword3 " + + result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: keywords_string) + + assert is_list(result.keywords) + assert length(result.keywords) == 3 + + # Verify whitespace is properly trimmed + assert Enum.at(result.keywords, 0) == %{"text" => "keyword1", "score" => 1.0} + assert Enum.at(result.keywords, 1) == %{"text" => "keyword2", "score" => 1.0} + assert Enum.at(result.keywords, 2) == %{"text" => "keyword3", "score" => 1.0} + end + + test "handles keyword string with trailing/leading commas" do + keywords_string = ",keyword1,keyword2," + + result = ExtractionResult.new("content", 
"text/plain", %{}, [], keywords: keywords_string) + + assert is_list(result.keywords) + # Empty strings from leading/trailing commas should be filtered out + assert length(result.keywords) == 2 + assert Enum.at(result.keywords, 0) == %{"text" => "keyword1", "score" => 1.0} + assert Enum.at(result.keywords, 1) == %{"text" => "keyword2", "score" => 1.0} + end + + test "handles empty keyword string" do + result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: "") + + assert result.keywords == [] + end + + test "handles keyword string with only whitespace" do + result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: " ") + + assert result.keywords == [] + end + + test "handles single keyword in string" do + result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: "single") + + assert is_list(result.keywords) + assert length(result.keywords) == 1 + assert Enum.at(result.keywords, 0) == %{"text" => "single", "score" => 1.0} + end + + test "assigns default score of 1.0 to parsed keywords" do + # Keywords from DOCX metadata don't have scores, so we assign default 1.0 + keywords_string = "keyword1, keyword2" + + result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: keywords_string) + + Enum.each(result.keywords, fn keyword -> + assert keyword["score"] == 1.0 + end) + end + + test "preserves keyword order from string" do + keywords_string = "first, second, third, fourth" + + result = ExtractionResult.new("content", "text/plain", %{}, [], keywords: keywords_string) + + assert Enum.at(result.keywords, 0)["text"] == "first" + assert Enum.at(result.keywords, 1)["text"] == "second" + assert Enum.at(result.keywords, 2)["text"] == "third" + assert Enum.at(result.keywords, 3)["text"] == "fourth" + end + end end