@@ -120,3 +120,133 @@ async fn test_docx_minimal_metadata_extraction() {
120120
121121 println ! ( "✅ DOCX minimal metadata extraction test passed!" ) ;
122122}
123+
124+ #[ tokio:: test]
125+ async fn test_docx_keywords_extraction ( ) {
126+ // This test verifies that DOCX keywords metadata is properly parsed
127+ // from comma-separated strings into Vec<String> in Metadata.keywords
128+ //
129+ // Addresses GitHub issue #309: DOCX keyword extraction was returning
130+ // strings instead of parsed keyword lists, causing FunctionClauseError
131+ // in the Elixir binding.
132+
133+ use std:: io:: Write ;
134+ use tempfile:: NamedTempFile ;
135+ use zip:: CompressionMethod ;
136+ use zip:: write:: { FileOptions , ZipWriter } ;
137+
138+ // Create a minimal DOCX with keywords metadata
139+ let mut temp_file = NamedTempFile :: new ( ) . expect ( "Failed to create temp file" ) ;
140+
141+ {
142+ let mut zip = ZipWriter :: new ( & mut temp_file) ;
143+ let options: FileOptions < ( ) > = FileOptions :: default ( ) . compression_method ( CompressionMethod :: Stored ) ;
144+
145+ // Add [Content_Types].xml
146+ zip. start_file ( "[Content_Types].xml" , options) . unwrap ( ) ;
147+ zip. write_all ( br#"<?xml version="1.0" encoding="UTF-8"?>
148+ <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
149+ <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
150+ <Default Extension="xml" ContentType="application/xml"/>
151+ <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
152+ <Override PartName="/docProps/core.xml" ContentType="application/vnd.openxmlformats-package.core-properties+xml"/>
153+ </Types>"# ) . unwrap ( ) ;
154+
155+ // Add _rels/.rels
156+ zip. start_file ( "_rels/.rels" , options) . unwrap ( ) ;
157+ zip. write_all ( br#"<?xml version="1.0" encoding="UTF-8"?>
158+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
159+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
160+ <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" Target="docProps/core.xml"/>
161+ </Relationships>"# ) . unwrap ( ) ;
162+
163+ // Add word/document.xml with simple content
164+ zip. start_file ( "word/document.xml" , options) . unwrap ( ) ;
165+ zip. write_all (
166+ br#"<?xml version="1.0" encoding="UTF-8"?>
167+ <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
168+ <w:body>
169+ <w:p>
170+ <w:r>
171+ <w:t>Test document for keyword extraction</w:t>
172+ </w:r>
173+ </w:p>
174+ </w:body>
175+ </w:document>"# ,
176+ )
177+ . unwrap ( ) ;
178+
179+ // Add docProps/core.xml with keywords (comma-separated string)
180+ zip. start_file ( "docProps/core.xml" , options) . unwrap ( ) ;
181+ zip. write_all (
182+ br#"<?xml version="1.0" encoding="UTF-8"?>
183+ <cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
184+ xmlns:dc="http://purl.org/dc/elements/1.1/"
185+ xmlns:dcterms="http://purl.org/dc/terms/">
186+ <dc:title>Test Document</dc:title>
187+ <dc:creator>Test Author</dc:creator>
188+ <cp:keywords>rust, docx, extraction, metadata, test</cp:keywords>
189+ <dc:subject>Testing keyword extraction</dc:subject>
190+ </cp:coreProperties>"# ,
191+ )
192+ . unwrap ( ) ;
193+
194+ zip. finish ( ) . unwrap ( ) ;
195+ }
196+
197+ // Extract the DOCX file
198+ let result = extract_file (
199+ temp_file. path ( ) ,
200+ Some ( "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ) ,
201+ & ExtractionConfig :: default ( ) ,
202+ )
203+ . await
204+ . expect ( "Should extract DOCX with keywords successfully" ) ;
205+
206+ // Verify content was extracted
207+ assert ! ( !result. content. is_empty( ) , "Content should not be empty" ) ;
208+ assert ! (
209+ result. content. contains( "Test document for keyword extraction" ) ,
210+ "Content should match document text"
211+ ) ;
212+
213+ // Verify keywords were parsed into Vec<String> in Metadata.keywords
214+ assert ! (
215+ result. metadata. keywords. is_some( ) ,
216+ "Keywords should be present in metadata.keywords"
217+ ) ;
218+
219+ let keywords = result. metadata . keywords . as_ref ( ) . unwrap ( ) ;
220+ assert_eq ! (
221+ keywords. len( ) ,
222+ 5 ,
223+ "Should have 5 keywords parsed from comma-separated string"
224+ ) ;
225+
226+ // Verify individual keywords were trimmed and parsed correctly
227+ assert_eq ! ( keywords[ 0 ] , "rust" , "First keyword should be 'rust'" ) ;
228+ assert_eq ! ( keywords[ 1 ] , "docx" , "Second keyword should be 'docx'" ) ;
229+ assert_eq ! ( keywords[ 2 ] , "extraction" , "Third keyword should be 'extraction'" ) ;
230+ assert_eq ! ( keywords[ 3 ] , "metadata" , "Fourth keyword should be 'metadata'" ) ;
231+ assert_eq ! ( keywords[ 4 ] , "test" , "Fifth keyword should be 'test'" ) ;
232+
233+ // Verify other metadata was also extracted
234+ assert_eq ! (
235+ result. metadata. additional. get( "created_by" ) . and_then( |v| v. as_str( ) ) ,
236+ Some ( "Test Author" ) ,
237+ "Should have correct creator"
238+ ) ;
239+ assert_eq ! (
240+ result. metadata. additional. get( "title" ) . and_then( |v| v. as_str( ) ) ,
241+ Some ( "Test Document" ) ,
242+ "Should have correct title"
243+ ) ;
244+ assert_eq ! (
245+ result. metadata. additional. get( "subject" ) . and_then( |v| v. as_str( ) ) ,
246+ Some ( "Testing keyword extraction" ) ,
247+ "Should have correct subject"
248+ ) ;
249+
250+ println ! ( "✅ DOCX keywords extraction test passed!" ) ;
251+ println ! ( " Extracted keywords: {:?}" , keywords) ;
252+ }
0 commit comments