@@ -14,7 +14,9 @@ use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
1414use kreuzberg:: types:: ExtractionResult ;
1515use kreuzberg:: { KreuzbergError , Result } ;
1616
17- use super :: common:: { json_value_to_py, python_to_json, validate_plugin_object} ;
17+ use crate :: types:: ExtractionResult as PyExtractionResult ;
18+
19+ use super :: common:: { python_to_json, validate_plugin_object} ;
1820
1921/// Wrapper that makes a Python PostProcessor usable from Rust.
2022///
@@ -133,26 +135,29 @@ impl PostProcessor for PythonPostProcessor {
133135 Python :: attach ( |py| {
134136 let obj = self . python_obj . bind ( py) ;
135137
136- let result_dict = extraction_result_to_dict ( py, result) . map_err ( |e| KreuzbergError :: Plugin {
137- message : format ! ( "Failed to convert ExtractionResult to Python dict: {}" , e) ,
138+ // Convert Rust ExtractionResult to Python ExtractionResult class instance
139+ let py_extraction_result =
140+ PyExtractionResult :: from_rust ( result. clone ( ) , py, None , None ) . map_err ( |e| {
141+ KreuzbergError :: Plugin {
142+ message : format ! ( "Failed to convert ExtractionResult to Python: {}" , e) ,
143+ plugin_name : processor_name. clone ( ) ,
144+ }
145+ } ) ?;
146+
147+ let py_result_obj = Py :: new ( py, py_extraction_result) . map_err ( |e| KreuzbergError :: Plugin {
148+ message : format ! ( "Failed to create Python ExtractionResult: {}" , e) ,
138149 plugin_name : processor_name. clone ( ) ,
139150 } ) ?;
140151
141- let py_result = result_dict. bind ( py) ;
142152 let processed = obj
143- . call_method1 ( "process" , ( py_result , ) )
153+ . call_method1 ( "process" , ( py_result_obj , ) )
144154 . map_err ( |e| KreuzbergError :: Plugin {
145155 message : format ! ( "Python PostProcessor '{}' failed during process: {}" , processor_name, e) ,
146156 plugin_name : processor_name. clone ( ) ,
147157 } ) ?;
148158
149- let processed_dict = processed. cast_into :: < PyDict > ( ) . map_err ( |e| KreuzbergError :: Plugin {
150- message : format ! ( "PostProcessor did not return a dict: {}" , e) ,
151- plugin_name : processor_name. clone ( ) ,
152- } ) ?;
153-
154159 let mut updated_result = result. clone ( ) ;
155- merge_dict_to_extraction_result ( py, & processed_dict , & mut updated_result) ?;
160+ merge_processed_result ( py, & processed , & mut updated_result) ?;
156161
157162 Ok :: < ExtractionResult , KreuzbergError > ( updated_result)
158163 } )
@@ -167,67 +172,45 @@ impl PostProcessor for PythonPostProcessor {
167172 }
168173}
169174
170- /// Convert Rust ExtractionResult to Python dict .
175+ /// Merge a processed Python result back into a Rust ExtractionResult .
171176///
172- /// This creates a Python dict that can be passed to Python processors:
173- /// ```python
174- /// {
175- /// "content": "extracted text",
176- /// "mime_type": "application/pdf",
177- /// "metadata": {"key": "value"},
178- /// "tables": [...]
179- /// }
180- /// ```
181- fn extraction_result_to_dict ( py : Python < ' _ > , result : & ExtractionResult ) -> PyResult < Py < PyDict > > {
182- let dict = PyDict :: new ( py) ;
183-
184- dict. set_item ( "content" , & result. content ) ?;
185-
186- dict. set_item ( "mime_type" , & result. mime_type ) ?;
187-
188- let metadata_dict = PyDict :: new ( py) ;
189-
190- if let Some ( title) = & result. metadata . title {
191- metadata_dict. set_item ( "title" , title) ?;
192- }
193- if let Some ( subject) = & result. metadata . subject {
194- metadata_dict. set_item ( "subject" , subject) ?;
195- }
196- if let Some ( authors) = & result. metadata . authors {
197- metadata_dict. set_item ( "authors" , authors) ?;
198- }
199- if let Some ( keywords) = & result. metadata . keywords {
200- metadata_dict. set_item ( "keywords" , keywords) ?;
201- }
202- if let Some ( language) = & result. metadata . language {
203- metadata_dict. set_item ( "language" , language) ?;
204- }
205- if let Some ( created_at) = & result. metadata . created_at {
206- metadata_dict. set_item ( "created_at" , created_at) ?;
207- }
208- if let Some ( modified_at) = & result. metadata . modified_at {
209- metadata_dict. set_item ( "modified_at" , modified_at) ?;
210- }
211- if let Some ( created_by) = & result. metadata . created_by {
212- metadata_dict. set_item ( "created_by" , created_by) ?;
213- }
214- if let Some ( modified_by) = & result. metadata . modified_by {
215- metadata_dict. set_item ( "modified_by" , modified_by) ?;
216- }
217- if let Some ( created_at) = & result. metadata . created_at {
218- metadata_dict. set_item ( "created_at" , created_at) ?;
177+ /// Supports both ExtractionResult class instances (attribute access) and
178+ /// plain dicts (dict-style access) for backward compatibility.
179+ fn merge_processed_result ( py : Python < ' _ > , processed : & Bound < ' _ , PyAny > , result : & mut ExtractionResult ) -> Result < ( ) > {
180+ // If processor returned a dict, use dict-style access for backward compatibility
181+ if let Ok ( dict) = processed. cast :: < PyDict > ( ) {
182+ return merge_dict_to_extraction_result ( py, dict, result) ;
219183 }
220184
221- for ( key, value) in & result. metadata . additional {
222- let py_value = json_value_to_py ( py, value) ?;
223- metadata_dict. set_item ( key, py_value) ?;
185+ // Use attribute access (ExtractionResult or duck-typed object)
186+ if let Ok ( content) = processed. getattr ( "content" )
187+ && !content. is_none ( )
188+ {
189+ result. content = content. extract ( ) . map_err ( |e| KreuzbergError :: Plugin {
190+ message : format ! ( "PostProcessor returned invalid 'content': {}" , e) ,
191+ plugin_name : "python" . to_string ( ) ,
192+ } ) ?;
224193 }
225194
226- dict. set_item ( "metadata" , metadata_dict) ?;
195+ if let Ok ( metadata) = processed. getattr ( "metadata" )
196+ && !metadata. is_none ( )
197+ && let Ok ( meta_dict) = metadata. cast :: < PyDict > ( )
198+ {
199+ for ( key, value) in meta_dict. iter ( ) {
200+ let key_str: String = key. extract ( ) . map_err ( |_| KreuzbergError :: Plugin {
201+ message : "Metadata keys must be strings" . to_string ( ) ,
202+ plugin_name : "python" . to_string ( ) ,
203+ } ) ?;
227204
228- dict. set_item ( "tables" , pyo3:: types:: PyList :: empty ( py) ) ?;
205+ let json_value = python_to_json ( & value) ?;
206+ result
207+ . metadata
208+ . additional
209+ . insert ( std:: borrow:: Cow :: Owned ( key_str) , json_value) ;
210+ }
211+ }
229212
230- Ok ( dict . unbind ( ) )
213+ Ok ( ( ) )
231214}
232215
233216/// Merge Python dict back into ExtractionResult.
@@ -288,7 +271,7 @@ fn merge_dict_to_extraction_result(
288271///
289272/// The Python processor must implement:
290273/// - `name() -> str` - Return processor name
291- /// - `process(result: dict ) -> dict ` - Process and enrich the extraction result
274+ /// - `process(result: ExtractionResult ) -> ExtractionResult ` - Process and enrich the extraction result
292275///
293276/// # Optional Methods
294277///
@@ -300,7 +283,7 @@ fn merge_dict_to_extraction_result(
300283/// # Example
301284///
302285/// ```python
303- /// from kreuzberg import register_post_processor
286+ /// from kreuzberg import register_post_processor, ExtractionResult
304287///
305288/// class EntityExtractor:
306289/// def name(self) -> str:
@@ -309,10 +292,10 @@ fn merge_dict_to_extraction_result(
309292/// def processing_stage(self) -> str:
310293/// return "early"
311294///
312- /// def process(self, result: dict ) -> dict :
313- /// # Extract entities from result[" content"]
295+ /// def process(self, result: ExtractionResult ) -> ExtractionResult :
296+ /// # Extract entities from result. content
314297/// entities = {"PERSON": ["John Doe"], "ORG": ["Microsoft"]}
315- /// result[" metadata"] ["entities"] = entities
298+ /// result. metadata["entities"] = entities
316299/// return result
317300///
318301/// register_post_processor(EntityExtractor())
@@ -369,7 +352,7 @@ pub fn register_post_processor(py: Python<'_>, processor: Py<PyAny>) -> PyResult
369352/// def name(self) -> str:
370353/// return "my_processor"
371354///
372- /// def process(self, result: dict ) -> dict :
355+ /// def process(self, result: ExtractionResult ) -> ExtractionResult :
373356/// return result
374357///
375358/// register_post_processor(MyProcessor())
@@ -444,7 +427,7 @@ pub fn clear_post_processors(py: Python<'_>) -> PyResult<()> {
444427/// def name(self) -> str:
445428/// return "my_processor"
446429///
447- /// def process(self, result: dict ) -> dict :
430+ /// def process(self, result: ExtractionResult ) -> ExtractionResult :
448431/// return result
449432///
450433/// # Register processor
0 commit comments