1818
1919import org .junit .jupiter .params .ParameterizedTest ;
2020import org .junit .jupiter .params .provider .CsvSource ;
21-
21+ import org . springframework . ai . reader . ExtractedTextFormatter ;
2222import static org .assertj .core .api .Assertions .assertThat ;
23+ import static org .junit .jupiter .api .Assertions .assertFalse ;
24+ import static org .junit .jupiter .api .Assertions .assertTrue ;
2325
/**
 * @author Christian Tzolov
 * @author Shahbaz Aamir
 */
2730public class TikaDocumentReaderTests {
2831
@@ -46,4 +49,26 @@ public void testDocx(String resourceUri, String resourceName, String contentSnip
4649 assertThat (doc .getText ()).contains (contentSnipped );
4750 }
4851
52+ @ ParameterizedTest
53+ @ CsvSource ({
54+ "classpath:/word-sample.docx,word-sample.docx,This document demonstrates the ability of the calibre DOCX Input plugin" ,
55+ "classpath:/sample2.pdf,sample2.pdf,Robert Maron" , "classpath:/sample.ppt,sample.ppt,Sample FILE" ,
56+ "classpath:/sample.pptx,sample.pptx,Sample FILE" })
57+ public void testReaderWithFormatter (String resourceUri , String resourceName , String contentSnipped ) {
58+
59+ ExtractedTextFormatter formatter = ExtractedTextFormatter .builder ().withNumberOfTopTextLinesToDelete (5 ).build ();
60+ var docs = new TikaDocumentReader (resourceUri , formatter ).get ();
61+
62+ assertThat (docs ).hasSize (1 );
63+
64+ var doc = docs .get (0 );
65+
66+ assertThat (doc .getMetadata ()).containsKeys (TikaDocumentReader .METADATA_SOURCE );
67+ assertThat (doc .getMetadata ().get (TikaDocumentReader .METADATA_SOURCE )).isEqualTo (resourceName );
68+ assertFalse (doc .getText ().contains (contentSnipped ));
69+ docs = new TikaDocumentReader (resourceUri ).get ();
70+ doc = docs .get (0 );
71+ assertThat (doc .getText ()).contains (contentSnipped );
72+ }
73+
4974}
0 commit comments