@@ -128,29 +128,18 @@ public ParagraphPdfDocumentReader(Resource pdfResource, PdfDocumentReaderConfig
128128 */
129129 @ Override
130130 public List <Document > get () {
131-
132131 var paragraphs = this .paragraphTextExtractor .flatten ();
133-
134- List <Document > documents = new ArrayList <>(paragraphs .size ());
135-
136- if (!CollectionUtils .isEmpty (paragraphs )) {
137- logger .info ("Start processing paragraphs from PDF" );
138- Iterator <Paragraph > itr = paragraphs .iterator ();
139-
140- var current = itr .next ();
141-
142- if (!itr .hasNext ()) {
143- documents .add (toDocument (current , current ));
144- }
145- else {
146- while (itr .hasNext ()) {
147- var next = itr .next ();
148- Document document = toDocument (current , next );
149- if (document != null && StringUtils .hasText (document .getText ())) {
150- documents .add (toDocument (current , next ));
151- }
152- current = next ;
153- }
132+ List <Document > documents = new ArrayList <>();
133+ if (CollectionUtils .isEmpty (paragraphs )) {
134+ return documents ;
135+ }
136+ logger .info ("Start processing paragraphs from PDF" );
137+ for (int i = 0 ; i < paragraphs .size (); i ++) {
138+ Paragraph from = paragraphs .get (i );
139+ Paragraph to = (i + 1 < paragraphs .size ()) ? paragraphs .get (i + 1 ) : from ;
140+ Document document = toDocument (from , to );
141+ if (document != null && StringUtils .hasText (document .getText ())) {
142+ documents .add (document );
154143 }
155144 }
156145 logger .info ("End processing paragraphs from PDF" );
@@ -174,7 +163,7 @@ protected Document toDocument(Paragraph from, Paragraph to) {
174163 protected void addMetadata (Paragraph from , Paragraph to , Document document ) {
175164 document .getMetadata ().put (METADATA_TITLE , from .title ());
176165 document .getMetadata ().put (METADATA_START_PAGE , from .startPageNumber ());
177- document .getMetadata ().put (METADATA_END_PAGE , to . startPageNumber ());
166+ document .getMetadata ().put (METADATA_END_PAGE , from . endPageNumber ());
178167 document .getMetadata ().put (METADATA_LEVEL , from .level ());
179168 document .getMetadata ().put (METADATA_FILE_NAME , this .resourceFileName );
180169 }
@@ -192,7 +181,7 @@ public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toPara
192181 int startPage = fromParagraph .startPageNumber () - 1 ;
193182 int endPage = toParagraph .startPageNumber () - 1 ;
194183
195- if (endPage < 0 ) {
184+ if (fromParagraph == toParagraph || endPage < startPage ) {
196185 endPage = startPage ;
197186 }
198187
@@ -206,39 +195,32 @@ public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toPara
206195 for (int pageNumber = startPage ; pageNumber <= endPage ; pageNumber ++) {
207196
208197 var page = this .document .getPage (pageNumber );
209-
210- int fromPosition = fromParagraph .position ();
211- int toPosition = toParagraph .position ();
212-
213- if (this .config .reversedParagraphPosition ) {
214- fromPosition = (int ) (page .getMediaBox ().getHeight () - fromPosition );
215- toPosition = (int ) (page .getMediaBox ().getHeight () - toPosition );
198+ float pageHeight = page .getMediaBox ().getHeight ();
199+
200+ int fromPos = fromParagraph .position ();
201+ int toPos = (fromParagraph != toParagraph ) ? toParagraph .position () : 0 ;
202+
203+ int x = (int ) page .getMediaBox ().getLowerLeftX ();
204+ int w = (int ) page .getMediaBox ().getWidth ();
205+ int y , h ;
206+
207+ if (pageNumber == startPage && pageNumber == endPage ) {
208+ y = toPos ;
209+ h = fromPos - toPos ;
210+ } else if (pageNumber == startPage ) {
211+ y = 0 ;
212+ h = fromPos ;
213+ } else if (pageNumber == endPage ) {
214+ y = toPos ;
215+ h = (int ) pageHeight - toPos ;
216+ } else {
217+ y = 0 ;
218+ h = (int ) pageHeight ;
216219 }
217220
218- int x0 = (int ) page .getMediaBox ().getLowerLeftX ();
219- int xW = (int ) page .getMediaBox ().getWidth ();
220-
221- int y0 = (int ) page .getMediaBox ().getLowerLeftY ();
222- int yW = (int ) page .getMediaBox ().getHeight ();
223-
224- if (pageNumber == startPage ) {
225- y0 = fromPosition ;
226- yW = (int ) page .getMediaBox ().getHeight () - y0 ;
227- }
228- if (pageNumber == endPage ) {
229- yW = toPosition - y0 ;
230- }
231-
232- if ((y0 + yW ) == (int ) page .getMediaBox ().getHeight ()) {
233- yW = yW - this .config .pageBottomMargin ;
234- }
235-
236- if (y0 == 0 ) {
237- y0 = y0 + this .config .pageTopMargin ;
238- yW = yW - this .config .pageTopMargin ;
239- }
221+ if (h < 0 ) h = 0 ;
240222
241- pdfTextStripper .addRegion ("pdfPageRegion" , new Rectangle (x0 , y0 , xW , yW ));
223+ pdfTextStripper .addRegion ("pdfPageRegion" , new Rectangle (x , y , w , h ));
242224 pdfTextStripper .extractRegions (page );
243225 var text = pdfTextStripper .getTextForRegion ("pdfPageRegion" );
244226 if (StringUtils .hasText (text )) {
0 commit comments