Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
4 changes: 4 additions & 0 deletions document-readers/pdf-reader/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@
<developerConnection>[email protected]:spring-projects/spring-ai.git</developerConnection>
</scm>

<properties>
<disable.checks>false</disable.checks>
</properties>

<dependencies>
<dependency>
<groupId>org.springframework.ai</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

package org.springframework.ai.reader.pdf;

import java.awt.*;
import java.awt.Rectangle;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
Expand Down Expand Up @@ -112,7 +112,7 @@ public List<Document> get() {
for (PDPage page : this.document.getDocumentCatalog().getPages()) {
lastPage = page;
if (counter % logFrequency == 0 && counter / logFrequency < 10) {
this.logger.info("Processing PDF page: {}", (counter + 1));
logger.info("Processing PDF page: {}", (counter + 1));
}
counter++;

Expand Down Expand Up @@ -154,7 +154,7 @@ public List<Document> get() {
readDocuments.add(toDocument(lastPage, pageTextGroupList.stream().collect(Collectors.joining()),
startPageNumber, pageNumber));
}
this.logger.info("Processing {} pages", totalPages);
logger.info("Processing {} pages", totalPages);
return readDocuments;

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

package org.springframework.ai.reader.pdf;

import java.awt.*;
import java.awt.Rectangle;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
Expand Down Expand Up @@ -133,7 +133,7 @@ public List<Document> get() {
List<Document> documents = new ArrayList<>(paragraphs.size());

if (!CollectionUtils.isEmpty(paragraphs)) {
this.logger.info("Start processing paragraphs from PDF");
logger.info("Start processing paragraphs from PDF");
Iterator<Paragraph> itr = paragraphs.iterator();

var current = itr.next();
Expand All @@ -152,7 +152,7 @@ public List<Document> get() {
}
}
}
this.logger.info("End processing paragraphs from PDF");
logger.info("End processing paragraphs from PDF");
return documents;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,11 @@ public void registerHints(RuntimeHints hints, ClassLoader classLoader) {
"/org/apache/pdfbox/resources/icc/**", "/org/apache/pdfbox/resources/text/**",
"/org/apache/pdfbox/resources/ttf/**", "/org/apache/pdfbox/resources/version.properties");

for (var pattern : patterns)
for (var resourceMatch : resolver.getResources(pattern))
for (var pattern : patterns) {
for (var resourceMatch : resolver.getResources(pattern)) {
hints.resources().registerResource(resourceMatch);
}
}

}
catch (IOException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
*
* @author Christian Tzolov
*/
public class PdfDocumentReaderConfig {
public final class PdfDocumentReaderConfig {

public static final int ALL_PAGES = 0;

Expand Down Expand Up @@ -65,7 +65,7 @@ public static PdfDocumentReaderConfig defaultConfig() {
return builder().build();
}

public static class Builder {
public static final class Builder {

private int pagesPerDocument = 1;

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
/*
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.springframework.ai.reader.pdf.layout;

/**
 * Value holder for one character taken from a PDF text position, carrying the
 * character itself, an index, and four layout flags describing how the character
 * relates to the text that preceded it.
 *
 * <p>
 * This package-private class has the same simple name as {@link java.lang.Character};
 * inside this package the simple name {@code Character} resolves to this class.
 *
 * <p>
 * All state except {@code index} is fixed at construction time.
 */
class Character {

	private final char characterValue;

	// The only mutable field: it can be changed through setIndex(int).
	private int index;

	private final boolean isCharacterPartOfPreviousWord;

	private final boolean isFirstCharacterOfAWord;

	private final boolean isCharacterAtTheBeginningOfNewLine;

	private final boolean isCharacterCloseToPreviousWord;

	/**
	 * Creates a character and, when {@code ForkPDFLayoutTextStripper.DEBUG} is set,
	 * prints its {@link #toString()} representation to standard output.
	 * @param characterValue the character
	 * @param index the initial index of the character
	 * @param isCharacterPartOfPreviousWord value returned by
	 * {@link #isCharacterPartOfPreviousWord()}
	 * @param isFirstCharacterOfAWord value returned by
	 * {@link #isFirstCharacterOfAWord()}
	 * @param isCharacterAtTheBeginningOfNewLine value returned by
	 * {@link #isCharacterAtTheBeginningOfNewLine()}
	 * @param isCharacterPartOfASentence despite its name, this value is stored as the
	 * "close to previous word" flag returned by
	 * {@link #isCharacterCloseToPreviousWord()}
	 */
	Character(char characterValue, int index, boolean isCharacterPartOfPreviousWord, boolean isFirstCharacterOfAWord,
			boolean isCharacterAtTheBeginningOfNewLine, boolean isCharacterPartOfASentence) {
		this.characterValue = characterValue;
		this.index = index;
		this.isCharacterPartOfPreviousWord = isCharacterPartOfPreviousWord;
		this.isFirstCharacterOfAWord = isFirstCharacterOfAWord;
		this.isCharacterAtTheBeginningOfNewLine = isCharacterAtTheBeginningOfNewLine;
		this.isCharacterCloseToPreviousWord = isCharacterPartOfASentence;
		// Every field is assigned above, so the debug dump sees the final state.
		if (ForkPDFLayoutTextStripper.DEBUG) {
			System.out.println(this.toString());
		}
	}

	/**
	 * Returns the character.
	 * @return the character value passed to the constructor
	 */
	public char getCharacterValue() {
		return this.characterValue;
	}

	/**
	 * Returns the current index of the character.
	 * @return the index
	 */
	public int getIndex() {
		return this.index;
	}

	/**
	 * Replaces the index of the character.
	 * @param index the new index
	 */
	public void setIndex(int index) {
		this.index = index;
	}

	/**
	 * Returns the "part of previous word" flag.
	 * @return the flag given at construction time
	 */
	public boolean isCharacterPartOfPreviousWord() {
		return this.isCharacterPartOfPreviousWord;
	}

	/**
	 * Returns the "first character of a word" flag.
	 * @return the flag given at construction time
	 */
	public boolean isFirstCharacterOfAWord() {
		return this.isFirstCharacterOfAWord;
	}

	/**
	 * Returns the "beginning of a new line" flag.
	 * @return the flag given at construction time
	 */
	public boolean isCharacterAtTheBeginningOfNewLine() {
		return this.isCharacterAtTheBeginningOfNewLine;
	}

	/**
	 * Returns the "close to previous word" flag.
	 * @return the flag given at construction time
	 */
	public boolean isCharacterCloseToPreviousWord() {
		return this.isCharacterCloseToPreviousWord;
	}

	/**
	 * Returns a single-line debug representation: the index, a space, the character,
	 * then each flag as {@code name=value}.
	 * @return the debug representation
	 */
	@Override
	public String toString() {
		// The "isCharacterPartOfASentence" and "isCharacterCloseToPreviousWord" entries
		// both print the same field; the text is kept unchanged so the debug output
		// stays exactly as it was.
		return this.index + " " + this.characterValue
				+ " isCharacterPartOfPreviousWord=" + this.isCharacterPartOfPreviousWord
				+ " isFirstCharacterOfAWord=" + this.isFirstCharacterOfAWord
				+ " isCharacterAtTheBeginningOfNewLine=" + this.isCharacterAtTheBeginningOfNewLine
				+ " isCharacterPartOfASentence=" + this.isCharacterCloseToPreviousWord
				+ " isCharacterCloseToPreviousWord=" + this.isCharacterCloseToPreviousWord;
	}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/*
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.springframework.ai.reader.pdf.layout;

import org.apache.pdfbox.text.TextPosition;

/**
 * Creates {@link Character} instances from PDFBox {@link TextPosition} objects by
 * comparing each text position with the one that preceded it.
 *
 * <p>
 * The factory keeps the most recently supplied previous text position as instance
 * state; the layout flags themselves are computed per call and are not retained.
 */
class CharacterFactory {

	// Fixed at construction; when false, every character is treated as the first
	// character of a line (see the guard clauses below).
	private final boolean firstCharacterOfLineFound;

	private TextPosition previousTextPosition;

	/**
	 * Creates a factory.
	 * @param firstCharacterOfLineFound whether the first character of the line has
	 * already been found
	 */
	CharacterFactory(boolean firstCharacterOfLineFound) {
		this.firstCharacterOfLineFound = firstCharacterOfLineFound;
	}

	/**
	 * Builds a {@link Character} for the given text position.
	 * @param textPosition the text position to convert; its unicode string must
	 * contain at least one character because only the first one is used
	 * @param previousTextPosition the text position that preceded
	 * {@code textPosition}; it is dereferenced unconditionally, so it must not be
	 * {@code null}
	 * @return the character with its index and layout flags
	 */
	public Character createCharacterFromTextPosition(final TextPosition textPosition,
			final TextPosition previousTextPosition) {
		this.setPreviousTextPosition(previousTextPosition);

		// The flags are evaluated in the same order as before so that any exception
		// raised by a malformed position surfaces from the same check.
		boolean partOfPreviousWord = this.isCharacterPartOfPreviousWord(textPosition);
		boolean firstCharacterOfAWord = this.isFirstCharacterOfAWord(textPosition);
		boolean atTheBeginningOfNewLine = this.isCharacterAtTheBeginningOfNewLine(textPosition);
		boolean closeToPreviousWord = this.isCharacterCloseToPreviousWord(textPosition);

		char character = this.getCharacterFromTextPosition(textPosition);
		// The cast binds tighter than the division: X is truncated to an int first and
		// only then divided by the width constant.
		int index = (int) textPosition.getX() / ForkPDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT;
		return new Character(character, index, partOfPreviousWord, firstCharacterOfAWord, atTheBeginningOfNewLine,
				closeToPreviousWord);
	}

	/**
	 * Tells whether the text position starts a new line: always {@code true} while no
	 * first character of the line has been found, otherwise {@code true} when the
	 * rounded Y of the position is smaller than the rounded Y of the previous one.
	 */
	private boolean isCharacterAtTheBeginningOfNewLine(final TextPosition textPosition) {
		if (!this.firstCharacterOfLineFound) {
			return true;
		}
		float previousTextYPosition = this.getPreviousTextPosition().getY();
		return Math.round(textPosition.getY()) < Math.round(previousTextYPosition);
	}

	/**
	 * Tells whether the text position starts a word: always {@code true} while no
	 * first character of the line has been found, otherwise {@code true} when the gap
	 * to the previous position is greater than one or the position starts a new line.
	 */
	private boolean isFirstCharacterOfAWord(final TextPosition textPosition) {
		if (!this.firstCharacterOfLineFound) {
			return true;
		}
		double numberOfSpaces = this.numberOfSpacesBetweenTwoCharacters(this.getPreviousTextPosition(),
				textPosition);
		return (numberOfSpaces > 1) || this.isCharacterAtTheBeginningOfNewLine(textPosition);
	}

	/**
	 * Tells whether the text position is close to the previous word: always
	 * {@code false} while no first character of the line has been found, otherwise
	 * {@code true} when the gap is greater than one and at most
	 * {@code ForkPDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT}.
	 */
	private boolean isCharacterCloseToPreviousWord(final TextPosition textPosition) {
		if (!this.firstCharacterOfLineFound) {
			return false;
		}
		double numberOfSpaces = this.numberOfSpacesBetweenTwoCharacters(this.getPreviousTextPosition(),
				textPosition);
		return (numberOfSpaces > 1 && numberOfSpaces <= ForkPDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT);
	}

	/**
	 * Tells whether the text position continues the previous word: {@code false} when
	 * the previous position is a single space, otherwise {@code true} when the gap to
	 * the previous position is at most one.
	 */
	private boolean isCharacterPartOfPreviousWord(final TextPosition textPosition) {
		TextPosition previous = this.getPreviousTextPosition();
		if (previous.getUnicode().equals(" ")) {
			return false;
		}
		double numberOfSpaces = this.numberOfSpacesBetweenTwoCharacters(previous, textPosition);
		return (numberOfSpaces <= 1);
	}

	/**
	 * Returns the absolute, rounded distance between the end of the first position
	 * (its X plus its width) and the X of the second position.
	 */
	private double numberOfSpacesBetweenTwoCharacters(final TextPosition textPosition1,
			final TextPosition textPosition2) {
		// Widen to double before adding so the sum is computed exactly as before.
		double previousTextXPosition = textPosition1.getX();
		double previousTextWidth = textPosition1.getWidth();
		double previousTextEndXPosition = previousTextXPosition + previousTextWidth;
		return Math.abs(Math.round(textPosition2.getX() - previousTextEndXPosition));
	}

	/**
	 * Returns the first {@code char} of the unicode string of the text position.
	 */
	private char getCharacterFromTextPosition(final TextPosition textPosition) {
		return textPosition.getUnicode().charAt(0);
	}

	private TextPosition getPreviousTextPosition() {
		return this.previousTextPosition;
	}

	private void setPreviousTextPosition(final TextPosition previousTextPosition) {
		this.previousTextPosition = previousTextPosition;
	}

}
Loading