Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ pom.xml.tag
pom.xml.releaseBackup
pom.xml.next
release.properties
_Docs/result/**
14 changes: 7 additions & 7 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.giaybac</groupId>
Expand All @@ -8,33 +8,33 @@
<packaging>jar</packaging>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.7</maven.compiler.source>
<maven.compiler.target>1.7</maven.compiler.target>
<!-- Java 9 and later are written without the "1." prefix; "1.11" is not a valid release value. -->
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
</properties>

<dependencies>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>18.0</version>
<version>30.1-jre</version>
</dependency>

<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.6</version>
<version>2.0.22</version>
</dependency>

<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.12</version>
<version>2.0.0-alpha1</version>
</dependency>

<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.12</version>
<version>2.0.0-alpha1</version>
</dependency>

<dependency>
Expand Down
8 changes: 1 addition & 7 deletions src/main/java/com/giaybac/traprange/MAIN.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,17 +86,11 @@ private static void extractTables(String[] args) {
//begin parsing pdf file
List<Table> tables = extractor.extract();

Writer writer = new OutputStreamWriter(new FileOutputStream(out), "UTF-8");
try {
try (Writer writer = new OutputStreamWriter(new FileOutputStream(out), "UTF-8")) {
for (Table table : tables) {
writer.write("Page: " + (table.getPageIdx() + 1) + "\n");
writer.write(table.toHtml());
}
} finally {
try {
writer.close();
} catch (Exception e) {
}
}
} catch (Exception e) {
logger.error(null, e);
Expand Down
87 changes: 30 additions & 57 deletions src/main/java/com/giaybac/traprange/PDFTableExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,18 @@
import com.google.common.collect.LinkedListMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Range;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.logging.Level;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;

/**
*
* @author Tho Mar 22, 2015 3:34:29 PM
Expand Down Expand Up @@ -224,18 +214,7 @@ private Table buildTable(int pageIdx, List<TextPosition> tableContent,
private TableRow buildRow(int rowIdx, List<TextPosition> rowContent, List<Range<Integer>> columnTrapRanges) {
TableRow retVal = new TableRow(rowIdx);
//Sort rowContent
Collections.sort(rowContent, new Comparator<TextPosition>() {
@Override
public int compare(TextPosition o1, TextPosition o2) {
int retVal = 0;
if (o1.getX() < o2.getX()) {
retVal = -1;
} else if (o1.getX() > o2.getX()) {
retVal = 1;
}
return retVal;
}
});
sortCellContent(rowContent);
int idx = 0;
int columnIdx = 0;
List<TextPosition> cellContent = new ArrayList<>();
Expand Down Expand Up @@ -264,18 +243,7 @@ public int compare(TextPosition o1, TextPosition o2) {
}

private TableCell buildCell(int columnIdx, List<TextPosition> cellContent) {
Collections.sort(cellContent, new Comparator<TextPosition>() {
@Override
public int compare(TextPosition o1, TextPosition o2) {
int retVal = 0;
if (o1.getX() < o2.getX()) {
retVal = -1;
} else if (o1.getX() > o2.getX()) {
retVal = 1;
}
return retVal;
}
});
sortCellContent(cellContent);
//String cellContentString = Joiner.on("").join(cellContent.stream().map(e -> e.getCharacter()).iterator());
StringBuilder cellContentBuilder = new StringBuilder();
for (TextPosition textPosition : cellContent) {
Expand All @@ -285,15 +253,26 @@ public int compare(TextPosition o1, TextPosition o2) {
return new TableCell(columnIdx, cellContentString);
}

/**
 * Sorts the given text positions in place by ascending X coordinate.
 *
 * <p>The ordering is delegated to {@link Float#compare(float, float)} so that it is a
 * total order. A hand-written {@code <} / {@code >} comparison reports NaN as equal to
 * every other value, which violates the comparator contract and can make the sort throw
 * {@link IllegalArgumentException}. With {@code Float.compare}, NaN sorts after all other
 * values and {@code -0.0f} sorts before {@code 0.0f}.
 *
 * @param cellContent text positions to reorder; the list itself is modified
 */
private static void sortCellContent(List<TextPosition> cellContent) {
    Collections.sort(cellContent, (left, right) -> Float.compare(left.getX(), right.getX()));
}

/**
 * Collects the text positions of a single page of {@code document}.
 *
 * @param pageId page identifier handed to the {@code TextPositionExtractor}
 * @return the list produced by {@code TextPositionExtractor.extract()}
 * @throws IOException if the extractor fails
 */
private List<TextPosition> extractTextPositions(int pageId) throws IOException {
    return new TextPositionExtractor(document, pageId).extract();
}

/**
 * Reports whether a line is registered in {@code pageNExceptedLinesMap}.
 *
 * <p>A line matches when the map contains the entry {@code (pageIdx, lineIdx)} or the
 * entry {@code (-1, lineIdx)}.
 * NOTE(review): page key -1 presumably means "every page" - confirm where the map is filled.
 *
 * @param pageIdx page index used as the map key
 * @param lineIdx line index looked up under that key
 * @return {@code true} if either entry is present
 */
private boolean isExceptedLine(int pageIdx, int lineIdx) {
boolean retVal = this.pageNExceptedLinesMap.containsEntry(pageIdx, lineIdx)
return this.pageNExceptedLinesMap.containsEntry(pageIdx, lineIdx)
|| this.pageNExceptedLinesMap.containsEntry(-1, lineIdx);
return retVal;
}

/**
Expand Down Expand Up @@ -324,7 +303,6 @@ private List<TextPosition> getTextsByLineRanges(List<Range<Integer>> lineRanges,
idx++;
}
}
//return
return retVal;
}

Expand All @@ -350,8 +328,7 @@ private List<Range<Integer>> getLineRanges(int pageId, List<TextPosition> pageCo
lineTrapRangeBuilder.addRange(lineRange);
}
List<Range<Integer>> lineTrapRanges = lineTrapRangeBuilder.build();
List<Range<Integer>> retVal = removeExceptedLines(pageId, lineTrapRanges);
return retVal;
return removeExceptedLines(pageId, lineTrapRanges);
}

private List<Range<Integer>> removeExceptedLines(int pageIdx, List<Range<Integer>> lineTrapRanges) {
Expand All @@ -363,7 +340,6 @@ private List<Range<Integer>> removeExceptedLines(int pageIdx, List<Range<Integer
retVal.add(lineTrapRanges.get(lineIdx));
}
}
//return
return retVal;
}

Expand Down Expand Up @@ -401,18 +377,15 @@ protected void writeString(String string, List<TextPosition> textPositions) thro
/**
 * Strips the page identified by {@code pageId}, then sorts the {@code textPositions}
 * field in place by ascending Y coordinate and returns it.
 *
 * @return the {@code textPositions} field after sorting
 * @throws IOException declared for the {@code stripPage} call
 */
private List<TextPosition> extract() throws IOException {
this.stripPage(pageId);
// sort by ascending Y; positions whose Y is neither smaller nor larger compare as equal
Collections.sort(textPositions, new Comparator<TextPosition>() {
@Override
public int compare(TextPosition o1, TextPosition o2) {
int retVal = 0;
if (o1.getY() < o2.getY()) {
retVal = -1;
} else if (o1.getY() > o2.getY()) {
retVal = 1;
}
return retVal;

Collections.sort(textPositions, (o1, o2) -> {
int retVal = 0;
if (o1.getY() < o2.getY()) {
retVal = -1;
} else if (o1.getY() > o2.getY()) {
retVal = 1;
}
return retVal;

});
return this.textPositions;
}
Expand Down
7 changes: 1 addition & 6 deletions src/main/java/com/giaybac/traprange/TrapRangeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,7 @@ public TrapRangeBuilder addRange(Range<Integer> range) {
public List<Range<Integer>> build() {
List<Range<Integer>> retVal = new ArrayList<>();
//order range by lower Bound
Collections.sort(ranges, new Comparator<Range>() {
@Override
public int compare(Range o1, Range o2) {
return o1.lowerEndpoint().compareTo(o2.lowerEndpoint());
}
});
Collections.sort(ranges, (Comparator<Range>) (o1, o2) -> o1.lowerEndpoint().compareTo(o2.lowerEndpoint()));

for (Range<Integer> range : ranges) {
if (retVal.isEmpty()) {
Expand Down
4 changes: 1 addition & 3 deletions src/main/java/com/giaybac/traprange/entity/TableRow.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,7 @@ public String toString() {
StringBuilder retVal = new StringBuilder();
int lastCellIdx = 0;
for (TableCell cell : cells) {
for (int idx2 = lastCellIdx; idx2 < cell.getIdx() - 1; idx2++) {
retVal.append(";");
}
retVal.append(";".repeat(Math.max(0, cell.getIdx() - 1 - lastCellIdx)));
if (cell.getIdx() > 0) {
retVal.append(";");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@

/**
* Java doc to be completed
*
*
* @author Jonathan Link
*
*
*/
public class PDFLayoutTextStripper extends PDFTextStripper {

Expand All @@ -82,7 +82,7 @@ public PDFLayoutTextStripper() throws IOException {
}

/**
*
*
* @param page page to parse
*/
@Override
Expand All @@ -92,21 +92,20 @@ public void processPage(PDPage page) throws IOException {
this.setCurrentPageWidth(pageRectangle.getWidth());
super.processPage(page);
this.previousTextPosition = null;
this.textLineList = new ArrayList<TextLine>();
this.textLineList = new ArrayList<>();
}
}

/**
 * Processes every list returned by {@code getCharactersByArticle()} and then writes the
 * accumulated text lines.
 *
 * <p>For each list: it is sorted with {@code sortTextPositionList}, then passed to
 * {@code iterateThroughTextList}. After all lists are handled, the result of
 * {@code getTextLineList()} is handed to {@code writeToOutputStream}.
 *
 * @throws IOException declared by the overridden method
 */
@Override
protected void writePage() throws IOException {
List<List<TextPosition>> charactersByArticle = super.getCharactersByArticle();
for( int i = 0; i < charactersByArticle.size(); i++) {
List<TextPosition> textList = charactersByArticle.get(i);
for (List<TextPosition> textList : charactersByArticle) {
try {
this.sortTextPositionList(textList);
} catch ( java.lang.IllegalArgumentException e) {
} catch (IllegalArgumentException e) {
// a failed sort is only reported on stderr; the list is still processed below
System.err.println(e);
}
this.iterateThroughTextList(textList.iterator()) ;
this.iterateThroughTextList(textList.iterator());
}
this.writeToOutputStream(this.getTextLineList());
}
Expand Down Expand Up @@ -150,15 +149,13 @@ private void iterateThroughTextList(Iterator<TextPosition> textIterator) {
List<TextPosition> textPositionList = new ArrayList<TextPosition>();

while ( textIterator.hasNext() ) {
TextPosition textPosition = (TextPosition)textIterator.next();
TextPosition textPosition = textIterator.next();
int numberOfNewLines = this.getNumberOfNewLinesFromPreviousTextPosition(textPosition);
if ( numberOfNewLines == 0 ) {
textPositionList.add(textPosition);
} else {
if (numberOfNewLines != 0) {
this.writeTextPositionList(textPositionList);
this.createNewEmptyNewLines(numberOfNewLines);
textPositionList.add(textPosition);
}
textPositionList.add(textPosition);
this.setPreviousTextPosition(textPosition);
}
if (!textPositionList.isEmpty()) {
Expand Down Expand Up @@ -459,14 +456,12 @@ private double numberOfSpacesBetweenTwoCharacters(final TextPosition textPositio
double previousTextXPosition = textPosition1.getX();
double previousTextWidth = textPosition1.getWidth();
double previousTextEndXPosition = (previousTextXPosition + previousTextWidth);
double numberOfSpaces = Math.abs(Math.round(textPosition2.getX() - previousTextEndXPosition));
return numberOfSpaces;
return (double) Math.abs(Math.round(textPosition2.getX() - previousTextEndXPosition));
}

/**
 * Returns the first {@code char} of the string reported by
 * {@code textPosition.getUnicode()}.
 *
 * NOTE(review): {@code charAt(0)} throws {@code StringIndexOutOfBoundsException} on an
 * empty string - confirm {@code getUnicode()} never returns one here.
 *
 * @param textPosition position whose unicode string is read
 * @return the char at index 0 of that string
 */
private char getCharacterFromTextPosition(final TextPosition textPosition) {
String string = textPosition.getUnicode();
char character = string.charAt(0);
return character;
return string.charAt(0);
}

private TextPosition getPreviousTextPosition() {
Expand Down
4 changes: 3 additions & 1 deletion src/test/java/com/giaybac/traprange/test/TestInvoice2.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.*;
import java.util.stream.Collectors;

/**
* How to run this file:
Expand Down Expand Up @@ -48,7 +49,8 @@ public void test() throws IOException {
"(\\d+\\/\\d+\\/\\d+\\s\\d+\\:\\d+)",
"([a-z0-9A-Z]{5,})"
};
String patternString = "\\s+" + String.join("\\s+", patternStrings) + "\\s+";

String patternString = "\\s+" + Arrays.asList(patternStrings).stream().collect(Collectors.joining("\\s+"))+ "\\s+";
Pattern p = Pattern.compile(patternString);
for (String line : lines) {
Matcher matcher = p.matcher(line);
Expand Down
10 changes: 5 additions & 5 deletions test-command-line.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,21 @@ rm -rf $home/result/*

# Sample 1
idx=1
java -jar traprange.lastest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -el "0,1,-1"
java -jar traprange.latest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -el "0,1,-1"

# Sample 2
idx=2
java -jar traprange.lastest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -el "0,1"
java -jar traprange.latest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -el "0,1"

# Sample 3
idx=3
java -jar traprange.lastest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -ep "0"
java -jar traprange.latest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -ep "0"

# Sample 4
idx=4
java -jar traprange.lastest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -el "0"
java -jar traprange.latest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -el "0"

# Sample 5
idx=5
java -jar traprange.lastest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -el "0@0,1@0"
java -jar traprange.latest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -el "0@0,1@0"