diff --git a/.gitignore b/.gitignore index 94eab12..09b2dfd 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ pom.xml.tag pom.xml.releaseBackup pom.xml.next release.properties +_Docs/result/** diff --git a/pom.xml b/pom.xml index 13bcf4a..422a62a 100644 --- a/pom.xml +++ b/pom.xml @@ -1,5 +1,5 @@ - 4.0.0 com.giaybac @@ -8,33 +8,33 @@ jar UTF-8 - 1.7 - 1.7 + 1.11 + 1.11 com.google.guava guava - 18.0 + 30.1-jre org.apache.pdfbox pdfbox - 2.0.6 + 2.0.22 org.slf4j slf4j-api - 1.7.12 + 2.0.0-alpha1 org.slf4j slf4j-log4j12 - 1.7.12 + 2.0.0-alpha1 diff --git a/src/main/java/com/giaybac/traprange/MAIN.java b/src/main/java/com/giaybac/traprange/MAIN.java index 60f064a..39fe4f3 100644 --- a/src/main/java/com/giaybac/traprange/MAIN.java +++ b/src/main/java/com/giaybac/traprange/MAIN.java @@ -86,17 +86,11 @@ private static void extractTables(String[] args) { //begin parsing pdf file List tables = extractor.extract(); - Writer writer = new OutputStreamWriter(new FileOutputStream(out), "UTF-8"); - try { + try (Writer writer = new OutputStreamWriter(new FileOutputStream(out), "UTF-8")) { for (Table table : tables) { writer.write("Page: " + (table.getPageIdx() + 1) + "\n"); writer.write(table.toHtml()); } - } finally { - try { - writer.close(); - } catch (Exception e) { - } } } catch (Exception e) { logger.error(null, e); diff --git a/src/main/java/com/giaybac/traprange/PDFTableExtractor.java b/src/main/java/com/giaybac/traprange/PDFTableExtractor.java index 075fb57..8d7358b 100755 --- a/src/main/java/com/giaybac/traprange/PDFTableExtractor.java +++ b/src/main/java/com/giaybac/traprange/PDFTableExtractor.java @@ -12,28 +12,18 @@ import com.google.common.collect.LinkedListMultimap; import com.google.common.collect.Multimap; import com.google.common.collect.Range; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStreamWriter; -import java.io.Writer; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.logging.Level; import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; + /** * * @author Tho Mar 22, 2015 3:34:29 PM @@ -224,18 +214,7 @@ private Table buildTable(int pageIdx, List tableContent, private TableRow buildRow(int rowIdx, List rowContent, List> columnTrapRanges) { TableRow retVal = new TableRow(rowIdx); //Sort rowContent - Collections.sort(rowContent, new Comparator() { - @Override - public int compare(TextPosition o1, TextPosition o2) { - int retVal = 0; - if (o1.getX() < o2.getX()) { - retVal = -1; - } else if (o1.getX() > o2.getX()) { - retVal = 1; - } - return retVal; - } - }); + sortCellContent(rowContent); int idx = 0; int columnIdx = 0; List cellContent = new ArrayList<>(); @@ -264,18 +243,7 @@ public int compare(TextPosition o1, TextPosition o2) { } private TableCell buildCell(int columnIdx, List cellContent) { - Collections.sort(cellContent, new Comparator() { - @Override - public int compare(TextPosition o1, TextPosition o2) { - int retVal = 0; - if (o1.getX() < o2.getX()) { - retVal = -1; - } else if (o1.getX() > o2.getX()) { - retVal = 1; - } - return retVal; - } - }); + sortCellContent(cellContent); //String cellContentString = Joiner.on("").join(cellContent.stream().map(e -> e.getCharacter()).iterator()); StringBuilder cellContentBuilder = new StringBuilder(); for (TextPosition textPosition : cellContent) { @@ -285,15 +253,26 @@ public int compare(TextPosition o1, TextPosition o2) { return new TableCell(columnIdx, cellContentString); } + private static void sortCellContent(List cellContent) { + Collections.sort(cellContent, (o1, o2) -> { + int retVal = 0; + if (o1.getX() < o2.getX()) { + retVal = -1; + } else if (o1.getX() > o2.getX()) { + retVal = 1; + } + return retVal; + }); + } + private List extractTextPositions(int pageId) throws IOException { TextPositionExtractor extractor = new TextPositionExtractor(document, pageId); return extractor.extract(); } private boolean isExceptedLine(int pageIdx, int lineIdx) { - boolean retVal = this.pageNExceptedLinesMap.containsEntry(pageIdx, lineIdx) + return this.pageNExceptedLinesMap.containsEntry(pageIdx, lineIdx) || this.pageNExceptedLinesMap.containsEntry(-1, lineIdx); - return retVal; } /** @@ -324,7 +303,6 @@ private List getTextsByLineRanges(List> lineRanges, idx++; } } - //return return retVal; } @@ -350,8 +328,7 @@ private List> getLineRanges(int pageId, List pageCo lineTrapRangeBuilder.addRange(lineRange); } List> lineTrapRanges = lineTrapRangeBuilder.build(); - List> retVal = removeExceptedLines(pageId, lineTrapRanges); - return retVal; + return removeExceptedLines(pageId, lineTrapRanges); } private List> removeExceptedLines(int pageIdx, List> lineTrapRanges) { @@ -363,7 +340,6 @@ private List> removeExceptedLines(int pageIdx, List textPositions) thro private List extract() throws IOException { this.stripPage(pageId); //sort - Collections.sort(textPositions, new Comparator() { - @Override - public int compare(TextPosition o1, TextPosition o2) { - int retVal = 0; - if (o1.getY() < o2.getY()) { - retVal = -1; - } else if (o1.getY() > o2.getY()) { - retVal = 1; - } - return retVal; - + Collections.sort(textPositions, (o1, o2) -> { + int retVal = 0; + if (o1.getY() < o2.getY()) { + retVal = -1; + } else if (o1.getY() > o2.getY()) { + retVal = 1; } + return retVal; + }); return this.textPositions; } diff --git a/src/main/java/com/giaybac/traprange/TrapRangeBuilder.java b/src/main/java/com/giaybac/traprange/TrapRangeBuilder.java index f5704fc..18880af 100644 --- a/src/main/java/com/giaybac/traprange/TrapRangeBuilder.java +++ b/src/main/java/com/giaybac/traprange/TrapRangeBuilder.java @@ -34,12 +34,7 @@ public TrapRangeBuilder addRange(Range range) { public List> build() { List> retVal = new ArrayList<>(); //order range by lower Bound - Collections.sort(ranges, new Comparator() { - @Override - public int compare(Range o1, Range o2) { - return o1.lowerEndpoint().compareTo(o2.lowerEndpoint()); - } - }); + Collections.sort(ranges, (Comparator) (o1, o2) -> o1.lowerEndpoint().compareTo(o2.lowerEndpoint())); for (Range range : ranges) { if (retVal.isEmpty()) { diff --git a/src/main/java/com/giaybac/traprange/entity/TableRow.java b/src/main/java/com/giaybac/traprange/entity/TableRow.java index b7b1fa1..75c1469 100644 --- a/src/main/java/com/giaybac/traprange/entity/TableRow.java +++ b/src/main/java/com/giaybac/traprange/entity/TableRow.java @@ -33,9 +33,7 @@ public String toString() { StringBuilder retVal = new StringBuilder(); int lastCellIdx = 0; for (TableCell cell : cells) { - for (int idx2 = lastCellIdx; idx2 < cell.getIdx() - 1; idx2++) { - retVal.append(";"); - } + retVal.append(";".repeat(Math.max(0, cell.getIdx() - 1 - lastCellIdx))); if (cell.getIdx() > 0) { retVal.append(";"); } diff --git a/src/main/java/com/giaybac/traprange/invoice/PDFLayoutTextStripper.java b/src/main/java/com/giaybac/traprange/invoice/PDFLayoutTextStripper.java index 9e4ef40..cdd24dd 100644 --- a/src/main/java/com/giaybac/traprange/invoice/PDFLayoutTextStripper.java +++ b/src/main/java/com/giaybac/traprange/invoice/PDFLayoutTextStripper.java @@ -59,9 +59,9 @@ /** * Java doc to be completed -* +* * @author Jonathan Link -* +* */ public class PDFLayoutTextStripper extends PDFTextStripper { @@ -82,7 +82,7 @@ public PDFLayoutTextStripper() throws IOException { } /** - * + * * @param page page to parse */ @Override @@ -92,21 +92,20 @@ public void processPage(PDPage page) throws IOException { this.setCurrentPageWidth(pageRectangle.getWidth()); super.processPage(page); this.previousTextPosition = null; - this.textLineList = new ArrayList(); + this.textLineList = new ArrayList<>(); } } @Override protected void writePage() throws IOException { List> charactersByArticle = super.getCharactersByArticle(); - for( int i = 0; i < charactersByArticle.size(); i++) { - List textList = charactersByArticle.get(i); + for (List textList : charactersByArticle) { try { this.sortTextPositionList(textList); - } catch ( java.lang.IllegalArgumentException e) { + } catch (IllegalArgumentException e) { System.err.println(e); } - this.iterateThroughTextList(textList.iterator()) ; + this.iterateThroughTextList(textList.iterator()); } this.writeToOutputStream(this.getTextLineList()); } @@ -150,15 +149,13 @@ private void iterateThroughTextList(Iterator textIterator) { List textPositionList = new ArrayList(); while ( textIterator.hasNext() ) { - TextPosition textPosition = (TextPosition)textIterator.next(); + TextPosition textPosition = textIterator.next(); int numberOfNewLines = this.getNumberOfNewLinesFromPreviousTextPosition(textPosition); - if ( numberOfNewLines == 0 ) { - textPositionList.add(textPosition); - } else { + if (numberOfNewLines != 0) { this.writeTextPositionList(textPositionList); this.createNewEmptyNewLines(numberOfNewLines); - textPositionList.add(textPosition); } + textPositionList.add(textPosition); this.setPreviousTextPosition(textPosition); } if (!textPositionList.isEmpty()) { @@ -459,14 +456,12 @@ private double numberOfSpacesBetweenTwoCharacters(final TextPosition textPositio double previousTextXPosition = textPosition1.getX(); double previousTextWidth = textPosition1.getWidth(); double previousTextEndXPosition = (previousTextXPosition + previousTextWidth); - double numberOfSpaces = Math.abs(Math.round(textPosition2.getX() - previousTextEndXPosition)); - return numberOfSpaces; + return (double) Math.abs(Math.round(textPosition2.getX() - previousTextEndXPosition)); } private char getCharacterFromTextPosition(final TextPosition textPosition) { String string = textPosition.getUnicode(); - char character = string.charAt(0); - return character; + return string.charAt(0); } private TextPosition getPreviousTextPosition() { diff --git a/src/test/java/com/giaybac/traprange/test/TestInvoice2.java b/src/test/java/com/giaybac/traprange/test/TestInvoice2.java index 9d52e94..64ea99f 100644 --- a/src/test/java/com/giaybac/traprange/test/TestInvoice2.java +++ b/src/test/java/com/giaybac/traprange/test/TestInvoice2.java @@ -13,6 +13,7 @@ import java.util.regex.Pattern; import java.util.regex.Matcher; import java.util.*; +import java.util.stream.Collectors; /** * How to run this file: @@ -48,7 +49,8 @@ public void test() throws IOException { "(\\d+\\/\\d+\\/\\d+\\s\\d+\\:\\d+)", "([a-z0-9A-Z]{5,})" }; - String patternString = "\\s+" + String.join("\\s+", patternStrings) + "\\s+"; + + String patternString = "\\s+" + Arrays.asList(patternStrings).stream().collect(Collectors.joining("\\s+"))+ "\\s+"; Pattern p = Pattern.compile(patternString); for (String line : lines) { Matcher matcher = p.matcher(line); diff --git a/test-command-line.sh b/test-command-line.sh index 648bbee..3b27440 100644 --- a/test-command-line.sh +++ b/test-command-line.sh @@ -4,21 +4,21 @@ rm -rf $home/result/* # Sample 1 idx=1 -java -jar traprange.lastest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -el "0,1,-1" +java -jar traprange.latest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -el "0,1,-1" # Sample 2 idx=2 -java -jar traprange.lastest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -el "0,1" +java -jar traprange.latest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -el "0,1" # Sample 3 idx=3 -java -jar traprange.lastest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -ep "0" +java -jar traprange.latest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -ep "0" # Sample 4 idx=4 -java -jar traprange.lastest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -el "0" +java -jar traprange.latest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -el "0" # Sample 5 idx=5 -java -jar traprange.lastest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -el "0@0,1@0" +java -jar traprange.latest.jar -in "$home/sample-$idx.pdf" -out "$home/result/sample-$idx.html" -el "0@0,1@0"