Fix processing the end of an inline image.

ars18wrw · ars18wrw · commit e0df12db9cd8 · 2018-06-15T17:18:04.000+03:00
DEVSIX-1914
diff --git a/kernel/src/main/java/com/itextpdf/kernel/pdf/canvas/parser/util/InlineImageParsingUtils.java b/kernel/src/main/java/com/itextpdf/kernel/pdf/canvas/parser/util/InlineImageParsingUtils.java
@@ -43,8 +43,8 @@ This file is part of the iText (R) project.
  */
 package com.itextpdf.kernel.pdf.canvas.parser.util;
 
-import com.itextpdf.kernel.PdfException;
 import com.itextpdf.io.source.PdfTokenizer;
+import com.itextpdf.kernel.PdfException;
 import com.itextpdf.kernel.pdf.PdfArray;
 import com.itextpdf.kernel.pdf.PdfDictionary;
 import com.itextpdf.kernel.pdf.PdfName;
@@ -53,8 +53,9 @@ This file is part of the iText (R) project.
 import com.itextpdf.kernel.pdf.PdfReader;
 import com.itextpdf.kernel.pdf.PdfStream;
 import com.itextpdf.kernel.pdf.filters.DoNothingFilter;
-import com.itextpdf.kernel.pdf.filters.IFilterHandler;
 import com.itextpdf.kernel.pdf.filters.FilterHandlers;
+import com.itextpdf.kernel.pdf.filters.FlateDecodeFilter;
+import com.itextpdf.kernel.pdf.filters.IFilterHandler;
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
@@ -67,6 +68,8 @@ This file is part of the iText (R) project.
  */
 public final class InlineImageParsingUtils {
 
+    private static final byte[] EI = new byte[]{'E', 'I'};
+
     private InlineImageParsingUtils() {
     }
 
@@ -341,54 +344,30 @@ private static byte[] parseSamples(PdfDictionary imageDictionary, PdfDictionary
         }
 
 
-        // read all content until we reach an EI operator surrounded by whitespace.
-        // The following algorithm has two potential issues: what if the image stream
-        // contains <ws>EI<ws> ?
-        // Plus, there are some streams that don't have the <ws> before the EI operator
-        // it sounds like we would have to actually decode the content stream, which
-        // I'd rather avoid right now.
+        // Read all content until we reach an EI operator followed by whitespace,
+        // then decode the bytes collected so far to check that they really are the complete image data.
         ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        ByteArrayOutputStream accumulated = new ByteArrayOutputStream();
         int ch;
         int found = 0;
         PdfTokenizer tokeniser = ps.getTokeniser();
-
         while ((ch = tokeniser.read()) != -1) {
-            if (found == 0 && PdfTokenizer.isWhitespace(ch)) {
-                found++;
-                accumulated.write(ch);
-            } else if (found == 1 && ch == 'E') {
-                found++;
-                accumulated.write(ch);
-            } else if (found == 1 && PdfTokenizer.isWhitespace(ch)) {
-                // this clause is needed if we have a white space character that is part of the image data
-                // followed by a whitespace character that precedes the EI operator.  In this case, we need
-                // to flush the first whitespace, then treat the current whitespace as the first potential
-                // character for the end of stream check.  Note that we don't increment 'found' here.
-                baos.write(accumulated.toByteArray());
-                accumulated.reset();
-                accumulated.write(ch);
-            } else if (found == 2 && ch == 'I') {
-                found++;
-                accumulated.write(ch);
-            } else if (found == 3 && PdfTokenizer.isWhitespace(ch)) {
-                byte[] tmp = baos.toByteArray();
-                if (inlineImageStreamBytesAreComplete(tmp, imageDictionary)) {
-                    return tmp;
-                }
-                baos.write(accumulated.toByteArray());
-                accumulated.reset();
-
-                baos.write(ch);
-                found = 0;
-
+            if (ch == 'E') {
+                baos.write(EI, 0, found); // probably some bytes were preserved so write them
+                found = 1; // just preserve 'E' and do not write it immediately
+            } else if (found == 1 && ch == 'I') {
+                found = 2; // just preserve 'EI' and do not write it immediately
             } else {
-                baos.write(accumulated.toByteArray());
-                accumulated.reset();
-
+                if (found == 2 && PdfTokenizer.isWhitespace(ch)) {
+                    byte[] tmp = baos.toByteArray();
+                    if (inlineImageStreamBytesAreComplete(tmp, imageDictionary)) {
+                        return tmp;
+                    }
+                }
+                baos.write(EI, 0, found); // probably some bytes were preserved so write them
                 baos.write(ch);
                 found = 0;
             }
+
         }
         throw new InlineImageParseException(PdfException.CannotFindImageDataOrEI);
     }
@@ -418,6 +397,7 @@ private static boolean inlineImageStreamBytesAreComplete(byte[] samples, PdfDict
             filters.put(PdfName.DCTDecode, stubfilter);
             filters.put(PdfName.JBIG2Decode, stubfilter);
             filters.put(PdfName.JPXDecode, stubfilter);
+            ((FlateDecodeFilter) filters.get(PdfName.FlateDecode)).setStrictDecoding(true);
             PdfReader.decodeBytes(samples, imageDictionary, filters);
         } catch (Exception ex) {
             return false;
diff --git a/kernel/src/main/java/com/itextpdf/kernel/pdf/filters/FlateDecodeFilter.java b/kernel/src/main/java/com/itextpdf/kernel/pdf/filters/FlateDecodeFilter.java
@@ -60,19 +60,61 @@ This file is part of the iText (R) project.
  */
 public class FlateDecodeFilter implements IFilterHandler {
 
+    /**
+     * Defines how the corrupted streams should be treated.
+     */
+    private boolean strictDecoding = false;
+
+    /**
+     * Creates a FlateDecodeFilter.
+     */
+    public FlateDecodeFilter() {
+        this(false);
+    }
+
+    /**
+     * Creates a FlateDecodeFilter.
+     *
+     * @param strictDecoding {@code true} if the decoder must not try to read a corrupted stream, {@code false} if it may
+     */
+    public FlateDecodeFilter(boolean strictDecoding) {
+        this.strictDecoding = strictDecoding;
+    }
+
+    /**
+     * Checks whether the decoder is strict, i.e. will not try to read a corrupted stream.
+     *
+     * @return true if the decoder will not try to read a corrupted stream (strict), otherwise false
+     */
+    public boolean isStrictDecoding() {
+        return strictDecoding;
+    }
+
+    /**
+     * Defines how the corrupted streams should be treated.
+     *
+     * @param strict true if the decoder should not try to read a corrupted stream, otherwise false
+     * @return the decoder
+     */
+    public FlateDecodeFilter setStrictDecoding(boolean strict) {
+        this.strictDecoding = strict;
+        return this;
+    }
+
     @Override
     public byte[] decode(byte[] b, PdfName filterName, PdfObject decodeParams, PdfDictionary streamDictionary) {
         byte[] res = flateDecode(b, true);
-        if (res == null)
+        if (res == null && !strictDecoding) {
             res = flateDecode(b, false);
+        }
         b = decodePredictor(res, decodeParams);
         return b;
     }
 
     /**
      * A helper to flateDecode.
      *
-     * @param in the input data
+     * @param in     the input data
      * @param strict {@code true} to read a correct stream. {@code false} to try to read a corrupted stream.
      * @return the decoded data
      */
@@ -89,45 +131,44 @@ public static byte[] flateDecode(byte[] in, boolean strict) {
             zip.close();
             out.close();
             return out.toByteArray();
-        }
-        catch (Exception e) {
+        } catch (Exception e) {
             if (strict)
                 return null;
             return out.toByteArray();
         }
     }
 
     /**
-     * @param in Input byte array.
+     * @param in           Input byte array.
      * @param decodeParams PdfDictionary of decodeParams.
      * @return a byte array
      */
     public static byte[] decodePredictor(byte[] in, PdfObject decodeParams) {
         if (decodeParams == null || decodeParams.getType() != PdfObject.DICTIONARY)
             return in;
-        PdfDictionary dic = (PdfDictionary)decodeParams;
+        PdfDictionary dic = (PdfDictionary) decodeParams;
         PdfObject obj = dic.get(PdfName.Predictor);
         if (obj == null || obj.getType() != PdfObject.NUMBER)
             return in;
-        int predictor = ((PdfNumber)obj).intValue();
+        int predictor = ((PdfNumber) obj).intValue();
         if (predictor < 10 && predictor != 2)
             return in;
         int width = 1;
         obj = dic.get(PdfName.Columns);
         if (obj != null && obj.getType() == PdfObject.NUMBER)
-            width = ((PdfNumber)obj).intValue();
+            width = ((PdfNumber) obj).intValue();
         int colors = 1;
         obj = dic.get(PdfName.Colors);
         if (obj != null && obj.getType() == PdfObject.NUMBER)
-            colors = ((PdfNumber)obj).intValue();
+            colors = ((PdfNumber) obj).intValue();
         int bpc = 8;
         obj = dic.get(PdfName.BitsPerComponent);
         if (obj != null && obj.getType() == PdfObject.NUMBER)
-            bpc = ((PdfNumber)obj).intValue();
+            bpc = ((PdfNumber) obj).intValue();
         DataInputStream dataStream = new DataInputStream(new ByteArrayInputStream(in));
         ByteArrayOutputStream fout = new ByteArrayOutputStream(in.length);
         int bytesPerPixel = colors * bpc / 8;
-        int bytesPerRow = (colors * width * bpc + 7)/8;
+        int bytesPerRow = (colors * width * bpc + 7) / 8;
         byte[] curr = new byte[bytesPerRow];
         byte[] prior = new byte[bytesPerRow];
         if (predictor == 2) {
@@ -136,7 +177,7 @@ public static byte[] decodePredictor(byte[] in, PdfObject decodeParams) {
                 for (int row = 0; row < numRows; row++) {
                     int rowStart = row * bytesPerRow;
                     for (int col = bytesPerPixel; col < bytesPerRow; col++) {
-                        in[rowStart + col] = (byte)(in[rowStart + col] + in[rowStart + col - bytesPerPixel]);
+                        in[rowStart + col] = (byte) (in[rowStart + col] + in[rowStart + col - bytesPerPixel]);
                     }
                 }
             }
@@ -174,7 +215,7 @@ public static byte[] decodePredictor(byte[] in, PdfObject decodeParams) {
                         curr[i] += (byte) (prior[i] / 2);
                     }
                     for (int i = bytesPerPixel; i < bytesPerRow; i++) {
-                        curr[i] += (byte) (((curr[i - bytesPerPixel] & 0xff) + (prior[i] & 0xff))/2);
+                        curr[i] += (byte) (((curr[i - bytesPerPixel] & 0xff) + (prior[i] & 0xff)) / 2);
                     }
                     break;
                 case 4: //PNG_FILTER_PAETH
@@ -201,7 +242,7 @@ public static byte[] decodePredictor(byte[] in, PdfObject decodeParams) {
                         } else {
                             ret = c;
                         }
-                        curr[i] += (byte)ret;
+                        curr[i] += (byte) ret;
                     }
                     break;
                 default:
@@ -210,10 +251,9 @@ public static byte[] decodePredictor(byte[] in, PdfObject decodeParams) {
             }
             try {
                 fout.write(curr);
-            }
-            catch (IOException ioe) {
+            } catch (IOException ioe) {
                 // Never happens
-                assert true: "Happens!";
+                assert true : "Happens!";
             }
 
             // Swap curr and prior