Skip to content

Commit 335ff32

Browse files
authored
Merge pull request #234 from packdat/Stream-Filtering
Improved stream-decoding
2 parents 7a2afc5 + 7088826 commit 335ff32

File tree

10 files changed

+203
-135
lines changed

10 files changed

+203
-135
lines changed

PdfSharpCore/Pdf.Advanced/PdfContent.cs

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -78,13 +78,13 @@ public bool Compressed
7878
{
7979
if (value)
8080
{
81-
PdfItem filter = Elements["/Filter"];
81+
PdfItem filter = Elements[PdfStream.Keys.Filter];
8282
if (filter == null)
8383
{
8484
byte[] bytes = Filtering.FlateDecode.Encode(Stream.Value, _document.Options.FlateEncodeMode);
8585
Stream.Value = bytes;
86-
Elements.SetInteger("/Length", Stream.Length);
87-
Elements.SetName("/Filter", "/FlateDecode");
86+
Elements.SetInteger(PdfStream.Keys.Length, Stream.Length);
87+
Elements.SetName(PdfStream.Keys.Filter, "/FlateDecode");
8888
}
8989
}
9090
}
@@ -97,15 +97,17 @@ void Decode()
9797
{
9898
if (Stream != null && Stream.Value != null)
9999
{
100-
PdfItem item = Elements["/Filter"];
100+
PdfItem item = Elements[PdfStream.Keys.Filter];
101101
if (item != null)
102102
{
103-
byte[] bytes = Filtering.Decode(Stream.Value, item);
103+
var decodeParms = Elements[PdfStream.Keys.DecodeParms];
104+
byte[] bytes = Filtering.Decode(Stream.Value, item, decodeParms);
104105
if (bytes != null)
105106
{
106107
Stream.Value = bytes;
107-
Elements.Remove("/Filter");
108-
Elements.SetInteger("/Length", Stream.Length);
108+
Elements.Remove(PdfStream.Keys.Filter);
109+
Elements.Remove(PdfStream.Keys.DecodeParms);
110+
Elements.SetInteger(PdfStream.Keys.Length, Stream.Length);
109111
}
110112
}
111113
}

PdfSharpCore/Pdf.Advanced/PdfCrossReferenceStream.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,10 @@ public PdfCrossReferenceStream(PdfDocument document)
4747

4848
public readonly List<CrossReferenceStreamEntry> Entries = new List<CrossReferenceStreamEntry>();
4949

50+
[DebuggerDisplay("{Type} {Field2} {Field3}")]
5051
public struct CrossReferenceStreamEntry
5152
{
52-
// Reference: TABLE 3.16 Entries in a cross-refernece stream / Page 109
53+
// Reference: TABLE 3.16 Entries in a cross-reference stream / Page 109
5354

5455
public uint Type; // 0, 1, or 2.
5556

PdfSharpCore/Pdf.Filters/Filter.cs

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,15 @@ namespace PdfSharpCore.Pdf.Filters
3737
/// </summary>
3838
public class FilterParms
{
    /// <summary>
    /// Initializes the filter parameters with the decoding-parameters of a stream.
    /// </summary>
    /// <param name="decodeParms">The decoding-parameters for a filter. May be null.</param>
    public FilterParms(PdfDictionary decodeParms)
    {
        this.DecodeParms = decodeParms;
    }

    /// <summary>
    /// Gets the decoding-parameters that were passed to the constructor. May be null.
    /// </summary>
    public PdfDictionary DecodeParms { get; private set; }
}
4250

4351
/// <summary>
@@ -68,9 +76,9 @@ public virtual byte[] Encode(string rawString)
6876
/// <summary>
/// Decodes the specified data, handing the optional decoding-parameters to the filter.
/// </summary>
/// <param name="data">The encoded data.</param>
/// <param name="decodeParms">The decoding-parameters for the filter. May be null.</param>
/// <returns>The result of the <see cref="FilterParms"/>-based Decode overload.</returns>
public byte[] Decode(byte[] data, PdfDictionary decodeParms)
{
    // Wrap the dictionary (even a null one) so the filter always receives a FilterParms instance.
    var filterParms = new FilterParms(decodeParms);
    return Decode(data, filterParms);
}
7583

7684
/// <summary>

PdfSharpCore/Pdf.Filters/Filtering.cs

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -191,27 +191,34 @@ public static byte[] Decode(byte[] data, string filterName)
191191
{
192192
Filter filter = GetFilter(filterName);
193193
if (filter != null)
194-
return filter.Decode(data, null);
194+
return filter.Decode(data, (PdfDictionary)null);
195195
return null;
196196
}
197197

198198
/// <summary>
199199
/// Decodes the data with the specified filter.
200200
/// </summary>
201-
public static byte[] Decode(byte[] data, PdfItem filterItem)
201+
public static byte[] Decode(byte[] data, PdfItem filterItem, PdfItem decodeParms)
202202
{
203203
byte[] result = null;
204-
if (filterItem is PdfName)
204+
if (filterItem is PdfName && (decodeParms == null || decodeParms is PdfDictionary))
205205
{
206206
Filter filter = GetFilter(filterItem.ToString());
207207
if (filter != null)
208-
result = filter.Decode(data);
208+
result = filter.Decode(data, decodeParms as PdfDictionary);
209209
}
210-
else if (filterItem is PdfArray)
210+
else if (filterItem is PdfArray itemArray && (decodeParms == null || decodeParms is PdfArray))
211211
{
212-
PdfArray array = (PdfArray)filterItem;
213-
foreach (PdfItem item in array)
214-
data = Decode(data, item);
212+
var decodeArray = decodeParms as PdfArray;
213+
// The array lengths of filter and decode parms should match. If they don't, return data unmodified
214+
if (decodeArray != null && decodeArray.Elements.Count != itemArray.Elements.Count)
215+
return data;
216+
for (var i = 0; i < itemArray.Elements.Count; i++)
217+
{
218+
var item = itemArray.Elements[i];
219+
var parms = decodeArray != null ? decodeArray.Elements[i] : null;
220+
data = Decode(data, item, parms);
221+
}
215222
result = data;
216223
}
217224
return result;

PdfSharpCore/Pdf.Filters/FlateDecode.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@ public override byte[] Decode(byte[] data, FilterParms parms)
9898
msOutput.Flush();
9999
if (msOutput.Length >= 0)
100100
{
101+
if (parms.DecodeParms != null)
102+
return StreamDecoder.Decode(msOutput.ToArray(), parms.DecodeParms);
101103
return msOutput.ToArray();
102104
}
103105
return null;

PdfSharpCore/Pdf.Filters/LzwDecode.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@ public override byte[] Decode(byte[] data, FilterParms parms)
101101

102102
if (outputStream.Length >= 0)
103103
{
104+
if (parms.DecodeParms != null)
105+
return StreamDecoder.Decode(outputStream.ToArray(), parms.DecodeParms);
104106
return outputStream.ToArray();
105107
}
106108
return null;
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
using PdfSharpCore.Pdf.IO;
2+
using System;
3+
4+
namespace PdfSharpCore.Pdf.Filters
5+
{
6+
internal static class PngFilter
7+
{
8+
/// <summary>
/// Reverses PNG-Filtering according to the PNG-specification<br></br>
/// see: https://datatracker.ietf.org/doc/html/rfc2083#section-6
/// </summary>
/// <remarks>
/// Every scanline in <paramref name="inData"/> consists of one filter-type byte
/// (0 = None, 1 = Sub, 2 = Up, 3 = Average, 4 = Paeth) followed by <paramref name="stride"/> filtered bytes.
/// The scanline "above" the first one is treated as all zeros.
/// Trailing bytes that do not form a complete scanline are ignored.
/// </remarks>
/// <param name="stride">The width of a scanline in bytes (not counting the leading filter-type byte)</param>
/// <param name="bpp">Bytes per pixel</param>
/// <param name="inData">The input data</param>
/// <param name="outData">The target array where the unfiltered data is stored.
/// It has to provide room for <paramref name="stride"/> bytes per complete scanline</param>
/// <exception cref="PdfReaderException">A scanline starts with a filter-type greater than 4</exception>
internal static void Unfilter(int stride, int bpp, byte[] inData, byte[] outData)
{
    var prevRow = new byte[stride]; // starts zeroed: there is no scanline above the first one
    var row = new byte[stride];
    var pos = 0;
    var outIndex = 0;
    // Only process complete scanlines (filter-type byte + stride bytes).
    // A truncated last scanline would otherwise make Array.Copy throw.
    while (inData.Length - pos > stride)
    {
        var filterType = inData[pos];
        if (filterType > 4)
            throw new PdfReaderException(string.Format("Unexpected Png-Predictor {0} in Xref Stream. Expected 0 to 4.", filterType));
        Array.Copy(inData, pos + 1, row, 0, stride);
        switch (filterType)
        {
            case 0: // None
                for (var i = 0; i < row.Length; i++)
                    outData[outIndex++] = row[i];
                break;
            case 1: // Sub
                for (var i = 0; i < row.Length; i++)
                {
                    // the first bpp bytes of a scanline have no pixel to their left
                    var left = i < bpp ? 0 : outData[outIndex - bpp];
                    outData[outIndex++] = (byte)(row[i] + left);
                }
                break;
            case 2: // Up
                for (var i = 0; i < row.Length; i++)
                    outData[outIndex++] = (byte)(row[i] + prevRow[i]);
                break;
            case 3: // Average
                for (var i = 0; i < row.Length; i++)
                {
                    var left = i < bpp ? 0 : outData[outIndex - bpp];
                    outData[outIndex++] = (byte)(row[i] + (byte)((left + prevRow[i]) / 2));
                }
                break;
            case 4: // Paeth
                for (var i = 0; i < row.Length; i++)
                {
                    var left = i < bpp ? (byte)0 : outData[outIndex - bpp];
                    var above = prevRow[i];
                    var aboveLeft = i < bpp ? (byte)0 : prevRow[i - bpp];
                    outData[outIndex++] = (byte)(row[i] + PaethPredictor(left, above, aboveLeft));
                }
                break;
        }
        // remember current (already unfiltered) scanline
        Array.Copy(outData, outIndex - stride, prevRow, 0, stride);
        pos += stride + 1; // each scanline is preceded by a predictor-byte
    }
}
68+
69+
// Paeth predictor as described in the PNG specification:
// https://datatracker.ietf.org/doc/html/rfc2083#page-36
// a = left, b = above, c = upper left. Returns whichever of the three is closest to a + b - c,
// preferring a, then b, then c on ties.
private static byte PaethPredictor(byte a, byte b, byte c)
{
    int estimate = a + b - c;
    int distanceLeft = Math.Abs(estimate - a);
    int distanceAbove = Math.Abs(estimate - b);
    int distanceUpperLeft = Math.Abs(estimate - c);

    if (distanceLeft > distanceAbove || distanceLeft > distanceUpperLeft)
        return distanceAbove <= distanceUpperLeft ? b : c;
    return a;
}
82+
}
83+
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
using PdfSharpCore.Pdf.IO;
2+
using System;
3+
4+
namespace PdfSharpCore.Pdf.Filters
5+
{
6+
internal static class StreamDecoder
7+
{
8+
// PdfReference, chapter 7.4.4.3
9+
10+
/// <summary>
/// Further decodes a stream of bytes that were processed by the Flate- or LZW-decoder.
/// </summary>
/// <remarks>
/// Reads /Predictor, /Colors, /BitsPerComponent and /Columns from <paramref name="decodeParms"/>.
/// Values smaller than 1 are replaced by the defaults 1, 1, 8 and 1 respectively.
/// Predictor 1 returns the data unchanged, predictors 10 to 15 apply the PNG unfiltering.
/// </remarks>
/// <param name="data">The data to decode</param>
/// <param name="decodeParms">Parameters for the decoder. If this is null, <paramref name="data"/> is returned unchanged</param>
/// <returns>The decoded data as a byte-array</returns>
/// <exception cref="PdfReaderException">The predictor or the number of bits per component is invalid,
/// or the parameters describe a scanline that is too large</exception>
/// <exception cref="NotImplementedException">The TIFF predictor (2) was requested</exception>
public static byte[] Decode(byte[] data, PdfDictionary decodeParms)
{
    if (decodeParms == null)
        return data;

    var predictor = decodeParms.Elements.GetInteger("/Predictor");
    var colors = decodeParms.Elements.GetInteger("/Colors");
    var bpc = decodeParms.Elements.GetInteger("/BitsPerComponent");
    var columns = decodeParms.Elements.GetInteger("/Columns");

    // set up defaults according to the spec
    if (predictor < 1)
        predictor = 1;
    if (colors < 1)
        colors = 1;
    if (bpc < 1)
        bpc = 8;
    if (columns < 1)
        columns = 1;

    if (predictor == 1) // no prediction, return data as is
        return data;

    // TIFF predictor. TODO: implement
    if (predictor == 2)
        throw new NotImplementedException("TIFF predictor is not implemented");

    // PNG predictors
    if (predictor >= 10 && predictor <= 15)
    {
        if (bpc != 1 && bpc != 2 && bpc != 4 && bpc != 8 && bpc != 16)
            throw new PdfReaderException("Invalid number of bits per component");

        // The values come from the (untrusted) document. Do the math in 64 bit and reject
        // rows that do not fit into an int, instead of silently overflowing into a negative
        // stride (which would surface as an OverflowException or DivideByZeroException below).
        long bitsPerPixel = (long)bpc * colors;
        if (columns > (int.MaxValue - 7) / bitsPerPixel)
            throw new PdfReaderException("Invalid decode parameters");

        var bytesPerPixel = (int)((bitsPerPixel + 7) / 8);
        var stride = (int)((bitsPerPixel * columns + 7) / 8);
        // each scanline is preceded by one filter-type byte
        var rows = data.Length / (stride + 1);
        var unfilteredData = new byte[rows * stride];
        PngFilter.Unfilter(stride, bytesPerPixel, data, unfilteredData);
        return unfilteredData;
    }

    throw new PdfReaderException("Invalid predictor " + predictor);
}
59+
}
60+
}

PdfSharpCore/Pdf.IO/Parser.cs

Lines changed: 5 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,12 @@
3535
using PdfSharpCore.Exceptions;
3636
using PdfSharpCore.Internal;
3737
using PdfSharpCore.Pdf.Advanced;
38-
using PdfSharpCore.Pdf.Internal;
3938
using PdfSharpCore.Pdf.IO.enums;
4039

4140
namespace PdfSharpCore.Pdf.IO
4241
{
4342
/*
44-
Direct and indireckt objects
43+
Direct and indirect objects
4544
4645
* If a simple object (boolean, integer, number, date, string, rectangle etc.) is referenced indirect,
4746
the parser reads this object immediately and consumes the indirection.
@@ -52,7 +51,7 @@ is returned.
5251
* If a composite object is a direct object, no PdfReference is created and the object is
5352
parsed immediately.
5453
55-
* A refernece to a non existing object is specified as legal, therefore null is returned.
54+
* A reference to a non existing object is specified as legal, therefore null is returned.
5655
*/
5756

5857
/// <summary>
@@ -1256,16 +1255,7 @@ private PdfTrailer ReadXRefStream(PdfCrossReferenceTable xrefTable)
12561255
Debug.Assert(xrefStream.Stream != null);
12571256
//string sValue = new RawEncoding().GetString(xrefStream.Stream.UnfilteredValue,);
12581257
//sValue.GetType();
1259-
byte[] bytesRaw = xrefStream.Stream.UnfilteredValue;
1260-
byte[] bytes = bytesRaw;
1261-
1262-
// HACK: Should be done in UnfilteredValue.
1263-
if (xrefStream.Stream.HasDecodeParams)
1264-
{
1265-
int predictor = xrefStream.Stream.DecodePredictor;
1266-
int columns = xrefStream.Stream.DecodeColumns;
1267-
bytes = DecodeCrossReferenceStream(bytesRaw, columns, predictor);
1268-
}
1258+
var bytes = xrefStream.Stream.UnfilteredValue;
12691259

12701260
#if DEBUG_
12711261
for (int idx = 0; idx < bytes.Length; idx++)
@@ -1351,7 +1341,7 @@ private PdfTrailer ReadXRefStream(PdfCrossReferenceTable xrefTable)
13511341
//// (PDF Reference Implementation Notes 15).
13521342

13531343
int position = (int)item.Field2;
1354-
objectID = ReadObjectNumber(position);
1344+
objectID = ReadObjectNumber(position);
13551345
#if DEBUG
13561346
if (objectID.ObjectNumber == 1074)
13571347
GetType();
@@ -1364,7 +1354,7 @@ private PdfTrailer ReadXRefStream(PdfCrossReferenceTable xrefTable)
13641354
#if DEBUG
13651355
GetType();
13661356
#endif
1367-
// Add iref for all uncrompressed objects.
1357+
// Add iref for all uncompressed objects.
13681358
xrefTable.Add(new PdfReference(objectID, position));
13691359

13701360
}
@@ -1771,44 +1761,6 @@ private class ParserState
17711761
public Symbol Symbol;
17721762
}
17731763

1774-
private byte[] DecodeCrossReferenceStream(byte[] bytes, int columns, int predictor)
1775-
{
1776-
int size = bytes.Length;
1777-
if (predictor < 10 || predictor > 15)
1778-
throw new ArgumentException("Invalid predictor.", "predictor");
1779-
1780-
int rowSizeRaw = columns + 1;
1781-
1782-
if (size % rowSizeRaw != 0)
1783-
throw new ArgumentException("Columns and size of array do not match.");
1784-
1785-
int rows = size / rowSizeRaw;
1786-
1787-
byte[] result = new byte[rows * columns];
1788-
#if DEBUG
1789-
for (int i = 0; i < result.Length; ++i)
1790-
result[i] = 88;
1791-
#endif
1792-
1793-
for (int row = 0; row < rows; ++row)
1794-
{
1795-
if (bytes[row * rowSizeRaw] != 2)
1796-
throw new ArgumentException("Invalid predictor in array.");
1797-
1798-
for (int col = 0; col < columns; ++col)
1799-
{
1800-
// Copy data for first row.
1801-
if (row == 0)
1802-
result[row * columns + col] = bytes[row * rowSizeRaw + col + 1];
1803-
else
1804-
{
1805-
// For other rows, add previous row.
1806-
result[row * columns + col] = (byte)(result[row * columns - columns + col] + bytes[row * rowSizeRaw + col + 1]);
1807-
}
1808-
}
1809-
}
1810-
return result;
1811-
}
18121764

18131765
private readonly PdfDocument _document;
18141766
private readonly Lexer _lexer;

0 commit comments

Comments
 (0)