Skip to content

Commit c04a938

Browse files
authored
Merge commit from fork
* Security Patch * Throw Exception for EBCDIC Encoding * Mixed UTF-8 and UTF-16 Further mischief. I don't know if the examples truly are valid Xml, but PhpSpreadsheet is letting them sneak through.
1 parent 7c973ab commit c04a938

File tree

10 files changed

+77
-36
lines changed

10 files changed

+77
-36
lines changed

src/PhpSpreadsheet/Reader/Security/XmlScanner.php

Lines changed: 35 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66

77
class XmlScanner
88
{
9+
private const ENCODING_PATTERN = '/encoding\\s*=\\s*(["\'])(.+?)\\1/s';
10+
private const ENCODING_UTF7 = '/encoding\\s*=\\s*(["\'])UTF-7\\1/si';
11+
912
private string $pattern;
1013

1114
/** @var ?callable */
@@ -36,29 +39,41 @@ private static function forceString(mixed $arg): string
3639
private function toUtf8(string $xml): string
3740
{
3841
$charset = $this->findCharSet($xml);
42+
$foundUtf7 = $charset === 'UTF-7';
3943
if ($charset !== 'UTF-8') {
44+
$testStart = '/^.{0,4}\\s*<?xml/s';
45+
$startWithXml1 = preg_match($testStart, $xml);
4046
$xml = self::forceString(mb_convert_encoding($xml, 'UTF-8', $charset));
41-
42-
$charset = $this->findCharSet($xml);
43-
if ($charset !== 'UTF-8') {
44-
throw new Reader\Exception('Suspicious Double-encoded XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
47+
if ($startWithXml1 === 1 && preg_match($testStart, $xml) !== 1) {
48+
throw new Reader\Exception('Double encoding not permitted');
4549
}
50+
$foundUtf7 = $foundUtf7 || (preg_match(self::ENCODING_UTF7, $xml) === 1);
51+
$xml = preg_replace(self::ENCODING_PATTERN, '', $xml) ?? $xml;
52+
} else {
53+
$foundUtf7 = $foundUtf7 || (preg_match(self::ENCODING_UTF7, $xml) === 1);
54+
}
55+
if ($foundUtf7) {
56+
throw new Reader\Exception('UTF-7 encoding not permitted');
57+
}
58+
if (substr($xml, 0, Reader\Csv::UTF8_BOM_LEN) === Reader\Csv::UTF8_BOM) {
59+
$xml = substr($xml, Reader\Csv::UTF8_BOM_LEN);
4660
}
4761

4862
return $xml;
4963
}
5064

5165
private function findCharSet(string $xml): string
5266
{
53-
$patterns = [
54-
'/encoding\\s*=\\s*"([^"]*]?)"/',
55-
"/encoding\\s*=\\s*'([^']*?)'/",
56-
];
57-
58-
foreach ($patterns as $pattern) {
59-
if (preg_match($pattern, $xml, $matches)) {
60-
return strtoupper($matches[1]);
61-
}
67+
if (substr($xml, 0, 4) === "\x4c\x6f\xa7\x94") {
68+
throw new Reader\Exception('EBCDIC encoding not permitted');
69+
}
70+
$encoding = Reader\Csv::guessEncodingBom('', $xml);
71+
if ($encoding !== '') {
72+
return $encoding;
73+
}
74+
$xml = str_replace("\0", '', $xml);
75+
if (preg_match(self::ENCODING_PATTERN, $xml, $matches)) {
76+
return strtoupper($matches[2]);
6277
}
6378

6479
return 'UTF-8';
@@ -71,13 +86,15 @@ private function findCharSet(string $xml): string
7186
*/
7287
public function scan($xml): string
7388
{
89+
// Don't rely purely on libxml_disable_entity_loader()
90+
$pattern = '/\\0*' . implode('\\0*', str_split($this->pattern)) . '\\0*/';
91+
7492
$xml = "$xml";
93+
if (preg_match($pattern, $xml)) {
94+
throw new Reader\Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
95+
}
7596

7697
$xml = $this->toUtf8($xml);
77-
78-
// Don't rely purely on libxml_disable_entity_loader()
79-
$pattern = '/\\0?' . implode('\\0?', str_split($this->pattern)) . '\\0?/';
80-
8198
if (preg_match($pattern, $xml)) {
8299
throw new Reader\Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
83100
}
@@ -90,7 +107,7 @@ public function scan($xml): string
90107
}
91108

92109
/**
93-
* Scan theXML for use of <!ENTITY to prevent XXE/XEE attacks.
110+
* Scan the XML for use of <!ENTITY to prevent XXE/XEE attacks.
94111
*/
95112
public function scanFile(string $filestream): string
96113
{

src/PhpSpreadsheet/Reader/Xml.php

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
use PhpOffice\PhpSpreadsheet\Settings;
1919
use PhpOffice\PhpSpreadsheet\Shared\Date;
2020
use PhpOffice\PhpSpreadsheet\Shared\File;
21-
use PhpOffice\PhpSpreadsheet\Shared\StringHelper;
2221
use PhpOffice\PhpSpreadsheet\Spreadsheet;
2322
use PhpOffice\PhpSpreadsheet\Worksheet\SheetView;
2423
use PhpOffice\PhpSpreadsheet\Worksheet\Worksheet;
@@ -91,7 +90,8 @@ public function canRead(string $filename): bool
9190
];
9291

9392
// Open file
94-
$data = file_get_contents($filename) ?: '';
93+
$data = (string) file_get_contents($filename);
94+
$data = $this->getSecurityScannerOrThrow()->scan($data);
9595

9696
// Why?
9797
//$data = str_replace("'", '"', $data); // fix headers with single quote
@@ -106,14 +106,6 @@ public function canRead(string $filename): bool
106106
}
107107
}
108108

109-
// Retrieve charset encoding
110-
if (preg_match('/<?xml.*encoding=[\'"](.*?)[\'"].*?>/m', $data, $matches)) {
111-
$charSet = strtoupper($matches[1]);
112-
if (preg_match('/^ISO-8859-\d[\dL]?$/i', $charSet) === 1) {
113-
$data = StringHelper::convertEncoding($data, 'UTF-8', $charSet);
114-
$data = (string) preg_replace('/(<?xml.*encoding=[\'"]).*?([\'"].*?>)/um', '$1' . 'UTF-8' . '$2', $data, 1);
115-
}
116-
}
117109
$this->fileContents = $data;
118110

119111
return $valid;

tests/PhpSpreadsheetTests/Reader/Security/XmlScannerTest.php

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,11 @@ public static function providerValidXML(): array
3030
self::assertNotFalse($glob);
3131
foreach ($glob as $file) {
3232
$filename = realpath($file);
33-
$expectedResult = file_get_contents($file);
33+
$expectedResult = (string) file_get_contents($file);
34+
if (preg_match('/UTF-16(LE|BE)?/', $file, $matches) == 1) {
35+
$expectedResult = (string) mb_convert_encoding($expectedResult, 'UTF-8', $matches[0]);
36+
$expectedResult = preg_replace('/encoding\\s*=\\s*[\'"]UTF-\\d+(LE|BE)?[\'"]/', '', $expectedResult) ?? $expectedResult;
37+
}
3438
$tests[basename($file)] = [$filename, $expectedResult];
3539
}
3640

@@ -132,19 +136,47 @@ public function testEncodingAllowsMixedCase(): void
132136
self::assertSame($input, $output);
133137
}
134138

135-
public function testUtf7Whitespace(): void
139+
/**
140+
* @dataProvider providerInvalidXlsx
141+
*/
142+
public function testInvalidXlsx(string $filename, string $message): void
136143
{
137144
$this->expectException(ReaderException::class);
138-
$this->expectExceptionMessage('Double-encoded');
145+
$this->expectExceptionMessage($message);
139146
$reader = new Xlsx();
140-
$reader->load('tests/data/Reader/XLSX/utf7white.dontuse');
147+
$reader->load("tests/data/Reader/XLSX/$filename");
141148
}
142149

143-
public function testUtf8Entity(): void
150+
public static function providerInvalidXlsx(): array
151+
{
152+
return [
153+
['utf7white.dontuse', 'UTF-7 encoding not permitted'],
154+
['utf7quoteorder.dontuse', 'UTF-7 encoding not permitted'],
155+
['utf8and16.dontuse', 'Double encoding not permitted'],
156+
['utf8and16.entity.dontuse', 'Detected use of ENTITY'],
157+
['utf8entity.dontuse', 'Detected use of ENTITY'],
158+
['utf16entity.dontuse', 'Detected use of ENTITY'],
159+
['ebcdic.dontuse', 'EBCDIC encoding not permitted'],
160+
];
161+
}
162+
163+
/**
164+
* @dataProvider providerValidUtf16
165+
*/
166+
public function testValidUtf16(string $filename): void
144167
{
145-
$this->expectException(ReaderException::class);
146-
$this->expectExceptionMessage('Detected use of ENTITY');
147168
$reader = new Xlsx();
148-
$reader->load('tests/data/Reader/XLSX/utf8entity.dontuse');
169+
$spreadsheet = $reader->load("tests/data/Reader/XLSX/$filename");
170+
$sheet = $spreadsheet->getActiveSheet();
171+
self::assertSame(1, $sheet->getCell('A1')->getValue());
172+
$spreadsheet->disconnectWorksheets();
173+
}
174+
175+
public static function providerValidUtf16(): array
176+
{
177+
return [
178+
['utf16be.xlsx'],
179+
['utf16be.bom.xlsx'],
180+
];
149181
}
150182
}

tests/data/Reader/XLSX/ebcdic.dontuse

5.45 KB
Binary file not shown.
7.93 KB
Binary file not shown.

tests/data/Reader/XLSX/utf16be.xlsx

7.93 KB
Binary file not shown.
5.58 KB
Binary file not shown.
5.36 KB
Binary file not shown.
5.49 KB
Binary file not shown.
5.58 KB
Binary file not shown.

0 commit comments

Comments
 (0)