Skip to content

Commit 5de8298

Browse files
authored
Html Reader Not Handling non-ASCII Data Correctly (#2943)
* Html Reader Not Handling non-ASCII Data Correctly Fix #2942. Code was changed by #2894 because PHP8.2 will deprecate how it was being done. See linked issue for more details. Dom loadhtml assumes ISO-8859-1 in the absence of a charset attribute or equivalent, and there is no way to override that assumption. Sigh. The suggested replacements are unsuitable in one way or another. I think this will work with minimal disruption (replace ampersand, less than, and greater than with entities representing illegal characters, then use htmlentities, then restore ampersand, less than, and greater than). * Better Implementation Use regexp to escape non-ASCII. Less kludgey, less reliant on the vagaries of the PHP maintainers. * Additional Tests Test non-ASCII outside of cell contents: sheet title, image alt attribute. * Apply Same Change in Second Location Forgot to change loadFromString. * Additional Test Confirm escaped ampersand is handled correctly.
1 parent db57af0 commit 5de8298

File tree

4 files changed

+86
-5
lines changed

4 files changed

+86
-5
lines changed

src/PhpSpreadsheet/Reader/Html.php

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ private static function containsTags(string $data): bool
201201
/**
202202
* Loads Spreadsheet from file.
203203
*/
204-
protected function loadSpreadsheetFromFile(string $filename): Spreadsheet
204+
public function loadSpreadsheetFromFile(string $filename): Spreadsheet
205205
{
206206
// Create new Spreadsheet
207207
$spreadsheet = new Spreadsheet();
@@ -651,7 +651,13 @@ public function loadIntoExisting($filename, Spreadsheet $spreadsheet)
651651
// Reload the HTML file into the DOM object
652652
try {
653653
$convert = $this->securityScanner->scanFile($filename);
654-
$loaded = $dom->loadHTML($convert);
654+
$lowend = "\u{80}";
655+
$highend = "\u{10ffff}";
656+
$regexp = "/[$lowend-$highend]/u";
657+
/** @var callable */
658+
$callback = [self::class, 'replaceNonAscii'];
659+
$convert = preg_replace_callback($regexp, $callback, $convert);
660+
$loaded = ($convert === null) ? false : $dom->loadHTML($convert);
655661
} catch (Throwable $e) {
656662
$loaded = false;
657663
}
@@ -662,6 +668,11 @@ public function loadIntoExisting($filename, Spreadsheet $spreadsheet)
662668
return $this->loadDocument($dom, $spreadsheet);
663669
}
664670

671+
private static function replaceNonAscii(array $matches): string
672+
{
673+
return '&#' . mb_ord($matches[0], 'UTF-8') . ';';
674+
}
675+
665676
/**
666677
* Spreadsheet from content.
667678
*
@@ -674,7 +685,13 @@ public function loadFromString($content, ?Spreadsheet $spreadsheet = null): Spre
674685
// Reload the HTML file into the DOM object
675686
try {
676687
$convert = $this->securityScanner->scan($content);
677-
$loaded = $dom->loadHTML($convert);
688+
$lowend = "\u{80}";
689+
$highend = "\u{10ffff}";
690+
$regexp = "/[$lowend-$highend]/u";
691+
/** @var callable */
692+
$callback = [self::class, 'replaceNonAscii'];
693+
$convert = preg_replace_callback($regexp, $callback, $convert);
694+
$loaded = ($convert === null) ? false : $dom->loadHTML($convert);
678695
} catch (Throwable $e) {
679696
$loaded = false;
680697
}

tests/PhpSpreadsheetTests/Reader/Html/HtmlImageTest.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ public function testCanInsertImage(): void
1313

1414
$html = '<table>
1515
<tr>
16-
<td><img src="' . $imagePath . '" alt="test image"></td>
16+
<td><img src="' . $imagePath . '" alt="test image voilà"></td>
1717
</tr>
1818
</table>';
1919
$filename = HtmlHelper::createHtml($html);
@@ -24,7 +24,7 @@ public function testCanInsertImage(): void
2424
$drawing = $firstSheet->getDrawingCollection()[0];
2525
self::assertEquals($imagePath, $drawing->getPath());
2626
self::assertEquals('A1', $drawing->getCoordinates());
27-
self::assertEquals('test image', $drawing->getName());
27+
self::assertEquals('test image voilà', $drawing->getName());
2828
self::assertEquals('100', $drawing->getWidth());
2929
self::assertEquals('100', $drawing->getHeight());
3030
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
<?php
2+
3+
namespace PhpOffice\PhpSpreadsheetTests\Reader\Html;
4+
5+
use PhpOffice\PhpSpreadsheet\Reader\Html;
6+
use PHPUnit\Framework\TestCase;
7+
8+
class Issue2942Test extends TestCase
9+
{
10+
public function testLoadFromString(): void
11+
{
12+
$content = '<table><tbody><tr><td>éàâèî</td></tr></tbody></table>';
13+
$reader = new Html();
14+
$spreadsheet = $reader->loadFromString($content);
15+
$sheet = $spreadsheet->getActiveSheet();
16+
self::assertSame('éàâèî', $sheet->getCell('A1')->getValue());
17+
}
18+
19+
public function testLoadFromFile(): void
20+
{
21+
$file = 'tests/data/Reader/HTML/utf8chars.html';
22+
$reader = new Html();
23+
$spreadsheet = $reader->loadSpreadsheetFromFile($file);
24+
$sheet = $spreadsheet->getActiveSheet();
25+
self::assertSame('Test Utf-8 characters voilà', $sheet->getTitle());
26+
self::assertSame('éàâèî', $sheet->getCell('A1')->getValue());
27+
self::assertSame('αβγδε', $sheet->getCell('B1')->getValue());
28+
self::assertSame('𐐁𐐂𐐃 & だけち', $sheet->getCell('A2')->getValue());
29+
self::assertSame('אבגדה', $sheet->getCell('B2')->getValue());
30+
self::assertSame('𪔀𪔁𪔂', $sheet->getCell('C2')->getValue());
31+
self::assertSame('᠐᠑᠒', $sheet->getCell('A3')->getValue());
32+
self::assertSame('അആ', $sheet->getCell('B3')->getValue());
33+
self::assertSame('กขฃ', $sheet->getCell('C3')->getValue());
34+
self::assertSame('✀✐✠', $sheet->getCell('D3')->getValue());
35+
}
36+
}

tests/data/Reader/HTML/utf8chars.html

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
<!DOCTYPE html>
2+
<html>
3+
<head>
4+
<!-- deliberately do not identify charset for this test -->
5+
<title>Test Utf-8 characters voilà</title>
6+
</head>
7+
<body>
8+
<table>
9+
<tbody>
10+
<tr>
11+
<td>éàâèî</td><!-- Latin1 -->
12+
<td>αβγδε</td><!-- Greek -->
13+
</tr>
14+
<tr>
15+
<td>𐐁𐐂𐐃 &amp; だけち</td><!-- Osmanya (not in BMP) and Hiragana -->
16+
<td>אבגדה</td><!-- Hebrew -->
17+
<td>𪔀𪔁𪔂</td><!-- CJK Unified Ideographs Extension B (not in BMP) -->
18+
</tr>
19+
<tr>
20+
<td>᠐᠑᠒</td><!-- Mongolian -->
21+
<td>അആ</td><!-- Malayalam -->
22+
<td>กขฃ</td><!-- Thai -->
23+
<td>✀✐✠</td><!-- Dingbats -->
24+
</tr>
25+
</tbody>
26+
</table>
27+
</body>
28+
</html>

0 commit comments

Comments
 (0)