Html Reader Not Handling non-ASCII Data Correctly (#2943)

oleibman · web-flow · commit 5de82981d858 · 2022-07-16T22:08:44.000-07:00
* Html Reader Not Handling non-ASCII Data Correctly
  Fix #2942. Code was changed by #2894 because PHP 8.2 will deprecate how it was being done. See linked issue for more details. The DOM loadHTML method assumes ISO-8859-1 in the absence of a charset attribute or equivalent, and there is no way to override that assumption. Sigh. The suggested replacements are unsuitable in one way or another. I think this will work with minimal disruption (replace ampersand, less-than, and greater-than with entities representing illegal characters, then use htmlentities, then restore ampersand, less-than, and greater-than).
* Better Implementation
  Use a regexp to escape non-ASCII characters as numeric entities. Less kludgey, less reliant on the vagaries of the PHP maintainers.
* Additional Tests
  Test non-ASCII outside of cell contents: sheet title, image alt attribute.
* Apply Same Change in Second Location
  Forgot to change loadFromString.
* Additional Test
  Confirm that an escaped ampersand is handled correctly.
diff --git a/src/PhpSpreadsheet/Reader/Html.php b/src/PhpSpreadsheet/Reader/Html.php
@@ -201,7 +201,7 @@ private static function containsTags(string $data): bool
     /**
      * Loads Spreadsheet from file.
      */
-    protected function loadSpreadsheetFromFile(string $filename): Spreadsheet
+    public function loadSpreadsheetFromFile(string $filename): Spreadsheet
     {
         // Create new Spreadsheet
         $spreadsheet = new Spreadsheet();
@@ -651,7 +651,13 @@ public function loadIntoExisting($filename, Spreadsheet $spreadsheet)
         // Reload the HTML file into the DOM object
         try {
             $convert = $this->securityScanner->scanFile($filename);
-            $loaded = $dom->loadHTML($convert);
+            $lowend = "\u{80}";
+            $highend = "\u{10ffff}";
+            $regexp = "/[$lowend-$highend]/u";
+            /** @var callable */
+            $callback = [self::class, 'replaceNonAscii'];
+            $convert = preg_replace_callback($regexp, $callback, $convert);
+            $loaded = ($convert === null) ? false : $dom->loadHTML($convert);
         } catch (Throwable $e) {
             $loaded = false;
         }
@@ -662,6 +668,11 @@ public function loadIntoExisting($filename, Spreadsheet $spreadsheet)
         return $this->loadDocument($dom, $spreadsheet);
     }
 
+    private static function replaceNonAscii(array $matches): string
+    {
+        return '&#' . mb_ord($matches[0], 'UTF-8') . ';';
+    }
+
     /**
      * Spreadsheet from content.
      *
@@ -674,7 +685,13 @@ public function loadFromString($content, ?Spreadsheet $spreadsheet = null): Spre
         //    Reload the HTML file into the DOM object
         try {
             $convert = $this->securityScanner->scan($content);
-            $loaded = $dom->loadHTML($convert);
+            $lowend = "\u{80}";
+            $highend = "\u{10ffff}";
+            $regexp = "/[$lowend-$highend]/u";
+            /** @var callable */
+            $callback = [self::class, 'replaceNonAscii'];
+            $convert = preg_replace_callback($regexp, $callback, $convert);
+            $loaded = ($convert === null) ? false : $dom->loadHTML($convert);
         } catch (Throwable $e) {
             $loaded = false;
         }
diff --git a/tests/PhpSpreadsheetTests/Reader/Html/HtmlImageTest.php b/tests/PhpSpreadsheetTests/Reader/Html/HtmlImageTest.php
@@ -13,7 +13,7 @@ public function testCanInsertImage(): void
 
         $html = '<table>
                     <tr>
-                        <td><img src="' . $imagePath . '" alt="test image"></td>
+                        <td><img src="' . $imagePath . '" alt="test image voilà"></td>
                     </tr>
                 </table>';
         $filename = HtmlHelper::createHtml($html);
@@ -24,7 +24,7 @@ public function testCanInsertImage(): void
         $drawing = $firstSheet->getDrawingCollection()[0];
         self::assertEquals($imagePath, $drawing->getPath());
         self::assertEquals('A1', $drawing->getCoordinates());
-        self::assertEquals('test image', $drawing->getName());
+        self::assertEquals('test image voilà', $drawing->getName());
         self::assertEquals('100', $drawing->getWidth());
         self::assertEquals('100', $drawing->getHeight());
     }
diff --git a/tests/PhpSpreadsheetTests/Reader/Html/Issue2942Test.php b/tests/PhpSpreadsheetTests/Reader/Html/Issue2942Test.php
@@ -0,0 +1,36 @@
+<?php
+
+namespace PhpOffice\PhpSpreadsheetTests\Reader\Html;
+
+use PhpOffice\PhpSpreadsheet\Reader\Html;
+use PHPUnit\Framework\TestCase;
+
+class Issue2942Test extends TestCase
+{
+    public function testLoadFromString(): void
+    {
+        $content = '<table><tbody><tr><td>éàâèî</td></tr></tbody></table>';
+        $reader = new Html();
+        $spreadsheet = $reader->loadFromString($content);
+        $sheet = $spreadsheet->getActiveSheet();
+        self::assertSame('éàâèî', $sheet->getCell('A1')->getValue());
+    }
+
+    public function testLoadFromFile(): void
+    {
+        $file = 'tests/data/Reader/HTML/utf8chars.html';
+        $reader = new Html();
+        $spreadsheet = $reader->loadSpreadsheetFromFile($file);
+        $sheet = $spreadsheet->getActiveSheet();
+        self::assertSame('Test Utf-8 characters voilà', $sheet->getTitle());
+        self::assertSame('éàâèî', $sheet->getCell('A1')->getValue());
+        self::assertSame('αβγδε', $sheet->getCell('B1')->getValue());
+        self::assertSame('𐐁𐐂𐐃 & だけち', $sheet->getCell('A2')->getValue());
+        self::assertSame('אבגדה', $sheet->getCell('B2')->getValue());
+        self::assertSame('𪔀𪔁𪔂', $sheet->getCell('C2')->getValue());
+        self::assertSame('᠐᠑᠒', $sheet->getCell('A3')->getValue());
+        self::assertSame('അആ', $sheet->getCell('B3')->getValue());
+        self::assertSame('กขฃ', $sheet->getCell('C3')->getValue());
+        self::assertSame('✀✐✠', $sheet->getCell('D3')->getValue());
+    }
+}
diff --git a/tests/data/Reader/HTML/utf8chars.html b/tests/data/Reader/HTML/utf8chars.html
@@ -0,0 +1,28 @@
+<!DOCTYPE html>
+<html>
+<head>
+<!-- deliberately do not identify charset for this test -->
+<title>Test Utf-8 characters voilà</title>
+</head>
+<body>
+<table>
+    <tbody>
+    <tr>
+        <td>éàâèî</td><!-- Latin1 -->
+        <td>αβγδε</td><!-- Greek -->
+    </tr>
+    <tr>
+        <td>𐐁𐐂𐐃 &amp; だけち</td><!-- Osmanya (not in BMP) and Hiragana -->
+        <td>אבגדה</td><!-- Hebrew -->
+        <td>𪔀𪔁𪔂</td><!-- CJK Unified Ideographs Extension B (not in BMP) -->
+    </tr>
+    <tr>
+        <td>᠐᠑᠒</td><!-- Mongolian -->
+        <td>അആ</td><!-- Malayalam -->
+        <td>กขฃ</td><!-- Thai -->
+        <td>✀✐✠</td><!-- Dingbats -->
+    </tr>
+    </tbody>
+</table>
+</body>
+</html>