Add support for raw_html extraction in html parser (#341)

akariv · web-flow · commit 3b06de8be20b · 2020-11-09T13:47:28.000+03:00
* Add support for raw_html extraction in html parser

* Adhere better to the tabulator standard
diff --git a/README.md b/README.md
@@ -668,13 +668,15 @@ Supports simple tables (no merged cells) with any legal combination of the td, t
 Usually `foramt='html'` would need to be specified explicitly as web URLs don't always use the `.html` extension.
 
 ```python
-stream = Stream('http://example.com/some/page.aspx', format='html' selector='.content .data table#id1')
+stream = Stream('http://example.com/some/page.aspx', format='html', selector='.content .data table#id1', raw_html=True)
 ```
 
 **Options**
 
 - **selector**: CSS selector for specifying which `table` element to extract. By default it's `table`, which takes the first `table` element in the document. If empty, will assume the entire page is the table to be extracted (useful with some Excel formats).
 
+- **raw_html**: `False` (default) extracts the text content of each cell; `True` returns each cell's inner HTML without modification.
+
 ### Custom file sources and formats
 
 Tabulator is written with extensibility in mind, allowing you to add support for
diff --git a/data/table3.html b/data/table3.html
@@ -25,7 +25,7 @@
         </tr>
         <tr>
             <td>1</td>
-            <td>english</td>
+            <td><b>english</b></td>
         </tr>
         <tr>
             <td>2</td>
diff --git a/tabulator/parsers/html.py b/tabulator/parsers/html.py
@@ -19,15 +19,17 @@ class HTMLTableParser(Parser):
 
     options = [
         'selector',
+        'raw_html'
     ]
 
-    def __init__(self, loader, force_parse=False, selector='table'):
+    def __init__(self, loader, force_parse=False, selector='table', raw_html=False):
         self.__loader = loader
         self.__selector = selector
         self.__force_parse = force_parse
         self.__extended_rows = None
         self.__encoding = None
         self.__chars = None
+        self.__extractor = (lambda x: x.html()) if raw_html else (lambda x: x.text())
 
     @property
     def closed(self):
@@ -78,14 +80,11 @@ def __iter_extended_rows(self):
             table.children('tbody').children('tr')
         )
         rows = [pq(r) for r in rows if len(r) > 0]
-        first_row = rows.pop(0)
-        headers = [pq(th).text() for th in first_row.find('th,td')]
-
         # Extract rows
-        rows = [pq(tr).find('td') for tr in rows]
-        rows = [[pq(td).text() for td in tr]
+        rows = [pq(tr).children('td,th') for tr in rows]
+        rows = [[self.__extractor(pq(td)) for td in tr]
                 for tr in rows if len(tr) > 0]
 
         # Yield rows
         for row_number, row in enumerate(rows, start=1):
-            yield (row_number, headers, row)
+            yield (row_number, None, row)
diff --git a/tests/formats/test_html.py b/tests/formats/test_html.py
@@ -26,3 +26,11 @@ def test_stream_html(source, selector):
             {'id': '1', 'name': 'english'},
             {'id': '2', 'name': '中国人'}]
 
+def test_stream_html_raw_html():
+    with Stream('data/table3.html', selector='.mememe', headers=1, encoding='utf8', raw_html=True) as stream:
+        assert stream.headers == ['id', 'name']
+        assert stream.read(keyed=True) == [
+            {'id': '1', 'name': '<b>english</b>'},
+            {'id': '2', 'name': '中国人'}]
+
+