Skip to content

Commit d1b3ac2

Browse files
committed
Handle the case where the passed-in str is itself a table
1 parent 0002e0b commit d1b3ac2

File tree

2 files changed

+21
-15
lines changed

2 files changed

+21
-15
lines changed

html_table_extractor/extractor.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,22 @@
66
import csv
77
import pdb
88

9-
109
class Extractor(object):
11-
def __init__(self, table, id_=None, **kwargs):
12-
# input is Tag
13-
if isinstance(table, Tag):
14-
self._table = table.find(id=id_)
15-
# input is str
16-
# for py2, make sure you pass in str
17-
# for py3, everything is str by default
18-
elif isinstance(table, str):
19-
self._table = BeautifulSoup(table, 'html.parser').find(id=id_)
10+
def __init__(self, input, id_=None, **kwargs):
11+
# TODO: should divide this class into two subclasses
12+
# to deal with string and bs4.Tag separately
13+
14+
# validate the input
15+
if not isinstance(input, str) and not isinstance(input, Tag):
16+
raise Exception('Unrecognized type. Valid input: str, bs4.element.Tag')
17+
18+
soup = BeautifulSoup(input, 'html.parser').find() if isinstance(input, str) else input
19+
20+
# locate the target table
21+
if soup.name == 'table':
22+
self._table = soup
2023
else:
21-
raise Exception('unrecognized type')
24+
self._table = soup.find(id=id_)
2225

2326
if 'transformer' in kwargs:
2427
self._transformer = kwargs['transformer']

tests/tests.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,8 @@ def test_config_transformer(self):
6363
class TestPassId(unittest.TestCase):
6464
def test_init_with_id(self):
6565
html = """
66-
<table id='wanted'>
66+
<body>
67+
<table id='wanted'>
6768
<tr>
6869
<td>1</td>
6970
<td>2</td>
@@ -72,12 +73,13 @@ def test_init_with_id(self):
7273
<td>3</td>
7374
<td>4</td>
7475
</tr>
75-
</table>
76-
<table id='unwanted'>
76+
</table>
77+
<table id='unwanted'>
7778
<tr>
7879
<td>unwanted</td>
7980
</tr>
80-
</table>
81+
</table>
82+
</body>
8183
"""
8284
soup = BeautifulSoup(html, 'html.parser')
8385
extractor = Extractor(soup, id_='wanted').parse()
@@ -86,6 +88,7 @@ def test_init_with_id(self):
8688
[[u'1', u'2'], [u'3', u'4']]
8789
)
8890

91+
8992
class TestComplexExtractor(unittest.TestCase):
9093
def setUp(self):
9194
html = """

0 commit comments

Comments
 (0)