|
6 | 6 | import csv |
7 | 7 | import pdb |
8 | 8 |
|
9 | | - |
10 | 9 | class Extractor(object): |
11 | | - def __init__(self, table, id_=None, **kwargs): |
12 | | - # input is Tag |
13 | | - if isinstance(table, Tag): |
14 | | - self._table = table.find(id=id_) |
15 | | - # input is str |
16 | | - # for py2, make sure you pass in str |
17 | | - # for py3, everything is str by default |
18 | | - elif isinstance(table, str): |
19 | | - self._table = BeautifulSoup(table, 'html.parser').find(id=id_) |
| 10 | + def __init__(self, input, id_=None, **kwargs): |
| 11 | + # TODO: should divide this class into two subclasses |
| 12 | + # to deal with string and bs4.Tag separately |
| 13 | + |
| 14 | + # validate the input |
| 15 | + if not isinstance(input, str) and not isinstance(input, Tag): |
| 16 | + raise Exception('Unrecognized type. Valid input: str, bs4.element.Tag') |
| 17 | + |
| 18 | + soup = BeautifulSoup(input, 'html.parser').find() if isinstance(input, str) else input |
| 19 | + |
| 20 | + # locate the target table |
| 21 | + if soup.name == 'table': |
| 22 | + self._table = soup |
20 | 23 | else: |
21 | | - raise Exception('unrecognized type') |
| 24 | + self._table = soup.find(id=id_) |
22 | 25 |
|
23 | 26 | if 'transformer' in kwargs: |
24 | 27 | self._transformer = kwargs['transformer'] |
|
0 commit comments