Skip to content

Commit ce9f3ed

Browse files
committed
BUG: Fix pd.read_html handling of rowspan in table header
1 parent 6526829 commit ce9f3ed

File tree

2 files changed

+61
-19
lines changed

2 files changed

+61
-19
lines changed

pandas/io/html.py

Lines changed: 34 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -454,14 +454,25 @@ def row_is_all_th(row):
454454
while body_rows and row_is_all_th(body_rows[0]):
455455
header_rows.append(body_rows.pop(0))
456456

457-
header = self._expand_colspan_rowspan(header_rows, section="header")
458-
body = self._expand_colspan_rowspan(body_rows, section="body")
459-
footer = self._expand_colspan_rowspan(footer_rows, section="footer")
457+
header, rem = self._expand_colspan_rowspan(header_rows, section="header")
458+
body, rem = self._expand_colspan_rowspan(
459+
body_rows,
460+
section="body",
461+
remainder=rem,
462+
overflow=True if len(footer_rows) > 0 else False,
463+
)
464+
footer, _ = self._expand_colspan_rowspan(
465+
footer_rows, section="footer", remainder=rem, overflow=False
466+
)
460467

461468
return header, body, footer
462469

463470
def _expand_colspan_rowspan(
464-
self, rows, section: Literal["header", "footer", "body"]
471+
self,
472+
rows,
473+
section: Literal["header", "footer", "body"],
474+
remainder: list[int, tuple[str | tuple, int]] | None = None,
475+
overflow: bool = True,
465476
) -> list[list]:
466477
"""
467478
Given a list of <tr>s, return a list of text rows.
@@ -471,6 +482,11 @@ def _expand_colspan_rowspan(
471482
rows : list of node-like
472483
List of <tr>s
473484
section : the section that the rows belong to (header, body or footer).
485+
remainder : list[tuple[int, str | tuple, int]] | None
486+
Any remainder, as (index, text, nrows) entries, from the expansion of the previous section.
487+
overflow: bool
488+
If True, return any unfinished rowspans as 'remainder' so that the next
489+
section can continue them. If False, append the rows they imply to this section's output. True by default.
474490
475491
Returns
476492
-------
@@ -485,9 +501,7 @@ def _expand_colspan_rowspan(
485501
"""
486502
all_texts = [] # list of rows, each a list of str
487503
text: str | tuple
488-
remainder: list[
489-
tuple[int, str | tuple, int]
490-
] = [] # list of (index, text, nrows)
504+
remainder = remainder if remainder is not None else []
491505

492506
for tr in rows:
493507
texts = [] # the output for this row
@@ -528,19 +542,20 @@ def _expand_colspan_rowspan(
528542
all_texts.append(texts)
529543
remainder = next_remainder
530544

531-
# Append rows that only appear because the previous row had non-1
532-
# rowspan
533-
while remainder:
534-
next_remainder = []
535-
texts = []
536-
for prev_i, prev_text, prev_rowspan in remainder:
537-
texts.append(prev_text)
538-
if prev_rowspan > 1:
539-
next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
540-
all_texts.append(texts)
541-
remainder = next_remainder
545+
if not overflow:
546+
# Append rows that only appear because the previous row had non-1
547+
# rowspan
548+
while remainder:
549+
next_remainder = []
550+
texts = []
551+
for prev_i, prev_text, prev_rowspan in remainder:
552+
texts.append(prev_text)
553+
if prev_rowspan > 1:
554+
next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
555+
all_texts.append(texts)
556+
remainder = next_remainder
542557

543-
return all_texts
558+
return all_texts, remainder
544559

545560
def _handle_hidden_tables(self, tbl_list, attr_name: str):
546561
"""

pandas/tests/io/test_html.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1004,6 +1004,33 @@ def test_rowspan_only_rows(self, flavor_read_html):
10041004

10051005
tm.assert_frame_equal(result, expected)
10061006

1007+
def test_rowspan_in_header_overflowing_to_body(self, flavor_read_html):
1008+
# GH60210
1009+
1010+
result = flavor_read_html(
1011+
StringIO(
1012+
"""
1013+
<table>
1014+
<tr>
1015+
<th rowspan="2">A</th>
1016+
<th>B</th>
1017+
</tr>
1018+
<tr>
1019+
<td>1</td>
1020+
</tr>
1021+
<tr>
1022+
<td>C</td>
1023+
<td>2</td>
1024+
</tr>
1025+
</table>
1026+
"""
1027+
)
1028+
)[0]
1029+
1030+
expected = DataFrame(data=[["A", 1], ["C", 2]], columns=["A", "B"])
1031+
1032+
tm.assert_frame_equal(result, expected)
1033+
10071034
def test_header_inferred_from_rows_with_only_th(self, flavor_read_html):
10081035
# GH17054
10091036
result = flavor_read_html(

0 commit comments

Comments
 (0)