File tree Expand file tree Collapse file tree 2 files changed +48
-1
lines changed
Expand file tree Collapse file tree 2 files changed +48
-1
lines changed Original file line number Diff line number Diff line change @@ -605,7 +605,13 @@ def _build_doc(self):
605605 else :
606606 udoc = bdoc
607607 from_encoding = self .encoding
608- return BeautifulSoup (udoc , features = "html5lib" , from_encoding = from_encoding )
608+
609+ soup = BeautifulSoup (udoc , features = "html5lib" , from_encoding = from_encoding )
610+
611+ for br in soup .find_all ("br" ):
612+ br .replace_with ("\n " + br .text )
613+
614+ return soup
609615
610616
611617def _build_xpath_expr (attrs ) -> str :
@@ -746,6 +752,10 @@ def _build_doc(self):
746752 else :
747753 if not hasattr (r , "text_content" ):
748754 raise XMLSyntaxError ("no text parsed from document" , 0 , 0 , 0 )
755+
756+ for br in r .xpath ("*//br" ):
757+ br .tail = "\n " + (br .tail or '' )
758+
749759 return r
750760
751761 def _parse_thead_tr (self , table ):
Original file line number Diff line number Diff line change @@ -1273,3 +1273,40 @@ def test_parse_path_object(self, datapath):
12731273 df1 = self .read_html (file_path_string )[0 ]
12741274 df2 = self .read_html (file_path )[0 ]
12751275 tm .assert_frame_equal (df1 , df2 )
1276+
def test_parse_br_as_space(self):
    """A ``<br>`` between two words in a cell is read back as one space."""
    # GH 29528: pd.read_html() converts <br> to a space
    result = self.read_html(
        """
        <table>
            <tr>
                <th>A</th>
            </tr>
            <tr>
                <td>word1<br>word2</td>
            </tr>
        </table>
        """
    )[0]

    # The cell text must match the fixture above word for word; only the
    # <br> is expected to change (into a single space).
    expected = DataFrame(data=[["word1 word2"]], columns=["A"])

    tm.assert_frame_equal(result, expected)
1293+
def test_parse_br_tail_retained(self):
    """Text following a ``<br>`` stays in the cell after the replacement."""
    # Ensure the text after a <br> is retained when the <br> is replaced
    # with a space.
    # See:
    # https://stackoverflow.com/q/33281217 and
    # https://stackoverflow.com/questions/12545897/convert-br-to-end-line/48628074#comment84810813_34640357
    result = self.read_html(
        """
        <table>
            <tr>
                <th>A</th>
            </tr>
            <tr>
                <td>word1<br>word2</td>
            </tr>
        </table>
        """
    )[0]

    # "word2" is the tail text of the <br>; it must survive, and the leading
    # word must match the fixture exactly.
    expected = DataFrame(data=[["word1 word2"]], columns=["A"])

    tm.assert_frame_equal(result, expected)
You can’t perform that action at this time.
0 commit comments