Skip to content

Commit 547bb38

Browse files
fix: encoding/decoding error with default utf-8 encoding for html, xml, and auto (#660)
Add functionality to try other common encodings for html and xml files if an error related to the encoding is raised and the user has not specified an encoding. Change auto.py to have a `None` default for encoding. Remove the unused parameter `encoding` from `partition_pdf`. Add functionality to the `read_txt_file` utility function to handle file-like objects from a URL.
1 parent 7d157c1 commit 547bb38

File tree

17 files changed

+372
-66
lines changed

17 files changed

+372
-66
lines changed

CHANGELOG.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,16 @@
1-
## 0.7.2-dev3
1+
## 0.7.2-dev4
22

33
### Enhancements
44

55
### Features
66

77
### Fixes
88

9+
* Update the `read_txt_file` utility function to keep using `spooled_to_bytes_io_if_needed` for xml
10+
* Add functionality to the `read_txt_file` utility function to handle file-like objects from a URL
11+
* Remove the unused parameter `encoding` from `partition_pdf`
12+
* Change auto.py to have a `None` default for encoding
13+
* Add functionality to try other common encodings for html and xml files if an error related to the encoding is raised and the user has not specified an encoding.
914
* Adds benchmark test with test docs in example-docs
1015
* Re-enable test_upload_label_studio_data_with_sdk
1116
* File detection now detects code files as plain text
4.69 MB
Binary file not shown.
23.8 KB
Binary file not shown.
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
2+
<head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>
3+
<meta name='viewport' content='width=device-width, initial-scale=1'> <title > SNB22-3 - SteelJIS - Datasheet, Chemical composition, Standards and Properties </title><meta NAME='Description' CONTENT='SNB22-3 datasheet, chemical composition, standards, properties'><meta name='keywords' content='SNB22-3, SNB22-3 datasheet, SNB22-3 chemical composition, SNB22-3 standards, SNB22-3 properties'><script async src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script>
4+
<script>
5+
(adsbygoogle = window.adsbygoogle || []).push({
6+
google_ad_client: "ca-pub-4513545675701847",
7+
enable_page_level_ads: true
8+
});
9+
</script>
10+
<META content='SteelJIS(.com)' name=author>
11+
<META name='copyright' content='SteelJIS(.com) Copyright © 2018-2022' >
12+
</head> <style>
13+
body {font-family: Arial, Helvetica, sans-serif; font-size: 14px;
14+
padding: 0; margin:0 auto; }
15+
#tbmenu td{font-size: 14px; font-weight: bold;}
16+
table {border-color: #ccc;}
17+
table a {text-decoration: none;}
18+
a.apodcherk {text-decoration: underline;}
19+
h1 {font-size: 20px; letter-spacing:4pt; font-weight: 700; margin =2px;}
20+
h2 {font-size: 15px; margin = 4px;}
21+
h3 {font-size: 14px; margin = 4px; letter-spacing:1pt;}
22+
td {font-size: 14px;}
23+
li { font-size: 14px;}
24+
.help1 {font-size: 12px;}
25+
.text1 {font-size: 14px; letter-spacing:1pt;}
26+
.btn1 { background: SteelBlue; color: white; font-size: 14px; height: 30px;}
27+
</style><body bgColor='#EFF5EB'> <table align='center' width = '100%' cellspacing='0' cellpadding='1' border='0' bordercolor='#ccc' bgcolor='#5F9EA0'> <tr>
28+
<td align='center' ><font color='#fff'><h1>SteelJIS: &nbsp; Japanese Steels and Alloys</h1> </font></td>
29+
30+
</tr></table><table width= '100%' id='tbmenu' cellspacing='0' cellpadding='7' border='1' bordercolor='#ccc' bgcolor='#e4eeec'><tr align='center'><td> &nbsp; &nbsp; <a href='http://steeljis.com/index.php'> Home </a> &nbsp; &nbsp;
31+
32+
33+
&nbsp; &nbsp; <a href='http://steeljis.com/jis_steel_designation.php' > Japanese steel grading </a> &nbsp; &nbsp;
34+
35+
&nbsp; &nbsp; <a href='http://steeljis.com/jis_steel_standard.php' > Steel standards JIS G </a> &nbsp; &nbsp;
36+
37+
&nbsp; &nbsp; <a href='http://steeljis.com/jis_steel_search.php'> Search </a> &nbsp; &nbsp;
38+
39+
40+
&nbsp; &nbsp; <a href='http://steeljis.com/about.php'> About us </a> &nbsp; &nbsp;
41+
42+
&nbsp; &nbsp; <a href='http://steeljis.com/jp/index.php' target='_blank'> 日本語 </a> &nbsp; &nbsp;
43+
44+
&nbsp; &nbsp; <a href='http://steeljis.com/korea/index.php' target='_blank'><font color='red'> NEW!</font> Korean steels </a> &nbsp; &nbsp;
45+
46+
</td></tr></table><br><table width ='100%' border='0' cellspacing='0' cellpadding='4' ><tr><td> &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; <a href='http://steeljis.com/jis_steel_designation.php'>Japanese steel grading
47+
</a> &nbsp; -> &nbsp; <a href='http://steeljis.com/jis_steel_designation_grading.php?gr_id=94'>
48+
SNB&nbsp;Grades </a> &nbsp; -> &nbsp; SNB22-3 </td></tr></table><center><h2><font color='#C71585' ><b> SNB22-3 &nbsp;<br>Chemical composition, standards and properties </b></h2></center></font><CENTER><table width ='728' border='1' cellspacing='0' cellpadding='6' bgcolor='#f0f0f0' ></CENTER>
49+
<tr bgColor='#d0e8ff'><td width ='20%'> &nbsp; &nbsp; <b>Grade : </b></td><td > &nbsp; &nbsp; SNB22-3</td></tr><tr ><td> &nbsp; &nbsp; <b>Classification: </b></td><td > &nbsp; &nbsp; Structural steel &nbsp; Chromium molybdenum steel</td></tr><tr><td> &nbsp; &nbsp; <b>Standards: </b></td><td> <table width = '100%' border ='0' bgcolor='#f0f0f0' cellspacing='0' align='center' cellpadding='0' ><tr><td> &nbsp; &nbsp; JIS G 4108 : &nbsp; Alloy steel bars for special application bolting materials &nbsp; </td></tr></table> </td></tr><tr><td> &nbsp; &nbsp; <b>Applications: </b></td><td> &nbsp; &nbsp; Rolled or forget alloy steel bars to be used for the manufacture of the bolts, stud bolts, washers, nuts and other for nuclear reactors and other special uses. </td></tr></table><script async src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script>
50+
<!-- e_gorme_gog -->
51+
<ins class="adsbygoogle"
52+
style="display:block"
53+
data-ad-client="ca-pub-4513545675701847"
54+
data-ad-slot="3458323770"
55+
data-ad-format="auto"
56+
data-full-width-responsive="true"></ins>
57+
<script>
58+
(adsbygoogle = window.adsbygoogle || []).push({});
59+
</script>
60+
<CENTER><br><br><font class='text1' ><b>Chemical composition 成分 % of grade &nbsp; SNB22-3</b></font></CENTER><CENTER><table width = '730' border = 1 cellspacing='0' cellpadding='4' ></CENTER><tr align='center' bgcolor='#C6E2FF'><td><b> C </b></td><td><b> Si </b></td><td><b>Mn </b></td><td><b> P</b></td><td><b> S</b></td><td><b> Cr</b></td><td><b> Mo</b></td></tr><tr align='center' bgcolor=#f0f0f0><td bgcolor='#f0f0f0'><b>0.39 - 0.46</b></td><td bgcolor='#f0f0f0'><b>0.2 - 0.35</b></td><td bgcolor='#f0f0f0'><b>0.65 - 1.1</b></td><td bgcolor='#f0f0f0'><b><font size='-1'>max</font> &nbsp; 0.025</b></td><td bgcolor='#f0f0f0'><b><font size='-1'>max</font> &nbsp; 0.025</b></td><td bgcolor='#f0f0f0'><b>0.75 - 1.2</b></td><td bgcolor='#f0f0f0'><b>0.15 - 0.25</b></td></tr></table><CENTER><br><br><font class='text1' ><b> Mechanical properties of grade SNB22-3 </b></font></CENTER><table width = '730' border ='1' cellspacing='0' align='center' cellpadding='4' bgcolor='1'><tr align='center' bgcolor='#C6E2FF' >
61+
<td>Assortment</td>
62+
<td width='15%'>Yield point or Proof stress</td>
63+
<td width='15%'>Tensile strength</td>
64+
<td width='15%'>Elongation</td>
65+
<td width='15%'>Reduction of&nbsp;area</td>
66+
<td width='15%'>Charpy impact strength</td>
67+
</tr><tr align='center' bgcolor='#C6E2FF'>
68+
<td>-</td>
69+
<td >N/mm <sup>2</sup> </td>
70+
<td >N/mm <sup>2</sup></td>
71+
<td >%</td>
72+
<td >%</td>
73+
<td >J/cm <sup>2</sup></td>
74+
</tr><tr align='center' bgcolor=#f0f0f0><td><font class='text2'>Bars &nbsp; &nbsp; &nbsp;</font> </td><td><b>890 </b></td><td><b>1000 </b></td><td><b>12 </b></td><td><b>40</b></td><td><b>&nbsp;</b></td></tr></table><CENTER><br><br><font class='text1' ><b>Hardness of grade SNB22-3 </b></font></CENTER><table width ='730' border = 1 cellspacing='0' align='center' cellpadding='4' bgcolor='#f0f0f0'>
75+
<tr align='center' bgcolor='#C6E2FF'><td colrows='2'>-</td><td width ='16%'>Brinell<br>HBW / HB</td><td width ='16%'>Rockwell C<br>HRC</td><td width ='18%'>Rockwell B<br> HRBS / HRB</td><td width ='16%'>Vickers<br>HV</td></tr><tr align='center'><td> SNB22-3 &nbsp; &nbsp; </td><td><b>293-375</b></td><td><b></b></td><td><b></b></td><td><b></b></td></tr></table><CENTER><br><br><font class='text1' ><b> Compare other grades with grade SNB22-3</b></font></CENTER><table width = '730' border ='1' cellspacing='0' align='center' cellpadding='6' bgcolor='#f0f0f0'>
76+
<tr><td align='center' ><b> SNB22-3<br> standards</b>
77+
</td><td> <table width = '100%' border ='0' bgcolor='#f0f0f0' cellspacing='0' align='center' cellpadding='0' ><tr><td> &nbsp; &nbsp; JIS G 4108 : &nbsp; Alloy steel bars for special application bolting materials &nbsp; <a href='http://steeljis.com/jis_steel_standard_grades.php?cl_id=103' > / &nbsp; Compare steels</a> </td></tr></table> </td></tr></table><script async src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script>
78+
<!-- e_kubm -->
79+
<ins class="adsbygoogle"
80+
style="display:block"
81+
data-ad-client="ca-pub-4513545675701847"
82+
data-ad-slot="6193078572"
83+
data-ad-format="auto"
84+
data-full-width-responsive="true"></ins>
85+
<script>
86+
(adsbygoogle = window.adsbygoogle || []).push({});
87+
</script>
88+
<br><br><table width ='100%' border='0' cellspacing='0' cellpadding='4' ><tr align='center'><td> &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; <a href='http://steeljis.com/jis_steel_designation.php'>Japanese steel grading
89+
</a> &nbsp; -> &nbsp; <a href='http://steeljis.com/jis_steel_designation_grading.php?gr_id=94'>
90+
SNB&nbsp;Grades </a> &nbsp; -> &nbsp; <a href='http://steeljis.com/jis_steel_datasheet.php?name_id=1073'>SNB22-3</a> </td></tr></table><CENTER><br><br><font class='text1' ><b>Is this information useful? &nbsp; Please share it.</b></font></CENTER><table border='0' cellpadding='0' cellspacing='10' align='center' > <tr align='center'><td ><a href="https://twitter.com/share?ref_src=twsrc%5Etfw" class="twitter-share-button" data-url="http://steeljis.com/index.php" data-show-count="false">Tweet</a><script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>
91+
</td><td>
92+
93+
94+
<script src="//platform.linkedin.com/in.js" type="text/javascript"> lang: en_US</script>
95+
<script type="IN/Share" data-url="http://steeljis.com"></script>
96+
97+
98+
</td><td>
99+
100+
101+
<div id="fb-root"></div>
102+
<script>(function(d, s, id) {
103+
var js, fjs = d.getElementsByTagName(s)[0];
104+
if (d.getElementById(id)) return;
105+
js = d.createElement(s); js.id = id;
106+
js.src = 'https://connect.facebook.net/en_US/sdk.js#xfbml=1&version=v3.2';
107+
fjs.parentNode.insertBefore(js, fjs);
108+
}(document, 'script', 'facebook-jssdk'));</script>
109+
110+
<div class="fb-share-button" data-href="http://steeljis.com/" data-layout="button" data-size="small" data-mobile-iframe="true"><a target="_blank" href="https://www.facebook.com/sharer/sharer.php?u=http%3A%2F%2Fwww.steeljis.com%2F&amp;src=sdkpreparse" class="fb-xfbml-parse-ignore">Share</a></div>
111+
112+
113+
</td></tr></table><CENTER><br><br><font class='text1' ><b>Mechanical properties <br> Mechanische Eigenschaften<br>Caracteristiques mecaniques</b></font></CENTER><table width = '610' align=center border = 0 cellspacing='1' cellpadding='1' >
114+
<tr><td> &bull; Minimum yield strength /
115+
Mindestwert der oberen Streckgrenze / Limite dТelasticite minimale </td></tr>
116+
<tr><td> &bull; Tensile strength / Zugfestigkeit / Resistance a la traction </td></tr>
117+
<tr><td> &bull; Minimum elongation /
118+
Mindestwert der Bruchdehnung / Allongement minimal </td></tr>
119+
</table></font><br><br><center><table width='80%' align='center' border='0'><tr><td align='center'>SNB22-3 datasheet, SNB22-3 mechanical properties, SNB22-3 technical specifications. Chemical composition of Japanese steel SNB22-3. Standards of SNB22-3. Tensile Strength of SNB22-3. Elongation of SNB22-3. Density of SNB22-3.
120+
Brinell, Rockwell, Vickers hardness of SNB22-3
121+
</td><tr><table><br><br><table width = '100%' cellspacing='0' cellpadding='2' border='0' bgcolor='#5F9EA0' ><tr> <td align='center'>
122+
<b>
123+
<a href='http://steeljis.com'>SteelJIS:</a></b> &nbsp; Japanese Steels and Alloys free searchable database &nbsp; &nbsp; &copy; &nbsp; 2018-2022 &nbsp; All rights reserved. &nbsp; &nbsp; &nbsp;<br> The contents from this site may not be reproduced. <br> The entire risk as to use of these content is assumed by you the user</CENTER>
124+
</td> </tr> </table></center></center></center>
125+
<!--LiveInternet counter--><script type="text/javascript">
126+
document.write("<a href='//www.liveinternet.ru/click' "+
127+
"target=_blank><img src='//counter.yadro.ru/hit?t44.12;r"+
128+
escape(document.referrer)+((typeof(screen)=="undefined")?"":
129+
";s"+screen.width+"*"+screen.height+"*"+(screen.colorDepth?
130+
screen.colorDepth:screen.pixelDepth))+";u"+escape(document.URL)+
131+
";h"+escape(document.title.substring(0,150))+";"+Math.random()+
132+
"' alt='' title='LiveInternet' "+
133+
"border='0' width='31' height='31'><\/a>")
134+
</script><!--/LiveInternet-->
135+
136+
137+
138+
139+
140+
<script type="text/javascript">
141+
function addLink() {
142+
var body_element = document.getElementsByTagName('body')[0];
143+
var selection;
144+
selection = window.getSelection();
145+
var pagelink = "<br /><br /> <a href='"+document.location.href+"'>"+document.location.href+"</a>"; // source
146+
var copytext = selection + pagelink;
147+
var newdiv = document.createElement('div');
148+
newdiv.style.position='absolute';
149+
newdiv.style.left='-99999px';
150+
body_element.appendChild(newdiv);
151+
newdiv.innerHTML = copytext;
152+
selection.selectAllChildren(newdiv);
153+
window.setTimeout(function() {
154+
body_element.removeChild(newdiv);
155+
},0);
156+
}
157+
document.oncopy = addLink;
158+
</script>

example-docs/factbook-utf-16.xml

1.31 KB
Binary file not shown.

test_unstructured/partition/test_auto.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -319,7 +319,6 @@ def test_auto_partition_pdf_with_fast_strategy():
319319
file=None,
320320
url=None,
321321
include_page_breaks=False,
322-
encoding="utf-8",
323322
infer_table_structure=False,
324323
strategy="fast",
325324
ocr_languages="eng",

test_unstructured/partition/test_html_partition.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,30 @@ def test_partition_html_from_filename():
2222
assert elements[0].metadata.file_directory == directory
2323

2424

25+
@pytest.mark.parametrize(
26+
("filename", "encoding", "error"),
27+
[
28+
("example-10k-utf-16.html", "utf-8", UnicodeDecodeError),
29+
("example-steelJIS-datasheet-utf-16.html", "utf-8", UnicodeDecodeError),
30+
],
31+
)
32+
def test_partition_html_from_filename_raises_encoding_error(filename, encoding, error):
33+
with pytest.raises(error):
34+
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
35+
with open(filename) as f:
36+
partition_html(file=f, encoding=encoding)
37+
38+
39+
@pytest.mark.parametrize(
40+
"filename",
41+
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"],
42+
)
43+
def test_partition_html_from_filename_default_encoding(filename):
44+
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
45+
elements = partition_html(filename=filename)
46+
assert len(elements) > 0
47+
48+
2549
def test_partition_html_from_filename_metadata_false():
2650
directory = os.path.join(DIRECTORY, "..", "..", "example-docs")
2751
filename = os.path.join(directory, "example-10k.html")
@@ -44,6 +68,56 @@ def test_partition_html_from_file():
4468
assert len(elements) > 0
4569

4670

71+
@pytest.mark.parametrize(
72+
("filename", "encoding", "error"),
73+
[
74+
("example-10k-utf-16.html", "utf-8", UnicodeDecodeError),
75+
("example-steelJIS-datasheet-utf-16.html", "utf-8", UnicodeDecodeError),
76+
],
77+
)
78+
def test_partition_html_from_file_raises_encoding_error(filename, encoding, error):
79+
with pytest.raises(error):
80+
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
81+
with open(filename) as f:
82+
partition_html(file=f, encoding=encoding)
83+
84+
85+
@pytest.mark.parametrize(
86+
"filename",
87+
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"],
88+
)
89+
def test_partition_html_from_file_default_encoding(filename):
90+
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
91+
with open(filename) as f:
92+
elements = partition_html(file=f)
93+
assert len(elements) > 0
94+
95+
96+
@pytest.mark.parametrize(
97+
("filename", "encoding", "error"),
98+
[
99+
("example-10k-utf-16.html", "utf-8", UnicodeDecodeError),
100+
("example-steelJIS-datasheet-utf-16.html", "utf-8", UnicodeDecodeError),
101+
],
102+
)
103+
def test_partition_html_from_file_rb_raises_encoding_error(filename, encoding, error):
104+
with pytest.raises(error):
105+
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
106+
with open(filename, "rb") as f:
107+
partition_html(file=f, encoding=encoding)
108+
109+
110+
@pytest.mark.parametrize(
111+
"filename",
112+
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"],
113+
)
114+
def test_partition_html_from_file_rb_default_encoding(filename):
115+
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
116+
with open(filename, "rb") as f:
117+
elements = partition_html(file=f)
118+
assert len(elements) > 0
119+
120+
47121
def test_partition_html_from_text():
48122
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
49123
with open(filename) as f:

0 commit comments

Comments
 (0)