Skip to content

Commit 47bc400

Browse files
fix: adjust threshold for encoding detection (#894)
* chore: add example doc
* fix: adjust encoding recognition threshold value in `detect_file_encoding`
* test: add test cases for German characters
* chore: update changelog & version
1 parent 52aced8 commit 47bc400

File tree

5 files changed

+18
-6
lines changed

5 files changed

+18
-6
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.8.0-dev1
1+
## 0.8.0-dev2
22

33
### Enhancements
44

@@ -7,6 +7,7 @@
77
* Add metadata_filename parameter across all partition functions
88

99
### Fixes
10+
* Adjust encoding recognition threshold value in `detect_file_encoding`
1011
* Fix KeyError when `isd_to_elements` doesn't find a type
1112
* Fix _output_filename for local connector, allowing single files to be written correctly to the disk
1213

example-docs/fake-html-lang-de.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<h3 class="l_titel">Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020</h3>

test_unstructured/partition/test_html_partition.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@
1212

1313
DIRECTORY = pathlib.Path(__file__).parent.resolve()
1414

15+
EXPECTED_OUTPUT_LANGUAGE_DE = [
16+
Title(text="Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020"),
17+
]
18+
1519

1620
def test_partition_html_from_filename():
1721
directory = os.path.join(DIRECTORY, "..", "..", "example-docs")
@@ -47,14 +51,16 @@ def test_partition_html_from_filename_raises_encoding_error(filename, encoding,
4751

4852
@pytest.mark.parametrize(
4953
"filename",
50-
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"],
54+
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"],
5155
)
5256
def test_partition_html_from_filename_default_encoding(filename):
    """Partition an example document by path without passing an encoding.

    The result must be non-empty, every element must carry the bare file
    name in its metadata, and the German fixture must additionally produce
    exactly the expected title element.
    """
    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    elements = partition_html(filename=os.path.join(example_docs, filename))
    assert len(elements) > 0
    for item in elements:
        assert item.metadata.filename == filename
    # Only the German fixture has a pinned expected output.
    if filename != "fake-html-lang-de.html":
        return
    assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
5864

5965

6066
def test_partition_html_from_filename_metadata_false():
@@ -108,13 +114,15 @@ def test_partition_html_from_file_raises_encoding_error(filename, encoding, erro
108114

109115
@pytest.mark.parametrize(
110116
"filename",
111-
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"],
117+
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"],
112118
)
113119
def test_partition_html_from_file_default_encoding(filename):
    """Partition an example document from a text-mode file handle, no encoding given.

    The parametrized bare file name stays in ``filename`` and the resolved
    location lives in ``filename_path``. Rebinding ``filename`` to the joined
    path made the fixture-name comparison below always false, so the German
    title assertion was silently skipped.
    """
    filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with open(filename_path) as f:
        elements = partition_html(file=f)
    assert len(elements) > 0
    # Compare against the bare parametrized name, not the joined path.
    if filename == "fake-html-lang-de.html":
        assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
118126

119127

120128
@pytest.mark.parametrize(
@@ -133,13 +141,15 @@ def test_partition_html_from_file_rb_raises_encoding_error(filename, encoding, e
133141

134142
@pytest.mark.parametrize(
135143
"filename",
136-
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"],
144+
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"],
137145
)
138146
def test_partition_html_from_file_rb_default_encoding(filename):
    """Partition an example document from a binary file handle, no encoding given.

    The parametrized bare file name stays in ``filename`` and the resolved
    location lives in ``filename_path``. Rebinding ``filename`` to the joined
    path made the fixture-name comparison below always false, so the German
    title assertion was silently skipped.
    """
    filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with open(filename_path, "rb") as f:
        elements = partition_html(file=f)
    assert len(elements) > 0
    # Compare against the bare parametrized name, not the joined path.
    if filename == "fake-html-lang-de.html":
        assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
143153

144154

145155
def test_partition_html_from_text():

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.8.0-dev1" # pragma: no cover
1+
__version__ = "0.8.0-dev2" # pragma: no cover

unstructured/file_utils/encoding.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from unstructured.partition.common import convert_to_bytes
66

7-
ENCODE_REC_THRESHOLD = 0.5
7+
ENCODE_REC_THRESHOLD = 0.8
88

99
# popular encodings from https://en.wikipedia.org/wiki/Popularity_of_text_encodings
1010
COMMON_ENCODINGS = [

0 commit comments

Comments
 (0)