Skip to content

Commit d24dec5

Browse files
add '|' as a delimiter in csv files (#4059)
This PR fixes the error “Failure to process CSV: Expected 2 fields in line 2, saw 4”, which occurs when '|' is used as the delimiter in a CSV file.
1 parent a040483 commit d24dec5

File tree

6 files changed

+34
-2
lines changed

6 files changed

+34
-2
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
## 0.18.11-dev0
2+
3+
### Enhancements
4+
5+
### Features
6+
7+
### Fixes
8+
- **Recognize '|' as a delimiter** The CSV parser now recognizes '|' as a delimiter in addition to ',' and ';'.
9+
110
## 0.18.10
211

312
### Enhancements
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
col1|col2|col3
2+
a|b|c
3+
d|e|f
4+
g|h|i

test_unstructured/partition/test_constants.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,16 @@
3737
"</table>"
3838
)
3939

40+
EXPECTED_TABLE_WITH_LINE_DELIMITER = (
41+
"<table>"
42+
"<tr><td>col1</td><td>col2</td><td>col3</td></tr>"
43+
"<tr><td>a</td><td>b</td><td>c</td></tr>"
44+
"<tr><td>d</td><td>e</td><td>f</td></tr>"
45+
"<tr><td>g</td><td>h</td><td>i</td></tr>"
46+
"</table>"
47+
)
48+
49+
4050
EXPECTED_TITLE = "Stanley Cups"
4151

4252
EXPECTED_TEXT = (
@@ -54,6 +64,8 @@
5464
"Year Month Revenue Costs 2022 1 123 -123 2023 2 143,1 -814,38 2024 3 215,32 -11,08"
5565
)
5666

67+
EXPECTED_TEXT_WITH_LINE_DELIMITER = "col1 col2 col3 a b c d e f g h i"
68+
5769
EXPECTED_XLS_TABLE = (
5870
"<table><tr>"
5971
"<td>MC</td>"

test_unstructured/partition/test_csv.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,11 @@
1111
EXPECTED_TABLE,
1212
EXPECTED_TABLE_SEMICOLON_DELIMITER,
1313
EXPECTED_TABLE_WITH_EMOJI,
14+
EXPECTED_TABLE_WITH_LINE_DELIMITER,
1415
EXPECTED_TEXT,
1516
EXPECTED_TEXT_SEMICOLON_DELIMITER,
1617
EXPECTED_TEXT_WITH_EMOJI,
18+
EXPECTED_TEXT_WITH_LINE_DELIMITER,
1719
EXPECTED_TEXT_XLSX,
1820
)
1921
from test_unstructured.unit_utils import (
@@ -42,6 +44,11 @@
4244
EXPECTED_TEXT_SEMICOLON_DELIMITER,
4345
EXPECTED_TABLE_SEMICOLON_DELIMITER,
4446
),
47+
(
48+
"csv-with-line-delimiter.csv",
49+
EXPECTED_TEXT_WITH_LINE_DELIMITER,
50+
EXPECTED_TABLE_WITH_LINE_DELIMITER,
51+
),
4552
],
4653
)
4754
def test_partition_csv_from_filename(filename: str, expected_text: str, expected_table: str):

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.18.10" # pragma: no cover
1+
__version__ = "0.18.11-dev0" # pragma: no cover

unstructured/partition/csv.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def delimiter(self) -> str | None:
127127
)
128128

129129
try:
130-
return sniffer.sniff(data, delimiters=",;").delimiter
130+
return sniffer.sniff(data, delimiters=",;|").delimiter
131131
except csv.Error:
132132
# -- sniffing will fail on single-column csv as no default can be assumed --
133133
return None

0 commit comments

Comments
 (0)