Skip to content

Commit 8994531

Browse files
committed
Improving backward CSV parsing resilience.
1 parent 1a38261 commit 8994531

File tree

4 files changed

+170
-92
lines changed

4 files changed

+170
-92
lines changed

awswrangler/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
__title__ = "awswrangler"
22
__description__ = "Utility belt to handle data on AWS."
3-
__version__ = "0.0b27"
3+
__version__ = "0.0b28"
44
__license__ = "Apache License 2.0"

awswrangler/pandas.py

Lines changed: 84 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -186,31 +186,33 @@ def _read_csv_iterator(
186186
end -= 1 # Range is inclusive, contrary to Python's List
187187
bytes_range = "bytes={}-{}".format(ini, end)
188188
logger.debug(f"bytes_range: {bytes_range}")
189-
body = client_s3.get_object(Bucket=bucket_name, Key=key_path, Range=bytes_range)["Body"]\
190-
.read()\
191-
.decode("utf-8")
189+
body = client_s3.get_object(Bucket=bucket_name,
190+
Key=key_path,
191+
Range=bytes_range)["Body"].read()
192192
chunk_size = len(body)
193-
logger.debug(f"chunk_size: {chunk_size}")
193+
logger.debug(f"chunk_size (bytes): {chunk_size}")
194194

195195
if count == 1: # first chunk
196196
last_char = Pandas._find_terminator(
197197
body=body,
198+
sep=sep,
198199
quoting=quoting,
199200
quotechar=quotechar,
200201
lineterminator=lineterminator)
201-
forgotten_bytes = len(body[last_char:].encode("utf-8"))
202+
forgotten_bytes = len(body[last_char:])
202203
elif count == bounders_len: # Last chunk
203204
last_char = chunk_size
204205
else:
205206
last_char = Pandas._find_terminator(
206207
body=body,
208+
sep=sep,
207209
quoting=quoting,
208210
quotechar=quotechar,
209211
lineterminator=lineterminator)
210-
forgotten_bytes = len(body[last_char:].encode("utf-8"))
212+
forgotten_bytes = len(body[last_char:])
211213

212214
df = pandas.read_csv(
213-
StringIO(body[:last_char]),
215+
StringIO(body[:last_char].decode("utf-8")),
214216
header=header,
215217
names=names,
216218
sep=sep,
@@ -229,57 +231,95 @@ def _read_csv_iterator(
229231
header = None
230232

231233
@staticmethod
232-
def _find_terminator(body, quoting, quotechar, lineterminator):
234+
def _extract_terminator_profile(body, sep, quotechar, lineterminator,
                                last_index):
    """
    Backward parser for quoted CSV lines.

    Scans body[:last_index] from its last byte towards its first byte.
    The first line terminator met becomes the "suspect". From there on,
    the quote and separator bytes that precede the suspect are counted
    until a byte that is neither quote, separator nor terminator is found
    (or until a second terminator sits immediately before the suspect).

    :param body: Chunk content as bytes (it is indexed byte by byte)
    :param sep: Same as pandas.read_csv()
    :param quotechar: Same as pandas.read_csv()
    :param lineterminator: Same as pandas.read_csv()
    :param last_index: Exclusive upper bound of the scan, with slice
        semantics (None scans the whole body)
    :return: Dict with the profile. Keys:
        "last_terminator_suspect_index" (int or None),
        "first_non_special_byte_index" (int or None),
        "sep_counter" (int) and "quote_counter" (int)
    """
    # NOTE: each marker is folded into ONE integer and compared against
    # single byte values, so a marker that encodes to more than one byte
    # (e.g. "\r\n" or a non-ASCII separator) can never match.
    sep_int = int.from_bytes(bytes=sep.encode(encoding="utf-8"),
                             byteorder="big")  # b"," -> 44
    quote_int = int.from_bytes(bytes=quotechar.encode(encoding="utf-8"),
                               byteorder="big")  # b'"' -> 34
    terminator_int = int.from_bytes(
        bytes=lineterminator.encode(encoding="utf-8"),
        byteorder="big")  # b"\n" -> 10
    logger.debug(f"sep_int: {sep_int}")
    logger.debug(f"quote_int: {quote_int}")
    logger.debug(f"terminator_int: {terminator_int}")
    last_terminator_suspect_index = None
    first_non_special_byte_index = None
    sep_counter = 0
    quote_counter = 0
    # Slicing the range (instead of the body) yields the very same indexes
    # without copying the whole chunk on every call.
    for i in reversed(range(len(body))[:last_index]):
        b = body[i]
        # Indexes are compared against None explicitly: 0 is a valid index.
        if last_terminator_suspect_index is not None:
            if b == quote_int:
                quote_counter += 1
            elif b == sep_int:
                sep_counter += 1
            elif b != terminator_int:
                # Ordinary data byte: the run of special bytes is over.
                first_non_special_byte_index = i
                break
        if b == terminator_int:
            if last_terminator_suspect_index is None:
                last_terminator_suspect_index = i
            elif last_terminator_suspect_index - 1 == i:
                # Terminator immediately before the suspect: stop here.
                first_non_special_byte_index = i
                break
    logger.debug(
        f"last_terminator_suspect_index: {last_terminator_suspect_index}")
    logger.debug(
        f"first_non_special_byte_index: {first_non_special_byte_index}")
    logger.debug(f"sep_counter: {sep_counter}")
    logger.debug(f"quote_counter: {quote_counter}")
    return {
        "last_terminator_suspect_index": last_terminator_suspect_index,
        "first_non_special_byte_index": first_non_special_byte_index,
        "sep_counter": sep_counter,
        "quote_counter": quote_counter
    }
288+
289+
@staticmethod
290+
def _find_terminator(body, sep, quoting, quotechar, lineterminator):
    """
    Find the last suspected line terminator (searching from end to start).

    :param body: Chunk content as bytes
    :param sep: Same as pandas.read_csv()
    :param quoting: Same as pandas.read_csv()
    :param quotechar: Same as pandas.read_csv()
    :param lineterminator: Same as pandas.read_csv()
    :return: The index of the suspect line terminator
    :raises LineTerminatorNotFound: If no acceptable terminator exists
    """
    try:
        if quoting != csv.QUOTE_ALL:
            # Without full quoting the last terminator is taken as is.
            return body.rindex(lineterminator.encode(encoding="utf-8"))
        last_index = None
        while True:
            profile = Pandas._extract_terminator_profile(
                body=body,
                sep=sep,
                quotechar=quotechar,
                lineterminator=lineterminator,
                last_index=last_index)
            suspect = profile["last_terminator_suspect_index"]
            # Compare against None explicitly: index 0 is a legitimate
            # value for the first non special byte and must not be taken
            # as "not found".
            if suspect is None or profile[
                    "first_non_special_byte_index"] is None:
                raise LineTerminatorNotFound()
            if profile["quote_counter"] % 2 != 0:
                # Odd number of quotes right before the terminator:
                # accepted as the end of a record.
                return suspect
            # Even number (zero included): rejected, so search again
            # strictly before this suspect.
            last_index = suspect
    except ValueError:
        raise LineTerminatorNotFound()

data_samples/complex.csv

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ BOO
77
BAR
88
xxxxx
99
ÁÃÀÂÇ
10-
zzzzz"
10+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"
1111
"2018-01-01 04:03:02.001","2019-02-02","12345.6789","123456789","","foo
1212
boo
1313
bar
@@ -16,7 +16,7 @@ BOO
1616
BAR
1717
xxxxx
1818
ÁÃÀÂÇ
19-
zzzzz"
19+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"
2020
"2018-01-01 04:03:02.001","2019-02-02","12345.6789","123456789",,"foo
2121
boo
2222
bar
@@ -25,7 +25,7 @@ BOO
2525
BAR
2626
xxxxx
2727
ÁÃÀÂÇ
28-
zzzzz"
28+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"
2929
"2018-01-01 04:03:02.001","2019-02-02","12345.6789","123456789","","foo
3030
boo
3131
bar
@@ -34,7 +34,7 @@ BOO
3434
BAR
3535
xxxxx
3636
ÁÃÀÂÇ
37-
zzzzz"
37+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"
3838
"2018-01-01 04:03:02.001","2019-02-02","12345.6789","123456789",,"foo
3939
boo
4040
bar
@@ -43,7 +43,7 @@ BOO
4343
BAR
4444
xxxxx
4545
ÁÃÀÂÇ
46-
zzzzz"
46+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"
4747
"2018-01-01 04:03:02.001","2019-02-02","12345.6789","123456789","","foo
4848
boo
4949
bar
@@ -52,7 +52,7 @@ BOO
5252
BAR
5353
xxxxx
5454
ÁÃÀÂÇ
55-
zzzzz"
55+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"
5656
"2018-01-01 04:03:02.001","2019-02-02","12345.6789","123456789",,"foo
5757
boo
5858
bar
@@ -61,7 +61,7 @@ BOO
6161
BAR
6262
xxxxx
6363
ÁÃÀÂÇ
64-
zzzzz"
64+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"
6565
"2018-01-01 04:03:02.001","2019-02-02","12345.6789","123456789",,"foo
6666
boo
6767
bar
@@ -70,7 +70,7 @@ BOO
7070
BAR
7171
xxxxx
7272
ÁÃÀÂÇ
73-
zzzzz"
73+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"
7474
"2018-01-01 04:03:02.001","2019-02-02","12345.6789","123456789",,"foo
7575
boo
7676
bar
@@ -79,7 +79,7 @@ BOO
7979
BAR
8080
xxxxx
8181
ÁÃÀÂÇ
82-
zzzzz"
82+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"
8383
"2018-01-01 04:03:02.001","2019-02-02","12345.6789","123456789",,"foo
8484
boo
8585
bar
@@ -88,7 +88,7 @@ BOO
8888
BAR
8989
xxxxx
9090
ÁÃÀÂÇ
91-
zzzzz"
91+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"
9292
"2018-01-01 04:03:02.001","2019-02-02","12345.6789","123456789",,"foo
9393
boo
9494
bar
@@ -97,7 +97,7 @@ BOO
9797
BAR
9898
xxxxx
9999
ÁÃÀÂÇ
100-
zzzzz"
100+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"
101101
"2018-01-01 04:03:02.001","2019-02-02","12345.6789","123456789","1","foo
102102
boo
103103
bar
@@ -106,7 +106,7 @@ BOO
106106
BAR
107107
xxxxx
108108
ÁÃÀÂÇ
109-
zzzzz"
109+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"
110110
"2018-01-01 04:03:02.001","2019-02-02","12345.6789","123456789","1","foo
111111
boo
112112
bar
@@ -115,7 +115,7 @@ BOO
115115
BAR
116116
xxxxx
117117
ÁÃÀÂÇ
118-
zzzzz"
118+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"
119119
"2018-01-01 04:03:02.001","2019-02-02","12345.6789","123456789","1","foo
120120
boo
121121
bar
@@ -124,7 +124,7 @@ BOO
124124
BAR
125125
xxxxx
126126
ÁÃÀÂÇ
127-
zzzzz"
127+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"
128128
"2018-01-01 04:03:02.001","2019-02-02","12345.6789","123456789","1","foo
129129
boo
130130
bar
@@ -133,7 +133,7 @@ BOO
133133
BAR
134134
xxxxx
135135
ÁÃÀÂÇ
136-
zzzzz"
136+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"
137137
"2018-01-01 04:03:02.001","2019-02-02","12345.6789","123456789","1","foo
138138
boo
139139
bar
@@ -142,7 +142,7 @@ BOO
142142
BAR
143143
xxxxx
144144
ÁÃÀÂÇ
145-
zzzzz"
145+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"
146146
"2018-01-01 04:03:02.001","2019-02-02","12345.6789","123456789","1","foo
147147
boo
148148
bar
@@ -151,7 +151,7 @@ BOO
151151
BAR
152152
xxxxx
153153
ÁÃÀÂÇ
154-
zzzzz"
154+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"
155155
"2018-01-01 04:03:02.001","2019-02-02","12345.6789","123456789","1","foo
156156
boo
157157
bar
@@ -160,7 +160,7 @@ BOO
160160
BAR
161161
xxxxx
162162
ÁÃÀÂÇ
163-
zzzzz"
163+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"
164164
"2018-01-01 04:03:02.001","2019-02-02","12345.6789","123456789","1","foo
165165
boo
166166
bar
@@ -169,7 +169,7 @@ BOO
169169
BAR
170170
xxxxx
171171
ÁÃÀÂÇ
172-
zzzzz"
172+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"
173173
"2018-01-01 04:03:02.001","2019-02-02","12345.6789","123456789","1","foo
174174
boo
175175
bar
@@ -178,4 +178,4 @@ BOO
178178
BAR
179179
xxxxx
180180
ÁÃÀÂÇ
181-
zzzzz"
181+
汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå汉字汉字汉字汉字汉字汉字汉字æøåæøåæøåæøåæøåæøåæøåæøåæøåæøå"

0 commit comments

Comments
 (0)