Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 32 additions & 38 deletions pandas/tests/io/parser/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
)
import os
import tempfile
import uuid

import numpy as np
import pytest
Expand Down Expand Up @@ -51,28 +50,26 @@ def test_read_csv_unicode(all_parsers):
@skip_pyarrow
@pytest.mark.parametrize("sep", [",", "\t"])
@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"])
def test_utf16_bom_skiprows(all_parsers, sep, encoding):
def test_utf16_bom_skiprows(all_parsers, sep, encoding, temp_file):
# see gh-2298
parser = all_parsers
data = """skip this
skip this too
A,B,C
1,2,3
4,5,6""".replace(",", sep)
path = f"__{uuid.uuid4()}__.csv"
kwargs = {"sep": sep, "skiprows": 2}
utf8 = "utf-8"

with tm.ensure_clean(path) as path:
bytes_data = data.encode(encoding)
bytes_data = data.encode(encoding)

with open(path, "wb") as f:
f.write(bytes_data)
with open(temp_file, "wb") as f:
f.write(bytes_data)

with TextIOWrapper(BytesIO(data.encode(utf8)), encoding=utf8) as bytes_buffer:
result = parser.read_csv(path, encoding=encoding, **kwargs)
expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs)
tm.assert_frame_equal(result, expected)
with TextIOWrapper(BytesIO(data.encode(utf8)), encoding=utf8) as bytes_buffer:
result = parser.read_csv(temp_file, encoding=encoding, **kwargs)
expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs)
tm.assert_frame_equal(result, expected)


def test_utf16_example(all_parsers, csv_dir_path):
Expand Down Expand Up @@ -240,7 +237,7 @@ def test_parse_encoded_special_characters(encoding):


@pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"])
def test_encoding_memory_map(all_parsers, encoding):
def test_encoding_memory_map(all_parsers, encoding, temp_file):
# GH40986
parser = all_parsers
expected = DataFrame(
Expand All @@ -250,20 +247,19 @@ def test_encoding_memory_map(all_parsers, encoding):
"weapon": ["sai", "bo staff", "nunchunk", "katana"],
}
)
with tm.ensure_clean() as file:
expected.to_csv(file, index=False, encoding=encoding)
expected.to_csv(temp_file, index=False, encoding=encoding)

if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(file, encoding=encoding, memory_map=True)
return
if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(temp_file, encoding=encoding, memory_map=True)
return

df = parser.read_csv(file, encoding=encoding, memory_map=True)
df = parser.read_csv(temp_file, encoding=encoding, memory_map=True)
tm.assert_frame_equal(df, expected)


def test_chunk_splits_multibyte_char(all_parsers):
def test_chunk_splits_multibyte_char(all_parsers, temp_file):
"""
Chunk splits a multibyte character with memory_map=True

Expand All @@ -276,20 +272,19 @@ def test_chunk_splits_multibyte_char(all_parsers):
# Put two-bytes utf-8 encoded character "ą" at the end of chunk
# utf-8 encoding of "ą" is b'\xc4\x85'
df.iloc[2047] = "a" * 127 + "ą"
with tm.ensure_clean("bug-gh43540.csv") as fname:
df.to_csv(fname, index=False, header=False, encoding="utf-8")
df.to_csv(temp_file, index=False, header=False, encoding="utf-8")

if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(fname, header=None, memory_map=True)
return
if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(temp_file, header=None, memory_map=True)
return

dfr = parser.read_csv(fname, header=None, memory_map=True)
dfr = parser.read_csv(temp_file, header=None, memory_map=True)
tm.assert_frame_equal(dfr, df)


def test_readcsv_memmap_utf8(all_parsers):
def test_readcsv_memmap_utf8(all_parsers, temp_file):
"""
GH 43787

Expand All @@ -310,16 +305,15 @@ def test_readcsv_memmap_utf8(all_parsers):
lines.append(line)
parser = all_parsers
df = DataFrame(lines)
with tm.ensure_clean("utf8test.csv") as fname:
df.to_csv(fname, index=False, header=False, encoding="utf-8")
df.to_csv(temp_file, index=False, header=False, encoding="utf-8")

if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8")
return
if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(temp_file, header=None, memory_map=True, encoding="utf-8")
return

dfr = parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8")
dfr = parser.read_csv(temp_file, header=None, memory_map=True, encoding="utf-8")
tm.assert_frame_equal(df, dfr)


Expand Down
45 changes: 28 additions & 17 deletions pandas/tests/io/xml/test_xml_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,10 @@ def iterparse(request):
return request.param


def read_xml_iterparse(data, **kwargs):
with tm.ensure_clean() as path:
with open(path, "w", encoding="utf-8") as f:
f.write(data)
return read_xml(path, **kwargs)
def read_xml_iterparse(data, temp_file, **kwargs):
with open(temp_file, "w", encoding="utf-8") as f:
f.write(data)
return read_xml(temp_file, **kwargs)


xml_types = """\
Expand Down Expand Up @@ -84,13 +83,14 @@ def read_xml_iterparse(data, **kwargs):
# DTYPE


def test_dtype_single_str(parser):
def test_dtype_single_str(parser, temp_file):
df_result = read_xml(StringIO(xml_types), dtype={"degrees": "str"}, parser=parser)
df_iter = read_xml_iterparse(
xml_types,
parser=parser,
dtype={"degrees": "str"},
iterparse={"row": ["shape", "degrees", "sides"]},
temp_file=temp_file,
)

df_expected = DataFrame(
Expand All @@ -105,13 +105,14 @@ def test_dtype_single_str(parser):
tm.assert_frame_equal(df_iter, df_expected)


def test_dtypes_all_str(parser):
def test_dtypes_all_str(parser, temp_file):
df_result = read_xml(StringIO(xml_dates), dtype="string", parser=parser)
df_iter = read_xml_iterparse(
xml_dates,
parser=parser,
dtype="string",
iterparse={"row": ["shape", "degrees", "sides", "date"]},
temp_file=temp_file,
)

df_expected = DataFrame(
Expand All @@ -128,7 +129,7 @@ def test_dtypes_all_str(parser):
tm.assert_frame_equal(df_iter, df_expected)


def test_dtypes_with_names(parser):
def test_dtypes_with_names(parser, temp_file):
df_result = read_xml(
StringIO(xml_dates),
names=["Col1", "Col2", "Col3", "Col4"],
Expand All @@ -141,6 +142,7 @@ def test_dtypes_with_names(parser):
names=["Col1", "Col2", "Col3", "Col4"],
dtype={"Col2": "string", "Col3": "Int64", "Col4": "datetime64[ns]"},
iterparse={"row": ["shape", "degrees", "sides", "date"]},
temp_file=temp_file,
)

df_expected = DataFrame(
Expand All @@ -158,13 +160,14 @@ def test_dtypes_with_names(parser):
tm.assert_frame_equal(df_iter, df_expected)


def test_dtype_nullable_int(parser):
def test_dtype_nullable_int(parser, temp_file):
df_result = read_xml(StringIO(xml_types), dtype={"sides": "Int64"}, parser=parser)
df_iter = read_xml_iterparse(
xml_types,
parser=parser,
dtype={"sides": "Int64"},
iterparse={"row": ["shape", "degrees", "sides"]},
temp_file=temp_file,
)

df_expected = DataFrame(
Expand All @@ -179,13 +182,14 @@ def test_dtype_nullable_int(parser):
tm.assert_frame_equal(df_iter, df_expected)


def test_dtype_float(parser):
def test_dtype_float(parser, temp_file):
df_result = read_xml(StringIO(xml_types), dtype={"degrees": "float"}, parser=parser)
df_iter = read_xml_iterparse(
xml_types,
parser=parser,
dtype={"degrees": "float"},
iterparse={"row": ["shape", "degrees", "sides"]},
temp_file=temp_file,
)

df_expected = DataFrame(
Expand All @@ -209,7 +213,7 @@ def test_wrong_dtype(xml_books, parser, iterparse):
)


def test_both_dtype_converters(parser):
def test_both_dtype_converters(parser, temp_file):
df_expected = DataFrame(
{
"shape": ["square", "circle", "triangle"],
Expand All @@ -231,6 +235,7 @@ def test_both_dtype_converters(parser):
converters={"degrees": str},
parser=parser,
iterparse={"row": ["shape", "degrees", "sides"]},
temp_file=temp_file,
)

tm.assert_frame_equal(df_result, df_expected)
Expand All @@ -240,7 +245,7 @@ def test_both_dtype_converters(parser):
# CONVERTERS


def test_converters_str(parser):
def test_converters_str(parser, temp_file):
df_result = read_xml(
StringIO(xml_types), converters={"degrees": str}, parser=parser
)
Expand All @@ -249,6 +254,7 @@ def test_converters_str(parser):
parser=parser,
converters={"degrees": str},
iterparse={"row": ["shape", "degrees", "sides"]},
temp_file=temp_file,
)

df_expected = DataFrame(
Expand All @@ -263,7 +269,7 @@ def test_converters_str(parser):
tm.assert_frame_equal(df_iter, df_expected)


def test_converters_date(parser):
def test_converters_date(parser, temp_file):
convert_to_datetime = lambda x: to_datetime(x)
df_result = read_xml(
StringIO(xml_dates), converters={"date": convert_to_datetime}, parser=parser
Expand All @@ -273,6 +279,7 @@ def test_converters_date(parser):
parser=parser,
converters={"date": convert_to_datetime},
iterparse={"row": ["shape", "degrees", "sides", "date"]},
temp_file=temp_file,
)

df_expected = DataFrame(
Expand Down Expand Up @@ -312,13 +319,14 @@ def test_callable_str_converters(xml_books, parser, iterparse):
# PARSE DATES


def test_parse_dates_column_name(parser):
def test_parse_dates_column_name(parser, temp_file):
df_result = read_xml(StringIO(xml_dates), parse_dates=["date"], parser=parser)
df_iter = read_xml_iterparse(
xml_dates,
parser=parser,
parse_dates=["date"],
iterparse={"row": ["shape", "degrees", "sides", "date"]},
temp_file=temp_file,
)

df_expected = DataFrame(
Expand All @@ -334,13 +342,14 @@ def test_parse_dates_column_name(parser):
tm.assert_frame_equal(df_iter, df_expected)


def test_parse_dates_column_index(parser):
def test_parse_dates_column_index(parser, temp_file):
df_result = read_xml(StringIO(xml_dates), parse_dates=[3], parser=parser)
df_iter = read_xml_iterparse(
xml_dates,
parser=parser,
parse_dates=[3],
iterparse={"row": ["shape", "degrees", "sides", "date"]},
temp_file=temp_file,
)

df_expected = DataFrame(
Expand All @@ -356,14 +365,15 @@ def test_parse_dates_column_index(parser):
tm.assert_frame_equal(df_iter, df_expected)


def test_parse_dates_true(parser):
def test_parse_dates_true(parser, temp_file):
df_result = read_xml(StringIO(xml_dates), parse_dates=True, parser=parser)

df_iter = read_xml_iterparse(
xml_dates,
parser=parser,
parse_dates=True,
iterparse={"row": ["shape", "degrees", "sides", "date"]},
temp_file=temp_file,
)

df_expected = DataFrame(
Expand All @@ -379,7 +389,7 @@ def test_parse_dates_true(parser):
tm.assert_frame_equal(df_iter, df_expected)


def test_day_first_parse_dates(parser):
def test_day_first_parse_dates(parser, temp_file):
xml = """\
<?xml version='1.0' encoding='utf-8'?>
<data>
Expand Down Expand Up @@ -421,6 +431,7 @@ def test_day_first_parse_dates(parser):
parse_dates=["date"],
parser=parser,
iterparse={"row": ["shape", "degrees", "sides", "date"]},
temp_file=temp_file,
)

tm.assert_frame_equal(df_result, df_expected)
Expand Down
Loading