Skip to content

Commit 2cac1bb

Browse files
committed
Passing test using both to_csv and read_csv
1 parent f03ff7c commit 2cac1bb

File tree

2 files changed

+62
-33
lines changed

2 files changed

+62
-33
lines changed

pandas/io/parsers/readers.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
import warnings
2626

2727
import numpy as np
28+
import pandas as pd
29+
import json
2830

2931
from pandas._libs import lib
3032
from pandas._libs.parsers import STR_NA_VALUES
@@ -831,6 +833,7 @@ def read_csv(
831833
memory_map: bool = False,
832834
float_precision: Literal["high", "legacy", "round_trip"] | None = None,
833835
storage_options: StorageOptions | None = None,
836+
preserve_complex: bool = False,
834837
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
835838
) -> DataFrame | TextFileReader:
836839
# locals() should never be modified
@@ -850,7 +853,33 @@ def read_csv(
850853
)
851854
kwds.update(kwds_defaults)
852855

853-
return _read(filepath_or_buffer, kwds)
856+
df_or_reader = _read(filepath_or_buffer, kwds)
857+
# If DataFrame, parse columns containing JSON arrays if preserve_complex=True
858+
if preserve_complex and isinstance(df_or_reader, DataFrame):
859+
_restore_complex_arrays(df_or_reader)
860+
861+
return df_or_reader
862+
863+
864+
def _restore_complex_arrays(df: DataFrame) -> None:
    """
    Convert columns of JSON-array strings back into NumPy arrays, in place.

    A column is converted only when all of the following hold:

    * its dtype is ``object`` or a pandas string dtype,
    * it has at least one non-missing value,
    * every non-missing value is a string of the form ``"[...]"``,
    * every such string parses as JSON and converts to a NumPy array.

    If any value in a column fails to parse, the whole column is left
    untouched, so ordinary text that merely happens to be bracketed (for
    example ``"[draft]"``) never aborts the read.

    Parameters
    ----------
    df : DataFrame
        Frame to modify. Converted columns are replaced by object columns
        whose non-missing entries are ``numpy.ndarray`` instances; missing
        entries are kept as they were.

    Returns
    -------
    None
        The frame is mutated in place.
    """
    for col in df.columns:
        series = df[col]
        # Only string-bearing columns can contain serialized arrays.
        if not (series.dtype == "object" or isinstance(series.dtype, pd.StringDtype)):
            continue
        restored = _parse_json_array_column(series)
        if restored is not None:
            df[col] = restored


def _parse_json_array_column(series) -> "np.ndarray | None":
    """
    Parse every non-missing value of ``series`` as a JSON array.

    Returns a 1-D object ndarray aligned positionally with ``series`` (missing
    values are carried over unchanged), or ``None`` when the column should be
    left alone: it is entirely missing, contains a non-missing value that is
    not a bracketed string, or contains a value that cannot be parsed.
    """
    missing_mask = series.isna().to_numpy()
    if missing_mask.all():
        # Also covers the empty column: nothing to restore.
        return None

    # Pre-allocating an object array and filling it element by element keeps
    # each parsed array as a single cell instead of letting NumPy stack them.
    restored = np.empty(len(series), dtype=object)
    for pos, (value, is_missing) in enumerate(zip(series, missing_mask)):
        if is_missing:
            restored[pos] = value
            continue
        if not (
            isinstance(value, str)
            and value.startswith("[")
            and value.endswith("]")
        ):
            return None
        try:
            # json.JSONDecodeError subclasses ValueError; np.array raises
            # ValueError for ragged nested lists on recent NumPy versions.
            restored[pos] = np.array(json.loads(value))
        except ValueError:
            return None
    return restored
854883

855884

856885
@overload

scripts/tests/test_csv.py

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,60 +1,60 @@
1-
# import pandas as pd
2-
# import numpy as np
1+
import pandas as pd
2+
print(pd.__file__)
3+
print(pd.__version__)
4+
5+
import numpy as np
6+
import os
37

48
# # Create a DataFrame with NumPy arrays
59
# df = pd.DataFrame({
610
# 'id': [1, 2],
711
# 'embedding': [np.array([0.1, 0.2, 0.3]), np.array([0.4, 0.5, 0.6])]
812
# })
913

10-
# # Save to CSV
14+
# # Save to CSV (where your custom preserve_complex logic resides)
1115
# csv_file = "test_numpy_array.csv"
1216
# df.to_csv(csv_file, index=False, preserve_complex=True)
13-
# print(f"Saved CSV:\n{open(csv_file).read()}")
1417

15-
# # Read back the CSV
16-
# df_loaded = pd.read_csv(csv_file)
18+
# # Read back the raw CSV content (as text only)
19+
# with open(csv_file, "r") as f:
20+
# csv_content = f.read()
1721

18-
# # Print results
19-
# print("\nLoaded DataFrame:")
20-
# print(df_loaded)
22+
# print(f"Saved CSV:\n{csv_content}")
2123

22-
# # ✅ **Make the test fail by checking if we correctly load NumPy arrays**
24+
# # Simple test: check that our JSON-ified arrays are present in the CSV text
2325
# try:
24-
# assert isinstance(df_loaded["embedding"][0], np.ndarray), "Test Failed: Embeddings were not preserved as NumPy arrays!"
25-
# print("\nTest Passed: Embeddings were correctly preserved as NumPy arrays")
26-
# except AssertionError as e:
27-
# print("\nTest Failed: Pandas does not preserve NumPy arrays in CSV, needs improvement!")
28-
# raise e
26+
# assert "[0.1, 0.2, 0.3]" in csv_content
27+
# assert "[0.4, 0.5, 0.6]" in csv_content
28+
# print("\nTest Passed: The CSV output includes JSON-serialized arrays for 'embedding'.")
29+
# except AssertionError:
30+
# print("\nTest Failed: The CSV does not appear to have JSON-serialized arrays as expected!")
31+
# raise
2932

30-
import pandas as pd
31-
print(pd.__file__)
32-
print(pd.__version__)
3333

34-
import numpy as np
35-
import os
3634

35+
# TEST2
3736
# Create a DataFrame with NumPy arrays
3837
df = pd.DataFrame({
3938
'id': [1, 2],
4039
'embedding': [np.array([0.1, 0.2, 0.3]), np.array([0.4, 0.5, 0.6])]
4140
})
4241

43-
# Save to CSV (where your custom preserve_complex logic resides)
42+
# Save to CSV
4443
csv_file = "test_numpy_array.csv"
4544
df.to_csv(csv_file, index=False, preserve_complex=True)
45+
print(f"Saved CSV:\n{open(csv_file).read()}")
4646

47-
# Read back the raw CSV content (as text only)
48-
with open(csv_file, "r") as f:
49-
csv_content = f.read()
47+
# Read back the CSV
48+
df_loaded = pd.read_csv(csv_file, preserve_complex=True)
5049

51-
print(f"Saved CSV:\n{csv_content}")
50+
# Print results
51+
print("\nLoaded DataFrame:")
52+
print(df_loaded)
5253

53-
# Simple test: check that our JSON-ified arrays are present in the CSV text
54+
# ✅ **Make the test fail by checking if we correctly load NumPy arrays**
5455
try:
55-
assert "[0.1, 0.2, 0.3]" in csv_content
56-
assert "[0.4, 0.5, 0.6]" in csv_content
57-
print("\nTest Passed: The CSV output includes JSON-serialized arrays for 'embedding'.")
58-
except AssertionError:
59-
print("\nTest Failed: The CSV does not appear to have JSON-serialized arrays as expected!")
60-
raise
56+
assert isinstance(df_loaded["embedding"][0], np.ndarray), "Test Failed: Embeddings were not preserved as NumPy arrays!"
57+
print("\nTest Passed: Embeddings were correctly preserved as NumPy arrays")
58+
except AssertionError as e:
59+
print("\nTest Failed: Pandas does not preserve NumPy arrays in CSV, needs improvement!")
60+
raise e

0 commit comments

Comments
 (0)