Passing test using both to_csv and read_csv

Jaspvr · Jaspvr · commit 2cac1bb5eaa7 · 2025-03-28T00:45:03.000-07:00
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -25,6 +25,8 @@
 import warnings
 
 import numpy as np
+import pandas as pd
+import json
 
 from pandas._libs import lib
 from pandas._libs.parsers import STR_NA_VALUES
@@ -831,6 +833,7 @@ def read_csv(
     memory_map: bool = False,
     float_precision: Literal["high", "legacy", "round_trip"] | None = None,
     storage_options: StorageOptions | None = None,
+    preserve_complex: bool = False,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
 ) -> DataFrame | TextFileReader:
     # locals() should never be modified
@@ -850,7 +853,33 @@ def read_csv(
     )
     kwds.update(kwds_defaults)
 
-    return _read(filepath_or_buffer, kwds)
+    df_or_reader = _read(filepath_or_buffer, kwds)
+    # If DataFrame, parse columns containing JSON arrays if preserve_complex=True
+    if preserve_complex and isinstance(df_or_reader, DataFrame):
+        _restore_complex_arrays(df_or_reader)
+
+    return df_or_reader
+
+
+def _restore_complex_arrays(df: DataFrame) -> None:
+    """
+    Parse object columns of JSON-array strings into NumPy arrays, in place.
+
+    A column is converted only if every non-null value is a string such as
+    "[0.1, 0.2, 0.3]" that parses as JSON; otherwise it is left untouched.
+    """
+    def parse(x):
+        if isinstance(x, str) and x.startswith("[") and x.endswith("]"):
+            return np.array(json.loads(x))
+        raise ValueError(x)  # not array text: tells the loop to skip the column
+
+    for col in df.columns:
+        if df[col].dtype != "object" or df[col].isna().all():
+            continue  # non-object or all-null columns have nothing to parse
+        try:
+            df[col] = df[col].map(parse, na_action="ignore")  # nulls kept as is
+        except ValueError:
+            continue  # invalid JSON such as "[abc]": keep the original strings
 
 
 @overload
diff --git a/scripts/tests/test_csv.py b/scripts/tests/test_csv.py
@@ -1,60 +1,60 @@
-# import pandas as pd
-# import numpy as np
+import pandas as pd
+print(pd.__file__)
+print(pd.__version__)
+
+import numpy as np
+import os
 
 # # Create a DataFrame with NumPy arrays
 # df = pd.DataFrame({
 #     'id': [1, 2],
 #     'embedding': [np.array([0.1, 0.2, 0.3]), np.array([0.4, 0.5, 0.6])]
 # })
 
-# # Save to CSV
+# # Save to CSV (where your custom preserve_complex logic resides)
 # csv_file = "test_numpy_array.csv"
 # df.to_csv(csv_file, index=False, preserve_complex=True)
-# print(f"Saved CSV:\n{open(csv_file).read()}")
 
-# # Read back the CSV
-# df_loaded = pd.read_csv(csv_file)
+# # Read back the raw CSV content (as text only)
+# with open(csv_file, "r") as f:
+#     csv_content = f.read()
 
-# # Print results
-# print("\nLoaded DataFrame:")
-# print(df_loaded)
+# print(f"Saved CSV:\n{csv_content}")
 
-# # ✅ **Make the test fail by checking if we correctly load NumPy arrays**
+# # Simple test: check that our JSON-ified arrays are present in the CSV text
 # try:
-#     assert isinstance(df_loaded["embedding"][0], np.ndarray), "Test Failed: Embeddings were not preserved as NumPy arrays!"
-#     print("\nTest Passed: Embeddings were correctly preserved as NumPy arrays")
-# except AssertionError as e:
-#     print("\nTest Failed: Pandas does not preserve NumPy arrays in CSV, needs improvement!")
-#     raise e
+#     assert "[0.1, 0.2, 0.3]" in csv_content
+#     assert "[0.4, 0.5, 0.6]" in csv_content
+#     print("\nTest Passed: The CSV output includes JSON-serialized arrays for 'embedding'.")
+# except AssertionError:
+#     print("\nTest Failed: The CSV does not appear to have JSON-serialized arrays as expected!")
+#     raise
 
-import pandas as pd
-print(pd.__file__)
-print(pd.__version__)
 
-import numpy as np
-import os
 
+# TEST2: round-trip a DataFrame through to_csv and read_csv
 # Create a DataFrame with NumPy arrays
 df = pd.DataFrame({
     'id': [1, 2],
     'embedding': [np.array([0.1, 0.2, 0.3]), np.array([0.4, 0.5, 0.6])]
 })
 
-# Save to CSV (where your custom preserve_complex logic resides)
+# Save to CSV
 csv_file = "test_numpy_array.csv"
 df.to_csv(csv_file, index=False, preserve_complex=True)
+print(f"Saved CSV:\n{open(csv_file).read()}")
 
-# Read back the raw CSV content (as text only)
-with open(csv_file, "r") as f:
-    csv_content = f.read()
+# Read back the CSV
+df_loaded = pd.read_csv(csv_file, preserve_complex=True)
 
-print(f"Saved CSV:\n{csv_content}")
+# Print results
+print("\nLoaded DataFrame:")
+print(df_loaded)
 
-# Simple test: check that our JSON-ified arrays are present in the CSV text
+# Check that the embeddings were loaded back as NumPy arrays
 try:
-    assert "[0.1, 0.2, 0.3]" in csv_content
-    assert "[0.4, 0.5, 0.6]" in csv_content
-    print("\nTest Passed: The CSV output includes JSON-serialized arrays for 'embedding'.")
-except AssertionError:
-    print("\nTest Failed: The CSV does not appear to have JSON-serialized arrays as expected!")
-    raise
+    assert isinstance(df_loaded["embedding"][0], np.ndarray), "Test Failed: Embeddings were not preserved as NumPy arrays!"
+    print("\nTest Passed: Embeddings were correctly preserved as NumPy arrays")
+except AssertionError as e:
+    print("\nTest Failed: Pandas does not preserve NumPy arrays in CSV, needs improvement!")
+    raise e