GH-47861: [Python] reduce memory usage when using to_pandas() with many extension array columns (#47860)

Pear0 · pitrou · web-flow · commit 57f626189326 · 2025-10-28T11:03:26.000+01:00
### Rationale for this change See GH-47861. With this change, the extension array variation takes ~192MB of memory instead of 7GB. From what I can tell, this is because the `PandasOptions` struct is copied around frequently (for example, it seems like there is an `ExtensionWriter` for each extension column, and each `ExtensionWriter` has a copy of `PandasOptions`, which has a set of all extension columns). I haven't fully traced the `PandasOptions` structure, but it seems to get copied and modified in some code paths, so I have decided to put the column sets into a `std::shared_ptr` rather than pass around a `shared_ptr<PandasOptions>`. ### What changes are included in this PR? The `PandasOptions` column sets have been swapped from `std::unordered_set<std::string>` to `std::shared_ptr<const std::unordered_set<std::string>>`, and usages have been updated. ### Are these changes tested? Yes; there are no regressions in the pytests. Memory usage was also tested by hand. ### Are there any user-facing changes? All changes are internal to the pyarrow C++ binding code. There are no changes to the exposed Python API. * GitHub Issue: #47861 Lead-authored-by: Will Gulian <williamgulian@gmail.com> Co-authored-by: Will Gulian <git@will.gl> Co-authored-by: Antoine Pitrou <pitrou@free.fr> Signed-off-by: Antoine Pitrou <antoine@python.org>
diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd
@@ -198,8 +198,8 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
         c_bool self_destruct
         MapConversionType maps_as_pydicts
         c_bool decode_dictionaries
-        unordered_set[c_string] categorical_columns
-        unordered_set[c_string] extension_columns
+        shared_ptr[const unordered_set[c_string]] categorical_columns
+        shared_ptr[const unordered_set[c_string]] extension_columns
         c_bool to_numpy
 
 
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -75,7 +75,7 @@ PandasOptions MakeInnerOptions(PandasOptions options) {
   // Make sure conversion of inner dictionary arrays always returns an array,
   // not a dict {'indices': array, 'dictionary': array, 'ordered': bool}
   options.decode_dictionaries = true;
-  options.categorical_columns.clear();
+  options.categorical_columns.reset();
   options.strings_to_categorical = false;
 
   // In ARROW-7723, we found as a result of ARROW-3789 that second
@@ -2337,7 +2337,7 @@ class ConsolidatedBlockCreator : public PandasBlockCreator {
   }
 
   Status GetBlockType(int column_index, PandasWriter::type* out) {
-    if (options_.extension_columns.count(fields_[column_index]->name())) {
+    if (options_.IsExtensionColumn(fields_[column_index]->name())) {
       *out = PandasWriter::EXTENSION;
       return Status::OK();
     } else {
@@ -2458,7 +2458,7 @@ class SplitBlockCreator : public PandasBlockCreator {
   Status GetWriter(int i, std::shared_ptr<PandasWriter>* writer) {
     PandasWriter::type output_type = PandasWriter::OBJECT;
     const DataType& type = *arrays_[i]->type();
-    if (options_.extension_columns.count(fields_[i]->name())) {
+    if (options_.IsExtensionColumn(fields_[i]->name())) {
       output_type = PandasWriter::EXTENSION;
     } else {
       // Null count needed to determine output type
@@ -2516,10 +2516,10 @@ Status ConvertCategoricals(const PandasOptions& options, ChunkedArrayVector* arr
     return Status::OK();
   };
 
-  if (!options.categorical_columns.empty()) {
+  if (options.HasCategoricalColumns()) {
     for (int i = 0; i < static_cast<int>(arrays->size()); i++) {
       if ((*arrays)[i]->type()->id() != Type::DICTIONARY &&
-          options.categorical_columns.count((*fields)[i]->name())) {
+          options.IsCategoricalColumn((*fields)[i]->name())) {
         columns_to_encode.push_back(i);
       }
     }
@@ -2625,7 +2625,7 @@ Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr<Table>
 
   PandasOptions modified_options = options;
   modified_options.strings_to_categorical = false;
-  modified_options.categorical_columns.clear();
+  modified_options.categorical_columns.reset();
 
   if (options.split_blocks) {
     modified_options.allow_zero_copy_blocks = true;
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.h b/python/pyarrow/src/arrow/python/arrow_to_pandas.h
@@ -49,6 +49,22 @@ enum class MapConversionType {
 };
 
 struct PandasOptions {
+  bool HasCategoricalColumns() const {
+    return categorical_columns && !categorical_columns->empty();
+  }
+
+  bool IsCategoricalColumn(const std::string& name) const {
+    return categorical_columns && categorical_columns->count(name);
+  }
+
+  bool HasExtensionColumns() const {
+    return extension_columns && !extension_columns->empty();
+  }
+
+  bool IsExtensionColumn(const std::string& name) const {
+    return extension_columns && extension_columns->count(name);
+  }
+
   /// arrow::MemoryPool to use for memory allocations
   MemoryPool* pool = default_memory_pool();
 
@@ -112,11 +128,14 @@ struct PandasOptions {
   bool decode_dictionaries = false;
 
   // Columns that should be casted to categorical
-  std::unordered_set<std::string> categorical_columns;
+  //
+  // This is wrapped in a shared_ptr because this struct is copied internally for
+  // each column or nested field (see GH-47861).
+  std::shared_ptr<const std::unordered_set<std::string>> categorical_columns;
 
   // Columns that should be passed through to be converted to
   // ExtensionArray/Block
-  std::unordered_set<std::string> extension_columns;
+  std::shared_ptr<const std::unordered_set<std::string>> extension_columns;
 
   // Used internally to decipher between to_numpy() and to_pandas() when
   // the expected output differs
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
@@ -4083,10 +4083,11 @@ def table_to_blocks(options, Table table, categories, extension_columns):
         PandasOptions c_options = _convert_pandas_options(options)
 
     if categories is not None:
-        c_options.categorical_columns = {tobytes(cat) for cat in categories}
+        c_options.categorical_columns = make_shared[unordered_set[c_string]](
+            unordered_set[c_string]({tobytes(cat) for cat in categories}))
     if extension_columns is not None:
-        c_options.extension_columns = {tobytes(col)
-                                       for col in extension_columns}
+        c_options.extension_columns = make_shared[unordered_set[c_string]](
+            unordered_set[c_string]({tobytes(col) for col in extension_columns}))
 
     if pandas_api.is_v1():
         # ARROW-3789: Coerce date/timestamp types to datetime64[ns]