Skip to content

Commit 57f6261

Browse files
Pear0pitrou
andauthored
GH-47861: [Python] reduce memory usage when using to_pandas() with many extension arrays columns (#47860)
### Rationale for this change See GH-47861. With this change, the extension array variation takes ~192MB of memory instead of 7GB. From what I can tell, this is because the `PandasOptions` struct is copied around frequently (for example it seems like there is an `ExtensionWriter` for each extension column and each `ExtensionWriter` has a copy of `PandasOptions` which has a set of all extension columns). I haven't fully traced the PandasOptions structure, but it seems to get copied and modified in some codepaths so I have decided to put the column sets into a `std::shared_ptr` rather than pass around a `shared_ptr<PandasOptions>`. ### What changes are included in this PR? The `PandasOptions` column sets have been swapped from `std::unordered_set<std::string>` to `std::shared_ptr<const std::unordered_set<std::string>>` and usages have been updated. ### Are these changes tested? Yes, no regression in the pytests. Also tested memory usage by hand. ### Are there any user-facing changes? All changes are internal to the pyarrow C++ binding code. There are no changes to the exposed Python API. * GitHub Issue: #47861 Lead-authored-by: Will Gulian <[email protected]> Co-authored-by: Will Gulian <[email protected]> Co-authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent 88179b6 commit 57f6261

File tree

4 files changed

+33
-13
lines changed

4 files changed

+33
-13
lines changed

python/pyarrow/includes/libarrow_python.pxd

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,8 +198,8 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
198198
c_bool self_destruct
199199
MapConversionType maps_as_pydicts
200200
c_bool decode_dictionaries
201-
unordered_set[c_string] categorical_columns
202-
unordered_set[c_string] extension_columns
201+
shared_ptr[const unordered_set[c_string]] categorical_columns
202+
shared_ptr[const unordered_set[c_string]] extension_columns
203203
c_bool to_numpy
204204

205205

python/pyarrow/src/arrow/python/arrow_to_pandas.cc

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ PandasOptions MakeInnerOptions(PandasOptions options) {
7575
// Make sure conversion of inner dictionary arrays always returns an array,
7676
// not a dict {'indices': array, 'dictionary': array, 'ordered': bool}
7777
options.decode_dictionaries = true;
78-
options.categorical_columns.clear();
78+
options.categorical_columns.reset();
7979
options.strings_to_categorical = false;
8080

8181
// In ARROW-7723, we found as a result of ARROW-3789 that second
@@ -2337,7 +2337,7 @@ class ConsolidatedBlockCreator : public PandasBlockCreator {
23372337
}
23382338

23392339
Status GetBlockType(int column_index, PandasWriter::type* out) {
2340-
if (options_.extension_columns.count(fields_[column_index]->name())) {
2340+
if (options_.IsExtensionColumn(fields_[column_index]->name())) {
23412341
*out = PandasWriter::EXTENSION;
23422342
return Status::OK();
23432343
} else {
@@ -2458,7 +2458,7 @@ class SplitBlockCreator : public PandasBlockCreator {
24582458
Status GetWriter(int i, std::shared_ptr<PandasWriter>* writer) {
24592459
PandasWriter::type output_type = PandasWriter::OBJECT;
24602460
const DataType& type = *arrays_[i]->type();
2461-
if (options_.extension_columns.count(fields_[i]->name())) {
2461+
if (options_.IsExtensionColumn(fields_[i]->name())) {
24622462
output_type = PandasWriter::EXTENSION;
24632463
} else {
24642464
// Null count needed to determine output type
@@ -2516,10 +2516,10 @@ Status ConvertCategoricals(const PandasOptions& options, ChunkedArrayVector* arr
25162516
return Status::OK();
25172517
};
25182518

2519-
if (!options.categorical_columns.empty()) {
2519+
if (options.HasCategoricalColumns()) {
25202520
for (int i = 0; i < static_cast<int>(arrays->size()); i++) {
25212521
if ((*arrays)[i]->type()->id() != Type::DICTIONARY &&
2522-
options.categorical_columns.count((*fields)[i]->name())) {
2522+
options.IsCategoricalColumn((*fields)[i]->name())) {
25232523
columns_to_encode.push_back(i);
25242524
}
25252525
}
@@ -2625,7 +2625,7 @@ Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr<Table>
26252625

26262626
PandasOptions modified_options = options;
26272627
modified_options.strings_to_categorical = false;
2628-
modified_options.categorical_columns.clear();
2628+
modified_options.categorical_columns.reset();
26292629

26302630
if (options.split_blocks) {
26312631
modified_options.allow_zero_copy_blocks = true;

python/pyarrow/src/arrow/python/arrow_to_pandas.h

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,22 @@ enum class MapConversionType {
4949
};
5050

5151
struct PandasOptions {
52+
bool HasCategoricalColumns() const {
53+
return categorical_columns && !categorical_columns->empty();
54+
}
55+
56+
bool IsCategoricalColumn(const std::string& name) const {
57+
return categorical_columns && categorical_columns->count(name);
58+
}
59+
60+
bool HasExtensionColumns() const {
61+
return extension_columns && !extension_columns->empty();
62+
}
63+
64+
bool IsExtensionColumn(const std::string& name) const {
65+
return extension_columns && extension_columns->count(name);
66+
}
67+
5268
/// arrow::MemoryPool to use for memory allocations
5369
MemoryPool* pool = default_memory_pool();
5470

@@ -112,11 +128,14 @@ struct PandasOptions {
112128
bool decode_dictionaries = false;
113129

114130
// Columns that should be casted to categorical
115-
std::unordered_set<std::string> categorical_columns;
131+
//
132+
// This is wrapped in a shared_ptr because this struct is copied internally for
133+
// each column or nested field (see GH-47861).
134+
std::shared_ptr<const std::unordered_set<std::string>> categorical_columns;
116135

117136
// Columns that should be passed through to be converted to
118137
// ExtensionArray/Block
119-
std::unordered_set<std::string> extension_columns;
138+
std::shared_ptr<const std::unordered_set<std::string>> extension_columns;
120139

121140
// Used internally to decipher between to_numpy() and to_pandas() when
122141
// the expected output differs

python/pyarrow/table.pxi

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4083,10 +4083,11 @@ def table_to_blocks(options, Table table, categories, extension_columns):
40834083
PandasOptions c_options = _convert_pandas_options(options)
40844084

40854085
if categories is not None:
4086-
c_options.categorical_columns = {tobytes(cat) for cat in categories}
4086+
c_options.categorical_columns = make_shared[unordered_set[c_string]](
4087+
unordered_set[c_string]({tobytes(cat) for cat in categories}))
40874088
if extension_columns is not None:
4088-
c_options.extension_columns = {tobytes(col)
4089-
for col in extension_columns}
4089+
c_options.extension_columns = make_shared[unordered_set[c_string]](
4090+
unordered_set[c_string]({tobytes(col) for col in extension_columns}))
40904091

40914092
if pandas_api.is_v1():
40924093
# ARROW-3789: Coerce date/timestamp types to datetime64[ns]

0 commit comments

Comments
 (0)