66#include < Common/Arena.h>
77#include < Common/PODArray.h>
88#include < Core/UUID.h>
9+ #include < Core/Settings.h>
910
1011#include < Formats/FormatFactory.h>
1112#include < Processors/Formats/IOutputFormat.h>
@@ -30,6 +31,12 @@ namespace ErrorCodes
3031 extern const int INCORRECT_DATA;
3132}
3233
34+ namespace Setting
35+ {
36+ extern const SettingsNonZeroUInt64 delta_lake_insert_max_rows_in_data_file;
37+ extern const SettingsNonZeroUInt64 delta_lake_insert_max_bytes_in_data_file;
38+ }
39+
3340namespace
3441{
3542 // / Given partition columns list,
@@ -74,6 +81,8 @@ DeltaLakePartitionedSink::DeltaLakePartitionedSink(
7481 , object_storage(object_storage_)
7582 , format_settings(format_settings_)
7683 , configuration(configuration_)
84+ , data_file_max_rows(context_->getSettingsRef ()[Setting::delta_lake_insert_max_rows_in_data_file])
85+ , data_file_max_bytes(context_->getSettingsRef ()[Setting::delta_lake_insert_max_bytes_in_data_file])
7786 , partition_strategy(createPartitionStrategy(partition_columns, getHeader(), context_))
7887 , delta_transaction(delta_transaction_)
7988{
@@ -131,7 +140,7 @@ void DeltaLakePartitionedSink::consume(Chunk & chunk)
131140 partition_index_to_chunk.emplace_back (Columns (), partition_column->size ());
132141 }
133142
134- for (size_t partition_index = 0 ; partition_index < partitions_size; ++partition_index)
143+ for (size_t partition_index = 0 ; partition_index < partitions_size; ++partition_index)
135144 {
136145 auto & partition_chunk = partition_index_to_chunk[partition_index];
137146 partition_chunk.addColumn (std::move (partition_index_to_column_split[partition_index]));
@@ -140,68 +149,86 @@ void DeltaLakePartitionedSink::consume(Chunk & chunk)
140149
141150 for (const auto & [partition_key, partition_index] : partition_id_to_chunk_index)
142151 {
143- auto partition_data = getPartitionDataForPartitionKey (partition_key);
152+ auto & data_files = getPartitionDataForPartitionKey (partition_key)-> data_files ;
144153 auto & partition_chunk = partition_index_to_chunk[partition_index];
145- partition_data->sink ->consume (partition_chunk);
146- partition_data->size += partition_chunk.bytes ();
154+
155+ if (data_files.empty ()
156+ || data_files.back ().written_rows >= data_file_max_rows
157+ || data_files.back ().written_bytes >= data_file_max_bytes)
158+ {
159+ data_files.emplace_back (createSinkForPartition (partition_key));
160+ total_data_files_count += 1 ;
161+ }
162+ auto & data_file = data_files.back ();
163+ data_file.written_bytes += partition_chunk.bytes ();
164+ data_file.written_rows += partition_chunk.getNumRows ();
165+ data_file.sink ->consume (partition_chunk);
147166 }
148167}
149168
150- DeltaLakePartitionedSink::PartitionDataPtr
169+ DeltaLakePartitionedSink::PartitionInfoPtr
151170DeltaLakePartitionedSink::getPartitionDataForPartitionKey (StringRef partition_key)
152171{
153- auto it = partition_id_to_sink.find (partition_key);
154- if (it == partition_id_to_sink.end ())
155- {
156- auto data = std::make_shared<PartitionData>();
157- auto data_prefix = std::filesystem::path (delta_transaction->getDataPath ()) / partition_key.toString ();
158- data->path = DeltaLake::generateWritePath (std::move (data_prefix), configuration->format );
159-
160- data->sink = std::make_shared<StorageObjectStorageSink>(
161- data->path ,
162- object_storage,
163- configuration,
164- format_settings,
165- std::make_shared<Block>(partition_strategy->getFormatHeader ()),
166- getContext ()
167- );
168- std::tie (it, std::ignore) = partition_id_to_sink.emplace (partition_key, std::move (data));
169- }
172+ auto it = partitions_data.find (partition_key);
173+ if (it == partitions_data.end ())
174+ std::tie (it, std::ignore) = partitions_data.emplace (partition_key, std::make_shared<PartitionInfo>(partition_key));
170175 return it->second ;
171176}
172177
178+ DeltaLakePartitionedSink::StorageSinkPtr
179+ DeltaLakePartitionedSink::createSinkForPartition (StringRef partition_key)
180+ {
181+ auto data_prefix = std::filesystem::path (delta_transaction->getDataPath ()) / partition_key.toString ();
182+ return std::make_unique<StorageObjectStorageSink>(
183+ DeltaLake::generateWritePath (std::move (data_prefix), configuration->format ),
184+ object_storage,
185+ configuration,
186+ format_settings,
187+ std::make_shared<Block>(partition_strategy->getFormatHeader ()),
188+ getContext ());
189+ }
190+
173191void DeltaLakePartitionedSink::onFinish ()
174192{
175- if (isCancelled () || partition_id_to_sink .empty ())
193+ if (isCancelled () || partitions_data .empty ())
176194 return ;
177195
178- for (auto & [_, data] : partition_id_to_sink)
179- data->sink ->onFinish ();
180-
181- LOG_TEST (log, " Written to {} sinks" , partition_id_to_sink.size ());
196+ std::vector<DeltaLake::WriteTransaction::CommitFile> files;
197+ files.reserve (total_data_files_count);
198+ const auto data_prefix = delta_transaction->getDataPath ();
182199
183- try
200+ for ( auto & [_, partition_info] : partitions_data)
184201 {
185- std::vector<DeltaLake::WriteTransaction::CommitFile> files;
186- files.reserve (partition_id_to_sink.size ());
187- const auto data_prefix = delta_transaction->getDataPath ();
188- for (auto & [_, data] : partition_id_to_sink)
202+ auto & [partition_key, data_files] = *partition_info;
203+ auto partition_key_str = partition_key.toString ();
204+ auto keys_and_values = HivePartitioningUtils::parseHivePartitioningKeysAndValues (partition_key_str);
205+ Map partition_values;
206+ partition_values.reserve (keys_and_values.size ());
207+ for (const auto & [key, value] : keys_and_values)
208+ partition_values.emplace_back (DB::Tuple ({key, value}));
209+
210+ for (const auto & [sink, written_bytes, written_rows] : data_files)
189211 {
190- auto keys_and_values = HivePartitioningUtils::parseHivePartitioningKeysAndValues (data->path );
191- Map partition_values;
192- partition_values.reserve (keys_and_values.size ());
193- for (const auto & [key, value] : keys_and_values)
194- partition_values.emplace_back (DB::Tuple ({key, value}));
195-
196- files.emplace_back (data->path .substr (data_prefix.size ()), data->size , partition_values);
212+ sink->onFinish ();
213+ files.emplace_back (
214+ sink->getPath ().substr (data_prefix.size ()),
215+ sink->getFileSize (),
216+ partition_values);
197217 }
218+ }
219+
220+ LOG_TEST (log, " Written {} data files" , total_data_files_count);
221+
222+ try
223+ {
198224 delta_transaction->commit (files);
199225 }
200226 catch (...)
201227 {
202- for (auto & [_, data ] : partition_id_to_sink )
228+ for (auto & [_, partition_info ] : partitions_data )
203229 {
204- object_storage->removeObjectIfExists (StoredObject (data->path ));
230+ for (const auto & [sink, written_bytes, written_rows] : partition_info->data_files )
231+ object_storage->removeObjectIfExists (StoredObject (sink->getPath ()));
205232 }
206233 throw ;
207234 }
0 commit comments