Skip to content

Commit c1f266b

Browse files
committed
Revert "Merge pull request #1171 from Altinity/mkmkme/antalya-25.8-missing-field_id"
This reverts commit f5fb292, reversing changes made to 923825b.
1 parent c3c334f commit c1f266b

File tree

3 files changed

+4
-28
lines changed

3 files changed

+4
-28
lines changed

src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -133,12 +133,7 @@ std::string_view SchemaConverter::useColumnMapperIfNeeded(const parq::SchemaElem
133133
return element.name;
134134
const auto & map = column_mapper->getFieldIdToClickHouseName();
135135
if (!element.__isset.field_id)
136-
{
137-
/// Does iceberg require that parquet files have field ids?
138-
/// Our iceberg writer currently doesn't write them.
139-
//throw Exception(ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Missing field_id for column {}", element.name);
140-
return element.name;
141-
}
136+
throw Exception(ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Missing field_id for column {}", element.name);
142137
auto it = map.find(element.field_id);
143138
if (it == map.end())
144139
throw Exception(ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Parquet file has column {} with field_id {} that is not in datalake metadata", element.name, element.field_id);

src/Storages/ObjectStorage/StorageObjectStorage.cpp

Lines changed: 2 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -203,26 +203,7 @@ StorageObjectStorage::StorageObjectStorage(
203203
sample_path);
204204
}
205205

206-
/// TODO: Known problems with datalake prewhere:
207-
/// * If the iceberg table went through schema evolution, columns read from file may need to
208-
/// be renamed or typecast before applying prewhere. There's already a mechanism for
209-
/// telling parquet reader to rename columns: ColumnMapper. And parquet reader already
210-
/// automatically does type casts to requested types. But weirdly the iceberg reader uses
211-
/// those mechanism to request the *old* name and type of the column, then has additional
212-
/// code to do the renaming and casting as a separate step outside parquet reader.
213-
/// We should probably change this and delete that additional code?
214-
/// * Delta Lake can have "partition columns", which are columns with constant value specified
215-
/// in the metadata, not present in parquet file. Like hive partitioning, but in metadata
216-
/// files instead of path. Currently these columns are added to the block outside parquet
217-
/// reader. If they appear in prewhere expression, parquet reader gets a "no column in block"
218-
/// error. Unlike hive partitioning, we can't (?) just return these columns from
219-
/// supportedPrewhereColumns() because at the time of the call the delta lake metadata hasn't
220-
/// been read yet. So we should probably pass these columns to the parquet reader instead of
221-
/// adding them outside.
222-
/// * There's a bug in StorageObjectStorageSource::createReader: it makes a copy of
223-
/// FormatFilterInfo, but for some reason unsets prewhere_info and row_level_filter_info.
224-
/// There's probably no reason for this, and it should just copy those fields like the others.
225-
supports_prewhere = !configuration->isDataLakeConfiguration() && FormatFactory::instance().checkIfFormatSupportsPrewhere(configuration->getFormat(), context, format_settings);
206+
supports_prewhere = FormatFactory::instance().checkIfFormatSupportsPrewhere(configuration->getFormat(), context, format_settings);
226207

227208
StorageInMemoryMetadata metadata;
228209
metadata.setColumns(columns);
@@ -726,3 +707,4 @@ void StorageObjectStorage::checkAlterIsPossible(const AlterCommands & commands,
726707
}
727708

728709
}
710+

tests/integration/test_storage_iceberg/test.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -875,8 +875,7 @@ def test_position_deletes_out_of_order(started_cluster, use_roaring_bitmaps):
875875

876876
create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster, additional_settings=["input_format_parquet_use_native_reader_v3=1", f"use_roaring_bitmap_iceberg_positional_deletes={use_roaring_bitmaps}"])
877877

878-
# TODO: Replace WHERE with PREWHERE when we add prewhere support for datalakes.
879-
assert get_array(instance.query(f"SELECT id FROM {TABLE_NAME} WHERE NOT sleepEachRow(1/100) order by id")) == list(range(10, 103)) + [104]
878+
assert get_array(instance.query(f"SELECT id FROM {TABLE_NAME} PREWHERE NOT sleepEachRow(1/100) order by id")) == list(range(10, 103)) + [104]
880879

881880
instance.query(f"DROP TABLE {TABLE_NAME}")
882881

0 commit comments

Comments
 (0)