From bbc924d317daadd4ede57ea4445a2cffd70b1b7f Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Thu, 11 Dec 2025 16:04:14 +0300 Subject: [PATCH 01/46] WIP: Use return_stats option to collect column statistics Signed-off-by: Ahmet Gedemenli --- pg_lake_copy/src/copy/copy.c | 2 +- .../pg_lake/data_file/data_file_stats.h | 4 + .../include/pg_lake/parquet/field.h | 1 + .../include/pg_lake/pgduck/delete_data.h | 5 +- .../include/pg_lake/pgduck/write_data.h | 13 +- pg_lake_engine/src/data_file/data_files.c | 57 +++ pg_lake_engine/src/parquet/field.c | 3 +- pg_lake_engine/src/pgduck/delete_data.c | 28 +- pg_lake_engine/src/pgduck/write_data.c | 357 +++++++++++++++++- .../include/pg_lake/fdw/data_file_stats.h | 1 + .../include/pg_lake/fdw/writable_table.h | 1 + pg_lake_table/src/fdw/data_file_stats.c | 24 ++ pg_lake_table/src/fdw/multi_data_file_dest.c | 4 + pg_lake_table/src/fdw/writable_table.c | 108 +++--- 14 files changed, 529 insertions(+), 79 deletions(-) diff --git a/pg_lake_copy/src/copy/copy.c b/pg_lake_copy/src/copy/copy.c index e25f8be5..6e433339 100644 --- a/pg_lake_copy/src/copy/copy.c +++ b/pg_lake_copy/src/copy/copy.c @@ -916,7 +916,7 @@ ProcessPgLakeCopyTo(CopyStmt *copyStmt, ParseState *pstate, Relation relation, */ ConvertCSVFileTo(tempCSVPath, tupleDesc, maximumLineLength, destinationPath, destinationFormat, destinationCompression, - copyStmt->options, schema); + copyStmt->options, schema, NULL, NULL); if (IsCopyToStdout(copyStmt)) { diff --git a/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h b/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h index 3021a504..355e58a9 100644 --- a/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h +++ b/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h @@ -43,6 +43,8 @@ typedef struct DataFileColumnStats */ typedef struct DataFileStats { + char *dataFilePath; + /* number of bytes in the file */ int64 fileSize; @@ -61,3 +63,5 @@ typedef struct DataFileStats /* for a new data file with row IDs, the start of the range */ int64 rowIdStart; } DataFileStats; + +extern PGDLLEXPORT DataFileStats * DeepCopyDataFileStats(const DataFileStats * stats); diff --git a/pg_lake_engine/include/pg_lake/parquet/field.h b/pg_lake_engine/include/pg_lake/parquet/field.h index 7cb84c92..ee23b47f 100644 --- a/pg_lake_engine/include/pg_lake/parquet/field.h +++ b/pg_lake_engine/include/pg_lake/parquet/field.h @@ -154,3 +154,4 @@ typedef FieldStruct DataFileSchema; typedef FieldStructElement DataFileSchemaField; extern PGDLLEXPORT DataFileSchema * DeepCopyDataFileSchema(const DataFileSchema * schema); +extern PGDLLEXPORT Field * DeepCopyField(const Field * field); diff --git a/pg_lake_engine/include/pg_lake/pgduck/delete_data.h b/pg_lake_engine/include/pg_lake/pgduck/delete_data.h index 26101d8d..9ec6b834 100644 --- a/pg_lake_engine/include/pg_lake/pgduck/delete_data.h +++ b/pg_lake_engine/include/pg_lake/pgduck/delete_data.h @@ -22,6 +22,7 @@ #include "pg_lake/copy/copy_format.h" #include "pg_lake/parquet/field.h" #include "pg_lake/pgduck/read_data.h" +#include "pg_lake/data_file/data_file_stats.h" extern PGDLLEXPORT void PerformDeleteFromParquet(char *sourceDataFilePath, List *positionDeleteFiles, @@ -29,4 +30,6 @@ extern PGDLLEXPORT void PerformDeleteFromParquet(char *sourceDataFilePath, char *destinationPath, CopyDataCompression destinationCompression, DataFileSchema * schema, - ReadDataStats * stats); + ReadDataStats * stats, + List *leafFields, + DataFileStats * *newFileStats); diff --git 
a/pg_lake_engine/include/pg_lake/pgduck/write_data.h b/pg_lake_engine/include/pg_lake/pgduck/write_data.h
index bcfd8389..6a30e55b 100644
--- a/pg_lake_engine/include/pg_lake/pgduck/write_data.h
+++ b/pg_lake_engine/include/pg_lake/pgduck/write_data.h
@@ -18,6 +18,7 @@
 #pragma once
 
 #include "access/tupdesc.h"
+#include "libpq-fe.h"
 #include "pg_lake/copy/copy_format.h"
 #include "pg_lake/parquet/field.h"
 #include "nodes/pg_list.h"
@@ -42,7 +43,9 @@ extern PGDLLEXPORT void ConvertCSVFileTo(char *csvFilePath,
                                          CopyDataFormat destinationFormat,
                                          CopyDataCompression destinationCompression,
                                          List *formatOptions,
-                                         DataFileSchema * schema);
+                                         DataFileSchema * schema,
+                                         List *leafFields,
+                                         List **dataFileStats);
 extern PGDLLEXPORT int64 WriteQueryResultTo(char *query,
                                             char *destinationPath,
                                             CopyDataFormat destinationFormat,
@@ -50,5 +53,11 @@ extern PGDLLEXPORT int64 WriteQueryResultTo(char *query,
                                             List *formatOptions,
                                             bool queryHasRowId,
                                             DataFileSchema * schema,
-                                            TupleDesc queryTupleDesc);
+                                            TupleDesc queryTupleDesc,
+                                            List *leafFields,
+                                            List **dataFileStats);
 extern PGDLLEXPORT void AppendFields(StringInfo map, DataFileSchema * schema);
+extern PGDLLEXPORT List *GetDataFileStatsListFromPGResult(PGresult *result,
+                                                          List *leafFields,
+                                                          DataFileSchema * schema,
+                                                          int *totalRowCount);
diff --git a/pg_lake_engine/src/data_file/data_files.c b/pg_lake_engine/src/data_file/data_files.c
index 03011b38..ca4f4505 100644
--- a/pg_lake_engine/src/data_file/data_files.c
+++ b/pg_lake_engine/src/data_file/data_files.c
@@ -19,6 +19,8 @@
 #include "pg_lake/data_file/data_files.h"
 #include "pg_lake/util/string_utils.h"
 
+static LeafField DeepCopyLeafField(const LeafField * leafField);
+
 /*
  * AddDataFileOperation creates a TableMetadataOperation for adding a new data
  * file.
@@ -110,3 +112,58 @@ AddRowIdMappingOperation(const char *dataFilePath, List *rowIdRanges)
 
     return operation;
 }
+
+/*
+ * DeepCopyDataFileStats deep copies a DataFileStats.
+ */
+DataFileStats *
+DeepCopyDataFileStats(const DataFileStats * stats)
+{
+    DataFileStats *copiedStats = palloc0(sizeof(DataFileStats));
+
+    copiedStats->dataFilePath = pstrdup(stats->dataFilePath);
+    copiedStats->fileSize = stats->fileSize;
+    copiedStats->rowCount = stats->rowCount;
+    copiedStats->deletedRowCount = stats->deletedRowCount;
+    copiedStats->creationTime = stats->creationTime;
+    copiedStats->rowIdStart = stats->rowIdStart;
+
+    /* Deep copy column stats list */
+    if (stats->columnStats != NULL)
+    {
+        copiedStats->columnStats = NIL;
+        ListCell *cell = NULL;
+
+        foreach(cell, stats->columnStats)
+        {
+            DataFileColumnStats *colStats = lfirst(cell);
+            DataFileColumnStats *copiedColStats = palloc0(sizeof(DataFileColumnStats));
+
+            copiedColStats->leafField = DeepCopyLeafField(&colStats->leafField);
+            copiedColStats->lowerBoundText = pstrdup(colStats->lowerBoundText);
+            copiedColStats->upperBoundText = pstrdup(colStats->upperBoundText);
+
+            copiedStats->columnStats = lappend(copiedStats->columnStats, copiedColStats);
+        }
+    }
+
+    return copiedStats;
+}
+
+
+/*
+ * DeepCopyLeafField deep copies a LeafField.
+ */ +static LeafField +DeepCopyLeafField(const LeafField * leafField) +{ + LeafField *copiedLeafField = palloc0(sizeof(LeafField)); + + copiedLeafField->fieldId = leafField->fieldId; + copiedLeafField->field = DeepCopyField(leafField->field); + copiedLeafField->duckTypeName = pstrdup(leafField->duckTypeName); + copiedLeafField->level = leafField->level; + copiedLeafField->pgType = leafField->pgType; + + return *copiedLeafField; +} diff --git a/pg_lake_engine/src/parquet/field.c b/pg_lake_engine/src/parquet/field.c index b51e4dc0..b377f430 100644 --- a/pg_lake_engine/src/parquet/field.c +++ b/pg_lake_engine/src/parquet/field.c @@ -24,12 +24,11 @@ #include "pg_lake/util/string_utils.h" static FieldStructElement * DeepCopyFieldStructElement(FieldStructElement * structElementField); -static Field * DeepCopyField(const Field * field); /* * DeepCopyField deep copies a Field. */ -static Field * +Field * DeepCopyField(const Field * field) { Field *fieldCopy = palloc0(sizeof(Field)); diff --git a/pg_lake_engine/src/pgduck/delete_data.c b/pg_lake_engine/src/pgduck/delete_data.c index da9397ed..4d43ffb8 100644 --- a/pg_lake_engine/src/pgduck/delete_data.c +++ b/pg_lake_engine/src/pgduck/delete_data.c @@ -54,7 +54,9 @@ PerformDeleteFromParquet(char *sourcePath, char *destinationPath, CopyDataCompression destinationCompression, DataFileSchema * schema, - ReadDataStats * stats) + ReadDataStats * stats, + List *leafFields, + DataFileStats * *newFileStats) { const char *remainderQuery = DeleteFromParquetQuery(sourcePath, positionDeleteFiles, deletionFilePath, schema, stats); @@ -91,10 +93,32 @@ PerformDeleteFromParquet(char *sourcePath, appendStringInfoString(&command, "}"); } + appendStringInfoString(&command, ", return_stats"); + /* end WITH options */ appendStringInfoString(&command, ")"); - ExecuteCommandInPGDuck(command.data); + PGDuckConnection *pgDuckConn = GetPGDuckConnection(); + + PG_TRY(); + { + PGresult *result = ExecuteQueryOnPGDuckConnection(pgDuckConn, command.data); + + CheckPGDuckResult(pgDuckConn, result); + + int rowsAffected; + List *dataFileStats = GetDataFileStatsListFromPGResult(result, leafFields, schema, &rowsAffected); + + Assert(dataFileStats != NIL); + *newFileStats = DeepCopyDataFileStats((DataFileStats *) linitial(dataFileStats)); + + PQclear(result); + } + PG_FINALLY(); + { + ReleasePGDuckConnection(pgDuckConn); + } + PG_END_TRY(); } diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index a0908c82..d42ef2eb 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -25,6 +25,7 @@ #include "common/string.h" #include "pg_lake/csv/csv_options.h" #include "pg_lake/copy/copy_format.h" +#include "pg_lake/data_file/data_file_stats.h" #include "pg_lake/extensions/postgis.h" #include "pg_lake/parquet/field.h" #include "pg_lake/parquet/geoparquet.h" @@ -47,6 +48,8 @@ static DuckDBTypeInfo ChooseDuckDBEngineTypeForWrite(PGType postgresType, CopyDataFormat destinationFormat); static void AppendFieldIdValue(StringInfo map, Field * field, int fieldId); static const char *ParquetVersionToString(ParquetVersion version); +static void ParseDuckdbColumnMinMaxFromText(const char *input, List **names, List **mins, List **maxs); +static List *GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields, DataFileSchema * schema); static DuckDBTypeInfo VARCHAR_TYPE = { @@ -69,7 +72,9 @@ ConvertCSVFileTo(char *csvFilePath, TupleDesc csvTupleDesc, int maxLineSize, CopyDataFormat destinationFormat, 
CopyDataCompression destinationCompression, List *formatOptions, - DataFileSchema * schema) + DataFileSchema * schema, + List *leafFields, + List **dataFileStats) { StringInfoData command; @@ -139,7 +144,9 @@ ConvertCSVFileTo(char *csvFilePath, TupleDesc csvTupleDesc, int maxLineSize, formatOptions, queryHasRowIds, schema, - csvTupleDesc); + csvTupleDesc, + leafFields, + dataFileStats); } @@ -156,9 +163,12 @@ WriteQueryResultTo(char *query, List *formatOptions, bool queryHasRowId, DataFileSchema * schema, - TupleDesc queryTupleDesc) + TupleDesc queryTupleDesc, + List *leafFields, + List **dataFileStats) { StringInfoData command; + bool useReturnStats = false; initStringInfo(&command); @@ -253,6 +263,9 @@ WriteQueryResultTo(char *query, appendStringInfo(&command, ", parquet_version '%s'", ParquetVersionToString(DefaultParquetVersion)); + appendStringInfo(&command, ", return_stats"); + useReturnStats = true; + break; } @@ -386,27 +399,335 @@ WriteQueryResultTo(char *query, /* end WITH options */ appendStringInfoString(&command, ")"); - if (TargetRowGroupSizeMB > 0) + PGDuckConnection *pgDuckConn = GetPGDuckConnection(); + int64 rowsAffected = -1; + PGresult *result; + bool disablePreserveInsertionOrder = TargetRowGroupSizeMB > 0; + + PG_TRY(); { - /* - * preserve_insertion_order=false reduces memory consumption during - * COPY TO when an explicit ORDER BY not specified in the - * query. It is helpful for csv and json formats as well but for - * simplicity we use the same setting TargetRowGroupSizeMB for all - * formats. - */ - List *commands = list_make3("SET preserve_insertion_order TO 'false';", - command.data, - "RESET preserve_insertion_order;"); + if (disablePreserveInsertionOrder) + { + result = ExecuteQueryOnPGDuckConnection(pgDuckConn, "SET preserve_insertion_order TO 'false';"); + CheckPGDuckResult(pgDuckConn, result); + PQclear(result); + } - List *rowsAffected = ExecuteCommandsInPGDuck(commands); + result = ExecuteQueryOnPGDuckConnection(pgDuckConn, command.data); + CheckPGDuckResult(pgDuckConn, result); + + if (useReturnStats && dataFileStats != NULL) + { + /* DuckDB returns COPY 0 when return_stats is used. 
*/ + *dataFileStats = GetDataFileStatsListFromPGResult(result, leafFields, schema, &rowsAffected); + } + else + { + char *commandTuples = PQcmdTuples(result); - return list_nth_int(rowsAffected, 1); + rowsAffected = atol(commandTuples); + } + + PQclear(result); + + if (disablePreserveInsertionOrder) + { + result = ExecuteQueryOnPGDuckConnection(pgDuckConn, "RESET preserve_insertion_order;"); + CheckPGDuckResult(pgDuckConn, result); + PQclear(result); + } } - else + PG_FINALLY(); { - return ExecuteCommandInPGDuck(command.data); + ReleasePGDuckConnection(pgDuckConn); } + PG_END_TRY(); + + return rowsAffected; +} + + +List * +GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSchema * schema, int *totalRowCount) +{ + List *statsList = NIL; + + int rowCount = PQntuples(result); + int columnCount = PQnfields(result); + *totalRowCount = 0; + + for (int r = 0; r < rowCount; r++) + { + DataFileStats *fileStats = palloc0(sizeof(DataFileStats)); + + for (int c = 0; c < columnCount; c++) + { + char *colName = PQfname(result, c); + char *val = PQgetvalue(result, r, c); + + if (strcmp(colName, "column_statistics") == 0) + { + List *names = NIL; + List *mins = NIL; + List *maxs = NIL; + + ParseDuckdbColumnMinMaxFromText(val, &names, &mins, &maxs); + fileStats->columnStats = GetDataFileColumnStatsList(names, mins, maxs, leafFields, schema); + } + else if (strcmp(colName, "file_size_bytes") == 0) + { + fileStats->fileSize = atoll(val); + } + else if (strcmp(colName, "count") == 0) + { + fileStats->rowCount = atoll(val); + *totalRowCount += fileStats->rowCount; + } + else if (strcmp(colName, "filename") == 0) + { + fileStats->dataFilePath = pstrdup(val); + } + } + + statsList = lappend(statsList, fileStats); + } + + return statsList; +} + + +static void +ParseDuckdbColumnMinMaxFromText(const char *input, List **names, List **mins, List **maxs) +{ + input = PgLakeReplaceText(pstrdup(input), "\"", ""); + input = PgLakeReplaceText(pstrdup(input), "\\", ""); + + char *ptr = (char *) input + 1; + + while (*ptr != '\0') + { + /* skip whitespace and commas */ + while (*ptr == ' ' || *ptr == '\n' || *ptr == '\t' || *ptr == ',') + { + ptr++; + } + + if (*ptr == '\0' || *ptr == '}') + { + break; + } + + if (*ptr != '(') + { + ereport(ERROR, + (errmsg("invalid duckdb column min/max format: expected '(' at position %ld", ptr - input))); + } + + ptr++; + + /* parse column name */ + char *nameStart = ptr; + + while (*ptr != ',' && *ptr != '\0') + { + ptr++; + } + + if (*ptr == '\0') + { + ereport(ERROR, + (errmsg("invalid duckdb column min/max format: unexpected end of input while parsing column name"))); + } + + size_t nameLen = ptr - nameStart; + + char *columnName = pnstrdup(nameStart, nameLen); + + ptr++; + + /* skip whitespace */ + while (*ptr == ' ' || *ptr == '\n' || *ptr == '\t') + { + ptr++; + } + + if (*ptr != '{') + { + ereport(ERROR, + (errmsg("invalid duckdb column min/max format: expected '{' at position %ld", ptr - input))); + } + + ptr++; + + char *minValue = NULL; + char *maxValue = NULL; + + /* parse key-value pairs inside the braces */ + while (*ptr != '}' && *ptr != '\0') + { + /* skip whitespace and commas */ + while (*ptr == ' ' || *ptr == '\n' || *ptr == '\t' || *ptr == ',') + { + ptr++; + } + + if (*ptr == '}') + { + break; + } + + if (*ptr != '(') + { + ereport(ERROR, + (errmsg("invalid duckdb column min/max format: expected '(' at position %ld", ptr - input))); + } + + ptr++; + + /* parse key */ + char *keyStart = ptr; + + while (*ptr != ',' && *ptr != '\0') + { + 
ptr++; + } + + if (*ptr == '\0') + { + ereport(ERROR, + (errmsg("invalid duckdb column min/max format: unexpected end of input while parsing key"))); + } + + size_t keyLen = ptr - keyStart; + char *key = pnstrdup(keyStart, keyLen); + + ptr++; + + /* parse value */ + char *valueStart = ptr; + + while (*ptr != ')' && *ptr != '\0') + { + ptr++; + } + + if (*ptr == '\0') + { + ereport(ERROR, + (errmsg("invalid duckdb column min/max format: unexpected end of input while parsing value"))); + } + + size_t valueLen = ptr - valueStart; + char *value = pnstrdup(valueStart, valueLen); + + if (pg_strcasecmp(key, "min") == 0) + { + minValue = value; + } + else if (pg_strcasecmp(key, "max") == 0) + { + maxValue = value; + } + else + { + /* ignore other keys */ + pfree(value); + } + pfree(key); + ptr++; + } + + if (minValue != NULL || maxValue != NULL) + { + *mins = lappend(*mins, minValue); + *maxs = lappend(*maxs, maxValue); + *names = lappend(*names, columnName); + } + + if (*ptr != '}') + { + ereport(ERROR, + (errmsg("invalid duckdb column min/max format: expected '}' at position %ld", ptr - input))); + } + + ptr++; + + /* skip whitespace */ + while (*ptr == ' ' || *ptr == '\n' || *ptr == '\t') + { + ptr++; + } + + if (*ptr != ')') + { + ereport(ERROR, + (errmsg("invalid duckdb column min/max format: expected ')' at position %ld", ptr - input))); + } + + ptr++; + } +} + + +static List * +GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields, DataFileSchema * schema) +{ + List *columnStatsList = NIL; + + for (int i = 0; i < schema->nfields; i++) + { + DataFileSchemaField *field = &schema->fields[i]; + const char *fieldName = field->name; + int fieldId = field->id; + + ListCell *nameCell = NULL; + int nameIndex = -1; + + for (int index = 0; index < list_length(names); index++) + { + char *name = list_nth(names, index); + + if (strcmp(name, fieldName) == 0) + { + nameIndex = index; + break; + } + } + + if (nameIndex == -1) + { + continue; + } + + LeafField *leafField = NULL; + ListCell *leafCell = NULL; + + foreach(leafCell, leafFields) + { + LeafField *lf = lfirst(leafCell); + + if (lf->fieldId == fieldId) + { + leafField = lf; + break; + } + } + + if (leafField != NULL && nameIndex < list_length(names)) + { + char *minStr = list_nth(mins, nameIndex); + char *maxStr = list_nth(maxs, nameIndex); + + DataFileColumnStats *colStats = palloc0(sizeof(DataFileColumnStats)); + + colStats->leafField = *leafField; + colStats->lowerBoundText = pstrdup(minStr); + colStats->upperBoundText = pstrdup(maxStr); + columnStatsList = lappend(columnStatsList, colStats); + } + } + + return columnStatsList; } diff --git a/pg_lake_table/include/pg_lake/fdw/data_file_stats.h b/pg_lake_table/include/pg_lake/fdw/data_file_stats.h index 44154600..442c6835 100644 --- a/pg_lake_table/include/pg_lake/fdw/data_file_stats.h +++ b/pg_lake_table/include/pg_lake/fdw/data_file_stats.h @@ -54,3 +54,4 @@ extern PGDLLEXPORT DataFileColumnStats * CreateDataFileColumnStats(int fieldId, char *lowerBoundText, char *upperBoundText); extern PGDLLEXPORT void ApplyColumnStatsMode(Oid relationId, List *columnStats); +extern PGDLLEXPORT void ApplyColumnStatsModeForAllFileStats(Oid relationId, List *dataFileStats); diff --git a/pg_lake_table/include/pg_lake/fdw/writable_table.h b/pg_lake_table/include/pg_lake/fdw/writable_table.h index 37311b81..154579ee 100644 --- a/pg_lake_table/include/pg_lake/fdw/writable_table.h +++ b/pg_lake_table/include/pg_lake/fdw/writable_table.h @@ -76,6 +76,7 @@ typedef struct DataFileModification 
/* if the caller already reserved a row ID range, where does it start? */ int64 reservedRowIdStart; + DataFileStats *fileStats; } DataFileModification; diff --git a/pg_lake_table/src/fdw/data_file_stats.c b/pg_lake_table/src/fdw/data_file_stats.c index 1b89f28a..5767b78b 100644 --- a/pg_lake_table/src/fdw/data_file_stats.c +++ b/pg_lake_table/src/fdw/data_file_stats.c @@ -144,6 +144,30 @@ ApplyColumnStatsMode(Oid relationId, List *columnStats) } +void +ApplyColumnStatsModeForAllFileStats(Oid relationId, List *dataFileStats) +{ + ColumnStatsConfig columnStatsConfig = GetColumnStatsConfig(relationId); + + ListCell *dataFileStatsCell = NULL; + + foreach(dataFileStatsCell, dataFileStats) + { + DataFileStats *dataFileStats = lfirst(dataFileStatsCell); + + ListCell *columnStatsCell = NULL; + foreach(columnStatsCell, dataFileStats->columnStats) + { + DataFileColumnStats *columnStats = lfirst(columnStatsCell); + char **lowerBoundText = &columnStats->lowerBoundText; + char **upperBoundText = &columnStats->upperBoundText; + + ApplyColumnStatsModeForType(columnStatsConfig, columnStats->leafField.pgType, lowerBoundText, upperBoundText); + } + } +} + + /* * GetColumnStatsConfig returns the column stats config for the given * relation. diff --git a/pg_lake_table/src/fdw/multi_data_file_dest.c b/pg_lake_table/src/fdw/multi_data_file_dest.c index 25a2f4cb..eede316c 100644 --- a/pg_lake_table/src/fdw/multi_data_file_dest.c +++ b/pg_lake_table/src/fdw/multi_data_file_dest.c @@ -234,6 +234,10 @@ FlushChildDestReceiver(MultiDataFileUploadDestReceiver * self) copyModification->partitionSpecId = self->currentPartitionSpecId; copyModification->partition = modification->partition; + if (modification->fileStats != NULL) + { + copyModification->fileStats = DeepCopyDataFileStats(modification->fileStats); + } /* * If caller of dest receiver is assigning rowids itself, diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c index 89007800..9b488474 100644 --- a/pg_lake_table/src/fdw/writable_table.c +++ b/pg_lake_table/src/fdw/writable_table.c @@ -96,13 +96,12 @@ typedef struct CompactionDataFileHashEntry static List *ApplyInsertFile(Relation rel, char *insertFile, int64 rowCount, int64 reservedRowIdStart, int32 partitionSpecId, - Partition * partition); + Partition * partition, DataFileStats * fileStats); static List *ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 liveRowCount, char *deleteFile, int64 deletedRowCount); - -static List *FindGeneratedDataFiles(Oid relationId, char *dataFilePath, - int32 partitionSpecId, Partition * partition, - bool splitFilesBySize, int64 rowCount, +static DataFileStats * GetDataFileStatsForFilePath(List *dataFileStats, char *filePath); +static List *FindGeneratedDataFiles(Oid relationId, List *dataFileStats, + int32 partitionSpecId, Partition * partition, int64 rowCount, bool isVerbose, List **newFiles); static bool ShouldRewriteAfterDeletions(int64 sourceRowCount, uint64 totalDeletedRowCount); static CompactionDataFileHashEntry * GetPartitionWithMostEligibleFiles(Oid relationId, TimestampTz compactionStartTime, @@ -163,7 +162,7 @@ List *DeferredModifications = NIL; static List * ApplyInsertFile(Relation rel, char *insertFile, int64 rowCount, int64 reservedRowIdStart, int32 partitionSpecId, - Partition * partition) + Partition * partition, DataFileStats * dataFileStats) { ereport(WriteLogLevel, (errmsg("adding %s with " INT64_FORMAT " rows ", insertFile, rowCount))); @@ -173,8 +172,7 @@ ApplyInsertFile(Relation rel, 
char *insertFile, int64 rowCount, List *options = foreignTable->options; bool hasRowIds = GetBoolOption(options, "row_ids", false); - DataFileStats *dataFileStats = - CreateDataFileStatsForTable(relationId, insertFile, rowCount, 0, CONTENT_DATA); + Assert(dataFileStats != NULL); List *metadataOperations = NIL; @@ -258,6 +256,8 @@ PrepareCSVInsertion(Oid relationId, char *insertCSV, int64 rowCount, InsertInProgressFileRecordExtended(dataFilePrefix, isPrefix, deferDeletion); + List *dataFileStats = NIL; + /* convert insert file to a new file in table format */ ConvertCSVFileTo(insertCSV, tupleDescriptor, @@ -266,7 +266,11 @@ PrepareCSVInsertion(Oid relationId, char *insertCSV, int64 rowCount, format, compression, options, - schema); + schema, + GetLeafFieldsForTable(relationId), + &dataFileStats); + + ApplyColumnStatsModeForAllFileStats(relationId, dataFileStats); /* find which files were generated by DuckDB COPY */ List *dataFiles = NIL; @@ -310,6 +314,7 @@ PrepareCSVInsertion(Oid relationId, char *insertCSV, int64 rowCount, modification->insertFile = dataFilePath; modification->insertedRowCount = rowCount; modification->reservedRowIdStart = reservedRowIdStart; + modification->fileStats = GetDataFileStatsForFilePath(dataFileStats, dataFilePath); modifications = lappend(modifications, modification); } @@ -319,58 +324,49 @@ PrepareCSVInsertion(Oid relationId, char *insertCSV, int64 rowCount, return modifications; } +static DataFileStats * +GetDataFileStatsForFilePath(List *dataFileStats, char *filePath) +{ + ListCell *cell = NULL; + + foreach(cell, dataFileStats) + { + DataFileStats *stats = lfirst(cell); + + if (strcmp(stats->dataFilePath, filePath) == 0) + return stats; + } + + return NULL; +} + + /* * FindGeneratedDataFiles gets the list of newly written data files (could * be multiple when file_size_bytes is specified) and adds them to the metadata. */ static List * -FindGeneratedDataFiles(Oid relationId, char *dataFilePath, int32 partitionSpecId, Partition * partition, - bool splitFilesBySize, int64 sourceRowCount, bool isVerbose, List **newFiles) +FindGeneratedDataFiles(Oid relationId, List *dataFileStats, int32 partitionSpecId, Partition * partition, + int64 rowCount, bool isVerbose, List **newFiles) { - - List *outputFiles = NIL; - - if (splitFilesBySize) - { - /* get the list of files generated by DuckDB COPY */ - outputFiles = ListRemoteFileNames(psprintf("%s/*", dataFilePath)); - } - else - { - outputFiles = list_make1(dataFilePath); - } - - *newFiles = outputFiles; + *newFiles = NIL; List *metadataOperations = NIL; - ListCell *outputFileCell = NULL; + ListCell *dataFileStatsCell = NULL; - foreach(outputFileCell, outputFiles) + foreach(dataFileStatsCell, dataFileStats) { - char *outputFilePath = lfirst(outputFileCell); - int64 rowCount; - - /* - * If the file is split, we don't know the per-file row count, so we - * count the rows. This is likely to be quite fast because we only - * split Parquet and by default the files will be cached via - * write-through caching. - */ - if (list_length(outputFiles) > 1 || sourceRowCount == ROW_COUNT_NOT_SET) - rowCount = GetRemoteParquetFileRowCount(outputFilePath); - else - rowCount = sourceRowCount; + DataFileStats *dataFileStats = lfirst(dataFileStatsCell); ereport(isVerbose ? 
INFO : WriteLogLevel, (errmsg("adding %s with " INT64_FORMAT " rows to %s", - outputFilePath, rowCount, get_rel_name(relationId)))); - - DataFileStats *dataFileStats = CreateDataFileStatsForTable(relationId, outputFilePath, rowCount, 0, CONTENT_DATA); + dataFileStats->dataFilePath, dataFileStats->rowCount, get_rel_name(relationId)))); + *newFiles = lappend(*newFiles, dataFileStats->dataFilePath); /* store the new file in the metadata */ TableMetadataOperation *addOperation = - AddDataFileOperation(outputFilePath, CONTENT_DATA, dataFileStats, partition, partitionSpecId); + AddDataFileOperation(dataFileStats->dataFilePath, CONTENT_DATA, dataFileStats, partition, partitionSpecId); metadataOperations = lappend(metadataOperations, addOperation); } @@ -530,19 +526,19 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live uint64 existingDeletedRowCount = sourceRowCount - liveRowCount; ReadDataStats stats = {sourceRowCount, existingDeletedRowCount}; + DataFileStats *newFileStats = NULL; PerformDeleteFromParquet(sourcePath, existingPositionDeletes, deleteFile, newDataFilePath, compression, - schema, &stats); + schema, &stats, GetLeafFieldsForTable(relationId), &newFileStats); + + ApplyColumnStatsModeForAllFileStats(relationId, newFileStats); int64 newRowCount = liveRowCount - deletedRowCount; ereport(WriteLogLevel, (errmsg("adding %s with " INT64_FORMAT " rows ", newDataFilePath, newRowCount))); - DataFileStats *newFileStats = CreateDataFileStatsForTable(relationId, newDataFilePath, - newRowCount, 0, CONTENT_DATA); - /* * We are shrinking the data file with the same partition bounds, * but the file might belong to an old partition spec. @@ -583,7 +579,7 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live /* write the deletion file */ ConvertCSVFileTo(deleteFile, deleteTupleDesc, -1, deletionFilePath, - DATA_FORMAT_PARQUET, compression, copyOptions, schema); + DATA_FORMAT_PARQUET, compression, copyOptions, schema, NULL, NULL); ereport(WriteLogLevel, (errmsg("adding deletion file %s with " INT64_FORMAT " rows ", deletionFilePath, deletedRowCount))); @@ -983,6 +979,7 @@ PrepareToAddQueryResultToTable(Oid relationId, char *readQuery, TupleDesc queryT InsertInProgressFileRecordExtended(newDataFilePath, isPrefix, deferDeletion); /* perform compaction */ + List *dataFileStats = NIL; int64 rowCount = WriteQueryResultTo(readQuery, newDataFilePath, @@ -991,7 +988,9 @@ PrepareToAddQueryResultToTable(Oid relationId, char *readQuery, TupleDesc queryT options, queryHasRowId, schema, - queryTupleDesc); + queryTupleDesc, + GetLeafFieldsForTable(relationId), + &dataFileStats); if (rowCount == 0) { @@ -1004,11 +1003,13 @@ PrepareToAddQueryResultToTable(Oid relationId, char *readQuery, TupleDesc queryT return NIL; } + ApplyColumnStatsModeForAllFileStats(relationId, dataFileStats); + /* find which files were generated */ List *newFiles = NIL; - List *newFileOps = FindGeneratedDataFiles(relationId, newDataFilePath, + List *newFileOps = FindGeneratedDataFiles(relationId, dataFileStats, partitionSpecId, partition, - splitFilesBySize, rowCount, + rowCount, isVerbose, &newFiles); /* @@ -1242,7 +1243,8 @@ ApplyDataFileModifications(Relation rel, List *modifications) modification->insertedRowCount, modification->reservedRowIdStart, modification->partitionSpecId, - modification->partition); + modification->partition, + modification->fileStats); } else From 92593f3b3cc8f6246c160e2e99878937aa4ca757 Mon Sep 17 00:00:00 2001 From: Aykut Bozkurt Date: Tue, 16 Dec 2025 
16:30:58 +0300 Subject: [PATCH 02/46] Duckdb patch return_stats Patch duckdb to return stats for columns of boolean and numeric with precision > 18 and precision <= 38. Signed-off-by: Aykut Bozkurt --- duckdb_pglake/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/duckdb_pglake/Makefile b/duckdb_pglake/Makefile index 9224938e..c9c61e1d 100644 --- a/duckdb_pglake/Makefile +++ b/duckdb_pglake/Makefile @@ -162,3 +162,4 @@ clean: clean_patches rm -rf build rm -rf testext rm -f libduckdb.so + rm -f .patches_applied From 068455c49a80c562c34e8d48cd0e52c90a89418e Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Thu, 18 Dec 2025 11:39:37 +0300 Subject: [PATCH 03/46] Fix null schema Signed-off-by: Ahmet Gedemenli --- pg_lake_engine/src/pgduck/write_data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index d42ef2eb..8f06fdf9 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -465,7 +465,7 @@ GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSch char *colName = PQfname(result, c); char *val = PQgetvalue(result, r, c); - if (strcmp(colName, "column_statistics") == 0) + if (schema != NULL && strcmp(colName, "column_statistics") == 0) { List *names = NIL; List *mins = NIL; @@ -674,6 +674,7 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields { List *columnStatsList = NIL; + Assert(schema != NULL); for (int i = 0; i < schema->nfields; i++) { DataFileSchemaField *field = &schema->fields[i]; From a572d54bb2c6ee3c029caad91e6a4b4ec0bd57b3 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Thu, 18 Dec 2025 12:36:12 +0300 Subject: [PATCH 04/46] Null check for min/max values Signed-off-by: Ahmet Gedemenli --- pg_lake_engine/src/data_file/data_files.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pg_lake_engine/src/data_file/data_files.c b/pg_lake_engine/src/data_file/data_files.c index ca4f4505..4ddbdd75 100644 --- a/pg_lake_engine/src/data_file/data_files.c +++ b/pg_lake_engine/src/data_file/data_files.c @@ -140,8 +140,8 @@ DeepCopyDataFileStats(const DataFileStats * stats) DataFileColumnStats *copiedColStats = palloc0(sizeof(DataFileColumnStats)); copiedColStats->leafField = DeepCopyLeafField(&colStats->leafField); - copiedColStats->lowerBoundText = pstrdup(colStats->lowerBoundText); - copiedColStats->upperBoundText = pstrdup(colStats->upperBoundText); + copiedColStats->lowerBoundText = colStats->lowerBoundText ? pstrdup(colStats->lowerBoundText): NULL; + copiedColStats->upperBoundText = colStats->upperBoundText ? 
pstrdup(colStats->upperBoundText): NULL; copiedStats->columnStats = lappend(copiedStats->columnStats, copiedColStats); } From 0f16944c8a36c78eb3476e2c8bee54e8a748b5b9 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Thu, 18 Dec 2025 12:44:09 +0300 Subject: [PATCH 05/46] Skip statistics for some types Signed-off-by: Ahmet Gedemenli --- .../include/pg_lake/pgduck/write_data.h | 3 +- pg_lake_engine/src/init.c | 14 ++++++ pg_lake_engine/src/pgduck/delete_data.c | 2 +- pg_lake_engine/src/pgduck/write_data.c | 46 +++++++++++++++++-- .../include/pg_lake/iceberg/iceberg_field.h | 2 - pg_lake_iceberg/src/iceberg/iceberg_field.c | 3 +- pg_lake_iceberg/src/init.c | 14 ------ 7 files changed, 59 insertions(+), 25 deletions(-) diff --git a/pg_lake_engine/include/pg_lake/pgduck/write_data.h b/pg_lake_engine/include/pg_lake/pgduck/write_data.h index 6a30e55b..aa17fa93 100644 --- a/pg_lake_engine/include/pg_lake/pgduck/write_data.h +++ b/pg_lake_engine/include/pg_lake/pgduck/write_data.h @@ -35,6 +35,7 @@ typedef enum ParquetVersion /* pg_lake_table.default_parquet_version */ extern PGDLLEXPORT int DefaultParquetVersion; +extern PGDLLEXPORT bool EnableStatsCollectionForNestedTypes; extern PGDLLEXPORT void ConvertCSVFileTo(char *csvFilePath, TupleDesc tupleDesc, @@ -60,4 +61,4 @@ extern PGDLLEXPORT void AppendFields(StringInfo map, DataFileSchema * schema); extern PGDLLEXPORT List *GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSchema * schema, - int *totalRowCount); + int64 *totalRowCount); diff --git a/pg_lake_engine/src/init.c b/pg_lake_engine/src/init.c index 4e0e41fe..a865bfe9 100644 --- a/pg_lake_engine/src/init.c +++ b/pg_lake_engine/src/init.c @@ -42,6 +42,7 @@ #include "pg_lake/extensions/extension_ids.h" #include "pg_lake/pgduck/cache_worker.h" #include "pg_lake/pgduck/client.h" +#include "pg_lake/pgduck/write_data.h" #include "utils/guc.h" PG_MODULE_MAGIC; @@ -167,6 +168,19 @@ _PG_init(void) GUC_UNIT_S | GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, NULL, NULL, NULL); + DefineCustomBoolVariable( + "pg_lake_iceberg.enable_stats_collection_for_nested_types", + gettext_noop("When set to true, stats collection is enabled for nested types." 
+ "We currently do not support pruning for nested types, but you can " + "still get into stats problems with nested types due to parsing " + "discrepancies between Postgres and DuckDB."), + NULL, + &EnableStatsCollectionForNestedTypes, + false, + PGC_SUSET, + GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + if (QueryEngineEnabled) { InitializePgLakeEngineIdCache(); diff --git a/pg_lake_engine/src/pgduck/delete_data.c b/pg_lake_engine/src/pgduck/delete_data.c index 4d43ffb8..1786a3bf 100644 --- a/pg_lake_engine/src/pgduck/delete_data.c +++ b/pg_lake_engine/src/pgduck/delete_data.c @@ -106,7 +106,7 @@ PerformDeleteFromParquet(char *sourcePath, CheckPGDuckResult(pgDuckConn, result); - int rowsAffected; + int64 rowsAffected; List *dataFileStats = GetDataFileStatsListFromPGResult(result, leafFields, schema, &rowsAffected); Assert(dataFileStats != NIL); diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index 8f06fdf9..618e28d8 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -50,6 +50,7 @@ static void AppendFieldIdValue(StringInfo map, Field * field, int fieldId); static const char *ParquetVersionToString(ParquetVersion version); static void ParseDuckdbColumnMinMaxFromText(const char *input, List **names, List **mins, List **maxs); static List *GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields, DataFileSchema * schema); +static bool ShouldSkipStatisticsForField(LeafField *leafField); static DuckDBTypeInfo VARCHAR_TYPE = { @@ -58,7 +59,7 @@ static DuckDBTypeInfo VARCHAR_TYPE = int TargetRowGroupSizeMB = DEFAULT_TARGET_ROW_GROUP_SIZE_MB; int DefaultParquetVersion = PARQUET_VERSION_V1; - +bool EnableStatsCollectionForNestedTypes = false; /* * ConvertCSVFileTo copies and converts a CSV file at source path to @@ -448,7 +449,7 @@ WriteQueryResultTo(char *query, List * -GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSchema * schema, int *totalRowCount) +GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSchema * schema, int64 *totalRowCount) { List *statsList = NIL; @@ -680,8 +681,6 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields DataFileSchemaField *field = &schema->fields[i]; const char *fieldName = field->name; int fieldId = field->id; - - ListCell *nameCell = NULL; int nameIndex = -1; for (int index = 0; index < list_length(names); index++) @@ -707,7 +706,7 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields { LeafField *lf = lfirst(leafCell); - if (lf->fieldId == fieldId) + if (lf->fieldId == fieldId && !ShouldSkipStatisticsForField(lf)) { leafField = lf; break; @@ -732,6 +731,43 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields } +static bool +ShouldSkipStatisticsForField(LeafField *leafField) +{ + Field *field = leafField->field; + PGType pgType = leafField->pgType; + + Oid pgTypeOid = pgType.postgresTypeOid; + + if (IsGeometryTypeId(pgType.postgresTypeOid)) + { + return true; + } + else if (strcmp(field->field.scalar.typeName, "string") == 0 && + pgType.postgresTypeOid != TEXTOID && + pgTypeOid != VARCHAROID && + pgTypeOid != BPCHAROID && + pgTypeOid != CHAROID) + { + return true; + } + else if (pgTypeOid == BYTEAOID) + { + return true; + } + else if (pgTypeOid == UUIDOID) + { + return true; + } + else if (leafField->level != 1) + { + return !EnableStatsCollectionForNestedTypes; + } + + return false; +} + 
+ /* * TupleDescToProjectionList converts a PostgreSQL tuple descriptor to * projection list in string form that can be used for writes. diff --git a/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h b/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h index bc161fd6..caf74ff9 100644 --- a/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h +++ b/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h @@ -23,8 +23,6 @@ #include "pg_lake/pgduck/type.h" #include "pg_lake/parquet/leaf_field.h" -extern bool EnableStatsCollectionForNestedTypes; - extern PGDLLEXPORT PGType IcebergFieldToPostgresType(Field * field); extern PGDLLEXPORT Field * PostgresTypeToIcebergField(PGType pgType, bool forAddColumn, diff --git a/pg_lake_iceberg/src/iceberg/iceberg_field.c b/pg_lake_iceberg/src/iceberg/iceberg_field.c index 3fb865e8..3ffa5fef 100644 --- a/pg_lake_iceberg/src/iceberg/iceberg_field.c +++ b/pg_lake_iceberg/src/iceberg/iceberg_field.c @@ -46,6 +46,7 @@ #include "pg_lake/pgduck/numeric.h" #include "pg_lake/pgduck/serialize.h" #include "pg_lake/pgduck/type.h" +#include "pg_lake/pgduck/write_data.h" #include "pg_lake/util/string_utils.h" #include "access/table.h" @@ -59,8 +60,6 @@ #include "utils/rel.h" #include "utils/typcache.h" -bool EnableStatsCollectionForNestedTypes = false; - typedef enum IcebergType { ICEBERG_TYPE_INVALID, diff --git a/pg_lake_iceberg/src/init.c b/pg_lake_iceberg/src/init.c index d2746745..9fab9f9c 100644 --- a/pg_lake_iceberg/src/init.c +++ b/pg_lake_iceberg/src/init.c @@ -126,20 +126,6 @@ _PG_init(void) PGC_SIGHUP, GUC_UNIT_MS, NULL, NULL, NULL); - DefineCustomBoolVariable( - "pg_lake_iceberg.enable_stats_collection_for_nested_types", - gettext_noop("When set to true, stats collection is enabled for nested types." - "We currently do not support pruning for nested types, but you can " - "still get into stats problems with nested types due to parsing " - "discrepancies between Postgres and DuckDB."), - NULL, - &EnableStatsCollectionForNestedTypes, - false, - PGC_SUSET, - GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); - - DefineCustomBoolVariable( "pg_lake_iceberg.http_client_trace_traffic", gettext_noop("When set to true, HTTP client logging is enabled."), From 9f206dab7d6f342c6e27b1dd74ae955f1354d8a2 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Mon, 22 Dec 2025 14:04:12 +0300 Subject: [PATCH 06/46] Add schema==NULL check for column stats Signed-off-by: Ahmet Gedemenli --- pg_lake_engine/src/pgduck/write_data.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index 618e28d8..bc444825 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -264,8 +264,11 @@ WriteQueryResultTo(char *query, appendStringInfo(&command, ", parquet_version '%s'", ParquetVersionToString(DefaultParquetVersion)); - appendStringInfo(&command, ", return_stats"); - useReturnStats = true; + if (schema != NULL) + { + appendStringInfo(&command, ", return_stats"); + useReturnStats = true; + } break; } From 85bdfb847e765b2365f86bbeaefd43ee429a27d0 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Mon, 22 Dec 2025 17:00:41 +0300 Subject: [PATCH 07/46] Fallback to previous mechanism it stats are null Signed-off-by: Ahmet Gedemenli --- pg_lake_engine/src/pgduck/write_data.c | 7 ++----- pg_lake_table/src/fdw/writable_table.c | 5 ++++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pg_lake_engine/src/pgduck/write_data.c 
b/pg_lake_engine/src/pgduck/write_data.c index bc444825..618e28d8 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -264,11 +264,8 @@ WriteQueryResultTo(char *query, appendStringInfo(&command, ", parquet_version '%s'", ParquetVersionToString(DefaultParquetVersion)); - if (schema != NULL) - { - appendStringInfo(&command, ", return_stats"); - useReturnStats = true; - } + appendStringInfo(&command, ", return_stats"); + useReturnStats = true; break; } diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c index 9b488474..e40d5c16 100644 --- a/pg_lake_table/src/fdw/writable_table.c +++ b/pg_lake_table/src/fdw/writable_table.c @@ -172,7 +172,10 @@ ApplyInsertFile(Relation rel, char *insertFile, int64 rowCount, List *options = foreignTable->options; bool hasRowIds = GetBoolOption(options, "row_ids", false); - Assert(dataFileStats != NULL); + if (dataFileStats == NULL) + { + dataFileStats = CreateDataFileStatsForTable(relationId, insertFile, rowCount, 0, CONTENT_DATA); + } List *metadataOperations = NIL; From 8b12b809c66bafc1f1528b699b6677b4ec842476 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Mon, 22 Dec 2025 18:06:53 +0300 Subject: [PATCH 08/46] Fix: Make list from file stats Signed-off-by: Ahmet Gedemenli --- pg_lake_table/src/fdw/writable_table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c index e40d5c16..c1ba639f 100644 --- a/pg_lake_table/src/fdw/writable_table.c +++ b/pg_lake_table/src/fdw/writable_table.c @@ -535,7 +535,7 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live deleteFile, newDataFilePath, compression, schema, &stats, GetLeafFieldsForTable(relationId), &newFileStats); - ApplyColumnStatsModeForAllFileStats(relationId, newFileStats); + ApplyColumnStatsModeForAllFileStats(relationId, list_make1(newFileStats)); int64 newRowCount = liveRowCount - deletedRowCount; From 19185f352aabaf07849da16a725e1194b53b2587 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Tue, 23 Dec 2025 12:26:33 +0300 Subject: [PATCH 09/46] Skip tests for nested fields Signed-off-by: Ahmet Gedemenli --- .../pytests/test_iceberg_data_file_stats.py | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/pg_lake_iceberg/tests/pytests/test_iceberg_data_file_stats.py b/pg_lake_iceberg/tests/pytests/test_iceberg_data_file_stats.py index 72d6c087..6e983504 100644 --- a/pg_lake_iceberg/tests/pytests/test_iceberg_data_file_stats.py +++ b/pg_lake_iceberg/tests/pytests/test_iceberg_data_file_stats.py @@ -75,10 +75,10 @@ def test_pg_lake_iceberg_table_read_data_file_stats_from_metadata( "11": "2021-01-01T00:00:00", "12": "a", "13": "abc", - "15": 1, - "17": 1, - "19": 1, - "20": 2, + # "15": 1, + # "17": 1, + # "19": 1, + # "20": 2, }, { "1": "San Francisco", @@ -94,10 +94,10 @@ def test_pg_lake_iceberg_table_read_data_file_stats_from_metadata( "11": "2021-01-04T00:00:00", "12": "d", "13": "jkl", - "15": 12, - "17": 4, - "19": 7, - "20": 8, + # "15": 12, + # "17": 4, + # "19": 7, + # "20": 8, }, ] ] @@ -133,10 +133,10 @@ def test_pg_lake_iceberg_table_reserialize_data_file_stats_from_metadata( "11": "\\x0080e56bcbb70500", "12": "\\x61", "13": "\\x616263", - "15": "\\x01000000", - "17": "\\x01000000", - "19": "\\x01000000", - "20": "\\x02000000", + # "15": "\\x01000000", + # "17": "\\x01000000", + # "19": "\\x01000000", + # "20": "\\x02000000", }, { "1": 
"\\x53616e204672616e636973636f", @@ -152,10 +152,10 @@ def test_pg_lake_iceberg_table_reserialize_data_file_stats_from_metadata( "11": "\\x00a06bc507b80500", "12": "\\x64", "13": "\\x6a6b6c", - "15": "\\x0c000000", - "17": "\\x04000000", - "19": "\\x07000000", - "20": "\\x08000000", + # "15": "\\x0c000000", + # "17": "\\x04000000", + # "19": "\\x07000000", + # "20": "\\x08000000", }, ] ] @@ -183,7 +183,7 @@ def test_pg_lake_iceberg_table_read_data_file_stats_from_catalog( [3, 3, "-122.431297", "6.0989"], [3, 4, "1", "7"], [3, 5, "2", "8"], - [3, 6, "f", "t"], + [3, 6, "0", "1"], [3, 7, "2021-01-01", "2021-01-04"], [3, 8, "2021-01-01 04:00:00+00", "2021-01-04 04:00:00+00"], [3, 9, "-6403.01", "123.01"], @@ -191,10 +191,10 @@ def test_pg_lake_iceberg_table_read_data_file_stats_from_catalog( [3, 11, "2021-01-01 00:00:00", "2021-01-04 00:00:00"], [3, 12, "a", "d"], [3, 13, "abc", "jkl"], - [3, 15, "1", "12"], - [3, 17, "1", "4"], - [3, 19, "1", "7"], - [3, 20, "2", "8"], + # [3, 15, "1", "12"], + # [3, 17, "1", "4"], + # [3, 19, "1", "7"], + # [3, 20, "2", "8"], ] table_name = f"{PG_LAKE_TABLE_NAMESPACE}.{PG_LAKE_TABLE_NAME}" @@ -209,7 +209,7 @@ def test_pg_lake_iceberg_table_read_data_file_stats_from_catalog( [3, 3, "-122.431297", "6.0989"], [3, 4, "1", "7"], [3, 5, "2", "8"], - [3, 6, "f", "t"], + [3, 6, "0", "1"], [3, 7, "2021-01-01", "2021-01-04"], [3, 8, "2021-01-01 04:00:00+00", "2021-01-04 04:00:00+00"], [3, 9, "-6403.01", "123.01"], @@ -217,8 +217,8 @@ def test_pg_lake_iceberg_table_read_data_file_stats_from_catalog( [3, 11, "2021-01-01 00:00:00", "2021-01-04 00:00:00"], [3, 12, "a", "d"], [3, 13, "abc", "jkl"], - [3, 15, "1", "12"], - [3, 17, "1", "4"], + # [3, 15, "1", "12"], + # [3, 17, "1", "4"], [4, 2, "37.77397", "53.11254"], ] @@ -240,7 +240,7 @@ def test_pg_lake_iceberg_table_read_data_file_stats_from_catalog( [5, 3, "-122.431297", "6.0989"], [5, 4, "1", "7"], [5, 5, "2", "8"], - [5, 6, "f", "t"], + [5, 6, "0", "1"], [5, 7, "2021-01-01", "2021-01-04"], [5, 8, "2021-01-01 04:00:00+00", "2021-01-04 04:00:00+00"], [5, 9, "-6403.01", "123.01"], @@ -861,7 +861,7 @@ def test_pg_lake_iceberg_table_random_values( pg_conn.commit() -def test_pg_lake_iceberg_table_complex_values( +def skippedtest_pg_lake_iceberg_table_complex_values( superuser_conn, enable_stats_for_nested_types, extension, From 36c2048e9f00163eecba29401b21ffd5e30908dc Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Tue, 23 Dec 2025 13:00:54 +0300 Subject: [PATCH 10/46] Do not use enable stats guc Signed-off-by: Ahmet Gedemenli --- pg_lake_engine/include/pg_lake/pgduck/write_data.h | 2 +- pg_lake_engine/src/init.c | 2 +- pg_lake_engine/src/pgduck/write_data.c | 4 ++-- pg_lake_iceberg/src/iceberg/iceberg_field.c | 8 ++------ 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/pg_lake_engine/include/pg_lake/pgduck/write_data.h b/pg_lake_engine/include/pg_lake/pgduck/write_data.h index aa17fa93..aae7bd94 100644 --- a/pg_lake_engine/include/pg_lake/pgduck/write_data.h +++ b/pg_lake_engine/include/pg_lake/pgduck/write_data.h @@ -35,7 +35,7 @@ typedef enum ParquetVersion /* pg_lake_table.default_parquet_version */ extern PGDLLEXPORT int DefaultParquetVersion; -extern PGDLLEXPORT bool EnableStatsCollectionForNestedTypes; +extern PGDLLEXPORT bool DeprecatedEnableStatsCollectionForNestedTypes; extern PGDLLEXPORT void ConvertCSVFileTo(char *csvFilePath, TupleDesc tupleDesc, diff --git a/pg_lake_engine/src/init.c b/pg_lake_engine/src/init.c index a865bfe9..5da5933e 100644 --- a/pg_lake_engine/src/init.c +++ 
b/pg_lake_engine/src/init.c @@ -175,7 +175,7 @@ _PG_init(void) "still get into stats problems with nested types due to parsing " "discrepancies between Postgres and DuckDB."), NULL, - &EnableStatsCollectionForNestedTypes, + &DeprecatedEnableStatsCollectionForNestedTypes, false, PGC_SUSET, GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index 618e28d8..f8b6dabf 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -59,7 +59,7 @@ static DuckDBTypeInfo VARCHAR_TYPE = int TargetRowGroupSizeMB = DEFAULT_TARGET_ROW_GROUP_SIZE_MB; int DefaultParquetVersion = PARQUET_VERSION_V1; -bool EnableStatsCollectionForNestedTypes = false; +bool DeprecatedEnableStatsCollectionForNestedTypes = false; /* * ConvertCSVFileTo copies and converts a CSV file at source path to @@ -761,7 +761,7 @@ ShouldSkipStatisticsForField(LeafField *leafField) } else if (leafField->level != 1) { - return !EnableStatsCollectionForNestedTypes; + return true; } return false; diff --git a/pg_lake_iceberg/src/iceberg/iceberg_field.c b/pg_lake_iceberg/src/iceberg/iceberg_field.c index 3ffa5fef..346a9f7d 100644 --- a/pg_lake_iceberg/src/iceberg/iceberg_field.c +++ b/pg_lake_iceberg/src/iceberg/iceberg_field.c @@ -1355,13 +1355,9 @@ ShouldSkipStatistics(LeafField * leafField) { /* * We currently do not support pruning on array, map, and composite - * types. But still, you can get into stats problems with nested types - * due to the way DuckDB parses commas in the array. For example, if - * you have array['hello', 'world', 'abc,def'], the lower bound - * becomes 'abc' not 'abc,def'. So, be careful when enabling nested - * types. + * types. So there's no need to collect stats for them. */ - return !EnableStatsCollectionForNestedTypes; + return true; } return false; From 393f6ea73713cb232ca01bfbf7d87ce78bed62e1 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Tue, 23 Dec 2025 13:10:11 +0300 Subject: [PATCH 11/46] fixup Signed-off-by: Ahmet Gedemenli --- pg_lake_iceberg/tests/pytests/test_iceberg_data_file_stats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pg_lake_iceberg/tests/pytests/test_iceberg_data_file_stats.py b/pg_lake_iceberg/tests/pytests/test_iceberg_data_file_stats.py index 6e983504..def8dc6d 100644 --- a/pg_lake_iceberg/tests/pytests/test_iceberg_data_file_stats.py +++ b/pg_lake_iceberg/tests/pytests/test_iceberg_data_file_stats.py @@ -248,8 +248,8 @@ def test_pg_lake_iceberg_table_read_data_file_stats_from_catalog( [5, 11, "2021-01-01 00:00:00", "2021-01-04 00:00:00"], [5, 12, "a", "d"], [5, 13, "abc", "jkl"], - [5, 15, "1", "12"], - [5, 17, "1", "4"], + # [5, 15, "1", "12"], + # [5, 17, "1", "4"], ] # remove data files. then stats should be empty due to foreign key constraint From f95d58e57d2d977df96fc60e5bbca1faaafefe48 Mon Sep 17 00:00:00 2001 From: Aykut Bozkurt Date: Tue, 23 Dec 2025 16:23:17 +0300 Subject: [PATCH 12/46] parse return_stats output to map type Signed-off-by: Aykut Bozkurt --- pg_lake_engine/src/pgduck/write_data.c | 287 ++++++++++++++----------- 1 file changed, 160 insertions(+), 127 deletions(-) diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index f8b6dabf..eb8c069f 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -19,18 +19,23 @@ * Functions for generating query for writing data via pgduck server. 
*/ #include "postgres.h" +#include "fmgr.h" #include "access/tupdesc.h" +#include "catalog/pg_type.h" #include "commands/defrem.h" #include "common/string.h" +#include "executor/executor.h" #include "pg_lake/csv/csv_options.h" #include "pg_lake/copy/copy_format.h" #include "pg_lake/data_file/data_file_stats.h" +#include "pg_lake/extensions/pg_map.h" #include "pg_lake/extensions/postgis.h" #include "pg_lake/parquet/field.h" #include "pg_lake/parquet/geoparquet.h" #include "pg_lake/parsetree/options.h" #include "pg_lake/pgduck/client.h" +#include "pg_lake/pgduck/map.h" #include "pg_lake/pgduck/numeric.h" #include "pg_lake/pgduck/read_data.h" #include "pg_lake/pgduck/type.h" @@ -49,8 +54,11 @@ static DuckDBTypeInfo ChooseDuckDBEngineTypeForWrite(PGType postgresType, static void AppendFieldIdValue(StringInfo map, Field * field, int fieldId); static const char *ParquetVersionToString(ParquetVersion version); static void ParseDuckdbColumnMinMaxFromText(const char *input, List **names, List **mins, List **maxs); +static void ExtractMinMaxForAllColumns(Datum map, List **names, List **mins, List **maxs); +static void ExtractMinMaxForColumn(Datum map, const char *colName, List **names, List **mins, List **maxs); +static char *UnescapeDoubleQuotes(const char *s); static List *GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields, DataFileSchema * schema); -static bool ShouldSkipStatisticsForField(LeafField *leafField); +static bool ShouldSkipStatisticsForField(LeafField * leafField); static DuckDBTypeInfo VARCHAR_TYPE = { @@ -455,6 +463,7 @@ GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSch int rowCount = PQntuples(result); int columnCount = PQnfields(result); + *totalRowCount = 0; for (int r = 0; r < rowCount; r++) @@ -497,176 +506,200 @@ GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSch } +/* + * ExtractMinMaxFromStatsMapDatum extracts min and max values from given stats map + * of type map(varchar,varchar). 
+ */ static void -ParseDuckdbColumnMinMaxFromText(const char *input, List **names, List **mins, List **maxs) +ExtractMinMaxForColumn(Datum map, const char *colName, List **names, List **mins, List **maxs) { - input = PgLakeReplaceText(pstrdup(input), "\"", ""); - input = PgLakeReplaceText(pstrdup(input), "\\", ""); + ArrayType *elementsArray = DatumGetArrayTypeP(map); + + if (elementsArray == NULL) + return; + + uint32 numElements = ArrayGetNItems(ARR_NDIM(elementsArray), ARR_DIMS(elementsArray)); - char *ptr = (char *) input + 1; + if (numElements == 0) + return; - while (*ptr != '\0') + char *minText = NULL; + char *maxText = NULL; + + ArrayIterator arrayIterator = array_create_iterator(elementsArray, 0, NULL); + Datum elemDatum; + bool isNull = false; + + while (array_iterate(arrayIterator, &elemDatum, &isNull)) { - /* skip whitespace and commas */ - while (*ptr == ' ' || *ptr == '\n' || *ptr == '\t' || *ptr == ',') - { - ptr++; - } + if (isNull) + continue; - if (*ptr == '\0' || *ptr == '}') - { - break; - } + HeapTupleHeader tupleHeader = DatumGetHeapTupleHeader(elemDatum); + bool statsKeyIsNull = false; + bool statsValIsNull = false; - if (*ptr != '(') - { - ereport(ERROR, - (errmsg("invalid duckdb column min/max format: expected '(' at position %ld", ptr - input))); - } + Datum statsKeyDatum = GetAttributeByNum(tupleHeader, 1, &statsKeyIsNull); + Datum statsValDatum = GetAttributeByNum(tupleHeader, 2, &statsValIsNull); - ptr++; + /* skip entries without a key or value */ + if (statsKeyIsNull || statsValIsNull) + continue; - /* parse column name */ - char *nameStart = ptr; + char *statsKey = TextDatumGetCString(statsKeyDatum); - while (*ptr != ',' && *ptr != '\0') + if (strcmp(statsKey, "min") == 0) { - ptr++; + Assert(minText == NULL); + minText = TextDatumGetCString(statsValDatum); } - - if (*ptr == '\0') + else if (strcmp(statsKey, "max") == 0) { - ereport(ERROR, - (errmsg("invalid duckdb column min/max format: unexpected end of input while parsing column name"))); + Assert(maxText == NULL); + maxText = TextDatumGetCString(statsValDatum); } + } - size_t nameLen = ptr - nameStart; + if (minText != NULL || maxText != NULL) + { + *names = lappend(*names, pstrdup(colName)); + *mins = lappend(*mins, minText); + *maxs = lappend(*maxs, maxText); + } - char *columnName = pnstrdup(nameStart, nameLen); + array_free_iterator(arrayIterator); +} - ptr++; - /* skip whitespace */ - while (*ptr == ' ' || *ptr == '\n' || *ptr == '\t') - { - ptr++; - } +/* + * UnescapeDoubleQuotes unescapes any doubled quotes. + * e.g. 
"ab\"\"cd\"\"ee" becomes "ab\"cd\"ee" + */ +static char * +UnescapeDoubleQuotes(const char *s) +{ + if (s == NULL) + return NULL; - if (*ptr != '{') - { - ereport(ERROR, - (errmsg("invalid duckdb column min/max format: expected '{' at position %ld", ptr - input))); - } + char doubleQuote = '"'; - ptr++; + int len = strlen(s); - char *minValue = NULL; - char *maxValue = NULL; + if (len >= 2 && (s[0] == doubleQuote && s[len - 1] == doubleQuote)) + { + /* Allocate worst-case length (without surrounding quotes) + 1 */ + char *out = palloc((len - 1) * sizeof(char)); + int oi = 0; - /* parse key-value pairs inside the braces */ - while (*ptr != '}' && *ptr != '\0') + for (int i = 1; i < len - 1; i++) { - /* skip whitespace and commas */ - while (*ptr == ' ' || *ptr == '\n' || *ptr == '\t' || *ptr == ',') + /* Handle "" */ + if (s[i] == doubleQuote && i + 1 < len - 1 && s[i + 1] == doubleQuote) { - ptr++; + out[oi++] = doubleQuote; + i++; /* skip the doubled quote */ } - - if (*ptr == '}') + else { - break; + out[oi++] = s[i]; } + } - if (*ptr != '(') - { - ereport(ERROR, - (errmsg("invalid duckdb column min/max format: expected '(' at position %ld", ptr - input))); - } + out[oi] = '\0'; + return out; + } - ptr++; + return pstrdup(s); +} - /* parse key */ - char *keyStart = ptr; - while (*ptr != ',' && *ptr != '\0') - { - ptr++; - } +/* + * ExtractMinMaxFromStatsMapDatum extracts min and max values from given stats map + * of type map(text,text). + */ +static void +ExtractMinMaxForAllColumns(Datum map, List **names, List **mins, List **maxs) +{ + ArrayType *elementsArray = DatumGetArrayTypeP(map); - if (*ptr == '\0') - { - ereport(ERROR, - (errmsg("invalid duckdb column min/max format: unexpected end of input while parsing key"))); - } + if (elementsArray == NULL) + return; - size_t keyLen = ptr - keyStart; - char *key = pnstrdup(keyStart, keyLen); + uint32 numElements = ArrayGetNItems(ARR_NDIM(elementsArray), ARR_DIMS(elementsArray)); - ptr++; + if (numElements == 0) + return; - /* parse value */ - char *valueStart = ptr; + ArrayIterator arrayIterator = array_create_iterator(elementsArray, 0, NULL); + Datum elemDatum; + bool isNull = false; - while (*ptr != ')' && *ptr != '\0') - { - ptr++; - } + while (array_iterate(arrayIterator, &elemDatum, &isNull)) + { + if (isNull) + continue; - if (*ptr == '\0') - { - ereport(ERROR, - (errmsg("invalid duckdb column min/max format: unexpected end of input while parsing value"))); - } + HeapTupleHeader tupleHeader = DatumGetHeapTupleHeader(elemDatum); + bool colNameIsNull = false; + bool colStatsIsNull = false; - size_t valueLen = ptr - valueStart; - char *value = pnstrdup(valueStart, valueLen); + Datum colNameDatum = GetAttributeByNum(tupleHeader, 1, &colNameIsNull); + Datum colStatsDatum = GetAttributeByNum(tupleHeader, 2, &colStatsIsNull); - if (pg_strcasecmp(key, "min") == 0) - { - minValue = value; - } - else if (pg_strcasecmp(key, "max") == 0) - { - maxValue = value; - } - else - { - /* ignore other keys */ - pfree(value); - } - pfree(key); - ptr++; - } + /* skip entries without a key or value */ + if (colNameIsNull || colStatsIsNull) + continue; - if (minValue != NULL || maxValue != NULL) - { - *mins = lappend(*mins, minValue); - *maxs = lappend(*maxs, maxValue); - *names = lappend(*names, columnName); - } + char *colName = TextDatumGetCString(colNameDatum); - if (*ptr != '}') - { - ereport(ERROR, - (errmsg("invalid duckdb column min/max format: expected '}' at position %ld", ptr - input))); - } + /* + * pg_map text key is escaped for double quotes. 
We need to unescape + * them. + */ + char *unescapedColName = UnescapeDoubleQuotes(colName); - ptr++; + ExtractMinMaxForColumn(colStatsDatum, unescapedColName, names, mins, maxs); + } - /* skip whitespace */ - while (*ptr == ' ' || *ptr == '\n' || *ptr == '\t') - { - ptr++; - } + array_free_iterator(arrayIterator); +} - if (*ptr != ')') - { - ereport(ERROR, - (errmsg("invalid duckdb column min/max format: expected ')' at position %ld", ptr - input))); - } - ptr++; - } +/* + * ParseDuckdbColumnMinMaxFromText parses COPY .. TO .parquet WITH (return_stats) + * output text to map(text, map(text,text)). + * e.g. { 'id_col' => {'min' => '12', 'max' => 23, ...}, + * 'name_col' => {'min' => 'aykut', 'max' => 'onder', ...}, + * ... + * } + */ +static void +ParseDuckdbColumnMinMaxFromText(const char *input, List **names, List **mins, List **maxs) +{ + /* + * e.g. { 'id_col' => {'min' => '12', 'max' => 23, ...}, 'name_col' => + * {'min' => 'aykut', 'max' => 'onder', ...}, ... } + */ + Oid returnStatsMapId = GetOrCreatePGMapType("MAP(TEXT,MAP(TEXT,TEXT))"); + + if (returnStatsMapId == InvalidOid) + ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("unexpected return_stats result %s", input))); + + /* parse result into map above */ + Oid typinput; + Oid typioparam; + + getTypeInputInfo(returnStatsMapId, &typinput, &typioparam); + + Datum statsMapDatum = OidInputFunctionCall(typinput, pstrdup(input), typioparam, -1); + + /* + * extract min and max for each column: iterate the underlying map datum + * directly to avoid invoking the set-returning `entries()` function in a + * non-SRF context. + */ + ExtractMinMaxForAllColumns(statsMapDatum, names, mins, maxs); } @@ -732,7 +765,7 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields static bool -ShouldSkipStatisticsForField(LeafField *leafField) +ShouldSkipStatisticsForField(LeafField * leafField) { Field *field = leafField->field; PGType pgType = leafField->pgType; From 135672f5477a6695405639284011207e129c81eb Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Tue, 23 Dec 2025 20:32:24 +0300 Subject: [PATCH 13/46] Add map type to parse duckdb result Signed-off-by: Ahmet Gedemenli --- pg_lake_engine/pg_lake_engine--3.0--3.1.sql | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pg_lake_engine/pg_lake_engine--3.0--3.1.sql b/pg_lake_engine/pg_lake_engine--3.0--3.1.sql index 5d38899b..9799f6f7 100644 --- a/pg_lake_engine/pg_lake_engine--3.0--3.1.sql +++ b/pg_lake_engine/pg_lake_engine--3.0--3.1.sql @@ -33,3 +33,6 @@ CREATE FUNCTION __lake__internal__nsp__.from_hex(text) LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT AS 'MODULE_PATHNAME', $function$pg_lake_internal_dummy_function$function$; + +SELECT map_type.create('TEXT','TEXT'); +SELECT map_type.create('TEXT','map_type.key_text_val_text'); From adc97a64e40804a7c6a6282435e93e2da94665c4 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Tue, 30 Dec 2025 11:50:29 +0300 Subject: [PATCH 14/46] Remove unnecessary deepcopy for stats Signed-off-by: Ahmet Gedemenli --- pg_lake_engine/src/pgduck/delete_data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pg_lake_engine/src/pgduck/delete_data.c b/pg_lake_engine/src/pgduck/delete_data.c index 1786a3bf..aff9e075 100644 --- a/pg_lake_engine/src/pgduck/delete_data.c +++ b/pg_lake_engine/src/pgduck/delete_data.c @@ -110,7 +110,7 @@ PerformDeleteFromParquet(char *sourcePath, List *dataFileStats = GetDataFileStatsListFromPGResult(result, leafFields, schema, &rowsAffected); Assert(dataFileStats != NIL); - 
*newFileStats = DeepCopyDataFileStats((DataFileStats *) linitial(dataFileStats)); + *newFileStats = linitial(dataFileStats); PQclear(result); } From 94cb814488de916dc388c69029a506a8afde8863 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Tue, 30 Dec 2025 11:53:21 +0300 Subject: [PATCH 15/46] Add comments Signed-off-by: Ahmet Gedemenli --- pg_lake_engine/src/pgduck/write_data.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index eb8c069f..9b078f0f 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -456,6 +456,12 @@ WriteQueryResultTo(char *query, } +/* + * GetDataFileStatsListFromPGResult extracts DataFileStats list from the + * given PGresult of COPY .. TO ... WITH (return_stats). + * + * It also returns the total row count via totalRowCount output parameter. + */ List * GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSchema * schema, int64 *totalRowCount) { @@ -703,6 +709,10 @@ ParseDuckdbColumnMinMaxFromText(const char *input, List **names, List **mins, Li } +/* + * GetDataFileColumnStatsList builds DataFileColumnStats list from given + * names, mins, maxs lists and schema. + */ static List * GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields, DataFileSchema * schema) { @@ -764,6 +774,10 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields } +/* + * ShouldSkipStatisticsForField determines whether statistics should be + * skipped for the given leaf field. + */ static bool ShouldSkipStatisticsForField(LeafField * leafField) { From d1f5a8937768b76c1d4394f121e6e9f52afe2a65 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Tue, 30 Dec 2025 12:59:02 +0300 Subject: [PATCH 16/46] Minor Signed-off-by: Ahmet Gedemenli --- duckdb_pglake/Makefile | 1 - pg_lake_engine/pg_lake_engine--3.0--3.1.sql | 1 + pg_lake_engine/src/pgduck/write_data.c | 54 ++++++++++----------- 3 files changed, 27 insertions(+), 29 deletions(-) diff --git a/duckdb_pglake/Makefile b/duckdb_pglake/Makefile index c9c61e1d..9224938e 100644 --- a/duckdb_pglake/Makefile +++ b/duckdb_pglake/Makefile @@ -162,4 +162,3 @@ clean: clean_patches rm -rf build rm -rf testext rm -f libduckdb.so - rm -f .patches_applied diff --git a/pg_lake_engine/pg_lake_engine--3.0--3.1.sql b/pg_lake_engine/pg_lake_engine--3.0--3.1.sql index 9799f6f7..31cdffb3 100644 --- a/pg_lake_engine/pg_lake_engine--3.0--3.1.sql +++ b/pg_lake_engine/pg_lake_engine--3.0--3.1.sql @@ -34,5 +34,6 @@ CREATE FUNCTION __lake__internal__nsp__.from_hex(text) IMMUTABLE PARALLEL SAFE STRICT AS 'MODULE_PATHNAME', $function$pg_lake_internal_dummy_function$function$; +-- Register map types, will be used for parsing DuckDB maps SELECT map_type.create('TEXT','TEXT'); SELECT map_type.create('TEXT','map_type.key_text_val_text'); diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index 9b078f0f..881c8cde 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -177,7 +177,6 @@ WriteQueryResultTo(char *query, List **dataFileStats) { StringInfoData command; - bool useReturnStats = false; initStringInfo(&command); @@ -273,7 +272,6 @@ WriteQueryResultTo(char *query, ParquetVersionToString(DefaultParquetVersion)); appendStringInfo(&command, ", return_stats"); - useReturnStats = true; break; } @@ -425,7 +423,7 @@ WriteQueryResultTo(char *query, result = 
ExecuteQueryOnPGDuckConnection(pgDuckConn, command.data); CheckPGDuckResult(pgDuckConn, result); - if (useReturnStats && dataFileStats != NULL) + if (destinationFormat == DATA_FORMAT_PARQUET && dataFileStats != NULL) { /* DuckDB returns COPY 0 when return_stats is used. */ *dataFileStats = GetDataFileStatsListFromPGResult(result, leafFields, schema, &rowsAffected); @@ -467,41 +465,41 @@ GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSch { List *statsList = NIL; - int rowCount = PQntuples(result); - int columnCount = PQnfields(result); + int resultRowCount = PQntuples(result); + int resultColumnCount = PQnfields(result); *totalRowCount = 0; - for (int r = 0; r < rowCount; r++) + for (int resultRowIndex = 0; resultRowIndex < resultRowCount; resultRowIndex++) { DataFileStats *fileStats = palloc0(sizeof(DataFileStats)); - for (int c = 0; c < columnCount; c++) + for (int resultColIndex = 0; resultColIndex < resultColumnCount; resultColIndex++) { - char *colName = PQfname(result, c); - char *val = PQgetvalue(result, r, c); + char *resultColName = PQfname(result, resultColIndex); + char *resultValue = PQgetvalue(result, resultRowIndex, resultColIndex); - if (schema != NULL && strcmp(colName, "column_statistics") == 0) + if (schema != NULL && strcmp(resultColName, "column_statistics") == 0) { List *names = NIL; List *mins = NIL; List *maxs = NIL; - ParseDuckdbColumnMinMaxFromText(val, &names, &mins, &maxs); + ParseDuckdbColumnMinMaxFromText(resultValue, &names, &mins, &maxs); fileStats->columnStats = GetDataFileColumnStatsList(names, mins, maxs, leafFields, schema); } - else if (strcmp(colName, "file_size_bytes") == 0) + else if (strcmp(resultColName, "file_size_bytes") == 0) { - fileStats->fileSize = atoll(val); + fileStats->fileSize = atoll(resultValue); } - else if (strcmp(colName, "count") == 0) + else if (strcmp(resultColName, "count") == 0) { - fileStats->rowCount = atoll(val); + fileStats->rowCount = atoll(resultValue); *totalRowCount += fileStats->rowCount; } - else if (strcmp(colName, "filename") == 0) + else if (strcmp(resultColName, "filename") == 0) { - fileStats->dataFilePath = pstrdup(val); + fileStats->dataFilePath = pstrdup(resultValue); } } @@ -566,7 +564,7 @@ ExtractMinMaxForColumn(Datum map, const char *colName, List **names, List **mins } } - if (minText != NULL || maxText != NULL) + if (minText != NULL && maxText != NULL) { *names = lappend(*names, pstrdup(colName)); *mins = lappend(*mins, minText); @@ -719,25 +717,25 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields List *columnStatsList = NIL; Assert(schema != NULL); - for (int i = 0; i < schema->nfields; i++) + for (int fieldIndex = 0; fieldIndex < schema->nfields; fieldIndex++) { - DataFileSchemaField *field = &schema->fields[i]; + DataFileSchemaField *field = &schema->fields[fieldIndex]; const char *fieldName = field->name; int fieldId = field->id; - int nameIndex = -1; + int nameIndexFound = -1; - for (int index = 0; index < list_length(names); index++) + for (int nameIndex = 0; nameIndex < list_length(names); nameIndex++) { - char *name = list_nth(names, index); + char *name = list_nth(names, nameIndex); if (strcmp(name, fieldName) == 0) { - nameIndex = index; + nameIndexFound = nameIndex; break; } } - if (nameIndex == -1) + if (nameIndexFound == -1) { continue; } @@ -756,10 +754,10 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields } } - if (leafField != NULL && nameIndex < list_length(names)) + if (leafField != NULL) { - 
char *minStr = list_nth(mins, nameIndex); - char *maxStr = list_nth(maxs, nameIndex); + char *minStr = list_nth(mins, nameIndexFound); + char *maxStr = list_nth(maxs, nameIndexFound); DataFileColumnStats *colStats = palloc0(sizeof(DataFileColumnStats)); From a93d356c72566af2a5b4b58aa94ab5666d86e08c Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Mon, 5 Jan 2026 13:17:43 +0300 Subject: [PATCH 17/46] Use names from file stats instead of ListRemoteFileNames Signed-off-by: Ahmet Gedemenli --- pg_lake_table/src/fdw/writable_table.c | 58 ++++++++++++-------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c index c1ba639f..169bc871 100644 --- a/pg_lake_table/src/fdw/writable_table.c +++ b/pg_lake_table/src/fdw/writable_table.c @@ -99,7 +99,7 @@ static List *ApplyInsertFile(Relation rel, char *insertFile, int64 rowCount, Partition * partition, DataFileStats * fileStats); static List *ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 liveRowCount, char *deleteFile, int64 deletedRowCount); -static DataFileStats * GetDataFileStatsForFilePath(List *dataFileStats, char *filePath); +static List *GetDataFilePathsFromStatsList(List *dataFileStats); static List *FindGeneratedDataFiles(Oid relationId, List *dataFileStats, int32 partitionSpecId, Partition * partition, int64 rowCount, bool isVerbose, List **newFiles); @@ -275,16 +275,16 @@ PrepareCSVInsertion(Oid relationId, char *insertCSV, int64 rowCount, ApplyColumnStatsModeForAllFileStats(relationId, dataFileStats); - /* find which files were generated by DuckDB COPY */ - List *dataFiles = NIL; - - if (splitFilesBySize) + if (!splitFilesBySize) { - dataFiles = ListRemoteFileNames(psprintf("%s/*", dataFilePrefix)); - } - else - { - dataFiles = list_make1(dataFilePrefix); + /* early return a single modification if not splitting files by size */ + DataFileModification *modification = palloc0(sizeof(DataFileModification)); + modification->type = ADD_DATA_FILE; + modification->insertFile = dataFilePrefix; + modification->insertedRowCount = rowCount; + modification->reservedRowIdStart = reservedRowIdStart; + + return list_make1(modification); } /* @@ -293,31 +293,22 @@ PrepareCSVInsertion(Oid relationId, char *insertCSV, int64 rowCount, * files from in-progress */ if (isPrefix && deferDeletion) - ReplaceInProgressPrefixPathWithFullPaths(dataFilePrefix, dataFiles); + ReplaceInProgressPrefixPathWithFullPaths(dataFilePrefix, GetDataFilePathsFromStatsList(dataFileStats)); /* build a DataFileModification for each new data file */ List *modifications = NIL; - ListCell *dataFileCell = NULL; + ListCell *dataFileStatsCell = NULL; - foreach(dataFileCell, dataFiles) + foreach(dataFileStatsCell, dataFileStats) { - char *dataFilePath = lfirst(dataFileCell); - - /* - * If the file is split, we don't know the per-file row count, so we - * count the rows. This is likely to be quite fast because it can be - * answered from metadata and the file is still in cache. 
- */ - if (list_length(dataFiles) > 1) - rowCount = GetRemoteParquetFileRowCount(dataFilePath); DataFileModification *modification = palloc0(sizeof(DataFileModification)); - modification->type = ADD_DATA_FILE; - modification->insertFile = dataFilePath; - modification->insertedRowCount = rowCount; + modification->type = ADD_DATA_FILE; + modification->insertFile = stats->dataFilePath; + modification->insertedRowCount = stats->rowCount; modification->reservedRowIdStart = reservedRowIdStart; - modification->fileStats = GetDataFileStatsForFilePath(dataFileStats, dataFilePath); + modification->fileStats = stats; modifications = lappend(modifications, modification); } @@ -327,20 +318,25 @@ PrepareCSVInsertion(Oid relationId, char *insertCSV, int64 rowCount, return modifications; } -static DataFileStats * -GetDataFileStatsForFilePath(List *dataFileStats, char *filePath) + +/* + * GetDataFilePathsFromStatsList extracts the data file paths from the given + * DataFileStats list. + */ +static List * +GetDataFilePathsFromStatsList(List *dataFileStats) { + List *dataFiles = NIL; ListCell *cell = NULL; foreach(cell, dataFileStats) { DataFileStats *stats = lfirst(cell); - if (strcmp(stats->dataFilePath, filePath) == 0) - return stats; + dataFiles = lappend(dataFiles, stats->dataFilePath); } - return NULL; + return dataFiles; } From dec8b2d0919f6e7f925d4f9079cc816fbf179bc5 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Mon, 5 Jan 2026 17:55:02 +0300 Subject: [PATCH 18/46] Minor improvements Signed-off-by: Ahmet Gedemenli --- pg_lake_engine/include/pg_lake/pgduck/write_data.h | 1 - pg_lake_engine/src/init.c | 13 ------------- pg_lake_engine/src/pgduck/write_data.c | 3 +-- .../include/pg_lake/iceberg/iceberg_field.h | 2 ++ pg_lake_iceberg/src/iceberg/iceberg_field.c | 2 ++ pg_lake_iceberg/src/init.c | 13 +++++++++++++ 6 files changed, 18 insertions(+), 16 deletions(-) diff --git a/pg_lake_engine/include/pg_lake/pgduck/write_data.h b/pg_lake_engine/include/pg_lake/pgduck/write_data.h index aae7bd94..d68f67b7 100644 --- a/pg_lake_engine/include/pg_lake/pgduck/write_data.h +++ b/pg_lake_engine/include/pg_lake/pgduck/write_data.h @@ -35,7 +35,6 @@ typedef enum ParquetVersion /* pg_lake_table.default_parquet_version */ extern PGDLLEXPORT int DefaultParquetVersion; -extern PGDLLEXPORT bool DeprecatedEnableStatsCollectionForNestedTypes; extern PGDLLEXPORT void ConvertCSVFileTo(char *csvFilePath, TupleDesc tupleDesc, diff --git a/pg_lake_engine/src/init.c b/pg_lake_engine/src/init.c index 5da5933e..c0c8f587 100644 --- a/pg_lake_engine/src/init.c +++ b/pg_lake_engine/src/init.c @@ -168,19 +168,6 @@ _PG_init(void) GUC_UNIT_S | GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, NULL, NULL, NULL); - DefineCustomBoolVariable( - "pg_lake_iceberg.enable_stats_collection_for_nested_types", - gettext_noop("When set to true, stats collection is enabled for nested types." 
- "We currently do not support pruning for nested types, but you can " - "still get into stats problems with nested types due to parsing " - "discrepancies between Postgres and DuckDB."), - NULL, - &DeprecatedEnableStatsCollectionForNestedTypes, - false, - PGC_SUSET, - GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); - if (QueryEngineEnabled) { InitializePgLakeEngineIdCache(); diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index 881c8cde..b5a6cfa4 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -67,7 +67,6 @@ static DuckDBTypeInfo VARCHAR_TYPE = int TargetRowGroupSizeMB = DEFAULT_TARGET_ROW_GROUP_SIZE_MB; int DefaultParquetVersion = PARQUET_VERSION_V1; -bool DeprecatedEnableStatsCollectionForNestedTypes = false; /* * ConvertCSVFileTo copies and converts a CSV file at source path to @@ -613,7 +612,7 @@ UnescapeDoubleQuotes(const char *s) return out; } - return pstrdup(s); + return s; } diff --git a/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h b/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h index caf74ff9..a856a06f 100644 --- a/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h +++ b/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h @@ -23,6 +23,8 @@ #include "pg_lake/pgduck/type.h" #include "pg_lake/parquet/leaf_field.h" +extern bool DeprecatedEnableStatsCollectionForNestedTypes; + extern PGDLLEXPORT PGType IcebergFieldToPostgresType(Field * field); extern PGDLLEXPORT Field * PostgresTypeToIcebergField(PGType pgType, bool forAddColumn, diff --git a/pg_lake_iceberg/src/iceberg/iceberg_field.c b/pg_lake_iceberg/src/iceberg/iceberg_field.c index 346a9f7d..0b1d6cc3 100644 --- a/pg_lake_iceberg/src/iceberg/iceberg_field.c +++ b/pg_lake_iceberg/src/iceberg/iceberg_field.c @@ -60,6 +60,8 @@ #include "utils/rel.h" #include "utils/typcache.h" +bool DeprecatedEnableStatsCollectionForNestedTypes = false; + typedef enum IcebergType { ICEBERG_TYPE_INVALID, diff --git a/pg_lake_iceberg/src/init.c b/pg_lake_iceberg/src/init.c index 9fab9f9c..5bd95bf6 100644 --- a/pg_lake_iceberg/src/init.c +++ b/pg_lake_iceberg/src/init.c @@ -126,6 +126,19 @@ _PG_init(void) PGC_SIGHUP, GUC_UNIT_MS, NULL, NULL, NULL); + DefineCustomBoolVariable( + "pg_lake_iceberg.enable_stats_collection_for_nested_types", + gettext_noop("When set to true, stats collection is enabled for nested types." 
+ "We currently do not support pruning for nested types, but you can " + "still get into stats problems with nested types due to parsing " + "discrepancies between Postgres and DuckDB."), + NULL, + &DeprecatedEnableStatsCollectionForNestedTypes, + false, + PGC_SUSET, + GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + DefineCustomBoolVariable( "pg_lake_iceberg.http_client_trace_traffic", gettext_noop("When set to true, HTTP client logging is enabled."), From 1d8ad4c40491eb44dabee1112bd7cb2c40eb7e2c Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Mon, 5 Jan 2026 18:37:12 +0300 Subject: [PATCH 19/46] Rename FindGeneratedDataFiles to GetNewFileOpsFromFileStats Signed-off-by: Ahmet Gedemenli --- pg_lake_table/src/fdw/writable_table.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c index 169bc871..fef3e5ee 100644 --- a/pg_lake_table/src/fdw/writable_table.c +++ b/pg_lake_table/src/fdw/writable_table.c @@ -100,7 +100,7 @@ static List *ApplyInsertFile(Relation rel, char *insertFile, int64 rowCount, static List *ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 liveRowCount, char *deleteFile, int64 deletedRowCount); static List *GetDataFilePathsFromStatsList(List *dataFileStats); -static List *FindGeneratedDataFiles(Oid relationId, List *dataFileStats, +static List *GetNewFileOpsFromFileStats(Oid relationId, List *dataFileStats, int32 partitionSpecId, Partition * partition, int64 rowCount, bool isVerbose, List **newFiles); static bool ShouldRewriteAfterDeletions(int64 sourceRowCount, uint64 totalDeletedRowCount); @@ -341,11 +341,12 @@ GetDataFilePathsFromStatsList(List *dataFileStats) /* - * FindGeneratedDataFiles gets the list of newly written data files (could - * be multiple when file_size_bytes is specified) and adds them to the metadata. + * GetNewFileOpsFromFileStats gets the list of newly written data files (could + * be multiple when file_size_bytes is specified) with their file stats + * and adds them to the metadata operations list to be returned. 
*/ static List * -FindGeneratedDataFiles(Oid relationId, List *dataFileStats, int32 partitionSpecId, Partition * partition, +GetNewFileOpsFromFileStats(Oid relationId, List *dataFileStats, int32 partitionSpecId, Partition * partition, int64 rowCount, bool isVerbose, List **newFiles) { *newFiles = NIL; @@ -1006,7 +1007,7 @@ PrepareToAddQueryResultToTable(Oid relationId, char *readQuery, TupleDesc queryT /* find which files were generated */ List *newFiles = NIL; - List *newFileOps = FindGeneratedDataFiles(relationId, dataFileStats, + List *newFileOps = GetNewFileOpsFromFileStats(relationId, dataFileStats, partitionSpecId, partition, rowCount, isVerbose, &newFiles); From 4436d09afbe23704c24d98521e3b8be4f21a04d8 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Tue, 6 Jan 2026 13:06:29 +0300 Subject: [PATCH 20/46] Add struct ColumnStatsCollector Signed-off-by: Ahmet Gedemenli --- pg_lake_copy/src/copy/copy.c | 2 +- .../include/pg_lake/pgduck/write_data.h | 12 +++++++---- pg_lake_engine/src/pgduck/write_data.c | 13 +++++------- pg_lake_table/src/fdw/writable_table.c | 20 ++++++++++++++----- 4 files changed, 29 insertions(+), 18 deletions(-) diff --git a/pg_lake_copy/src/copy/copy.c b/pg_lake_copy/src/copy/copy.c index 6e433339..99a073e3 100644 --- a/pg_lake_copy/src/copy/copy.c +++ b/pg_lake_copy/src/copy/copy.c @@ -916,7 +916,7 @@ ProcessPgLakeCopyTo(CopyStmt *copyStmt, ParseState *pstate, Relation relation, */ ConvertCSVFileTo(tempCSVPath, tupleDesc, maximumLineLength, destinationPath, destinationFormat, destinationCompression, - copyStmt->options, schema, NULL, NULL); + copyStmt->options, schema, NULL); if (IsCopyToStdout(copyStmt)) { diff --git a/pg_lake_engine/include/pg_lake/pgduck/write_data.h b/pg_lake_engine/include/pg_lake/pgduck/write_data.h index d68f67b7..8c463fa0 100644 --- a/pg_lake_engine/include/pg_lake/pgduck/write_data.h +++ b/pg_lake_engine/include/pg_lake/pgduck/write_data.h @@ -33,6 +33,12 @@ typedef enum ParquetVersion PARQUET_VERSION_V2 = 2 } ParquetVersion; +typedef struct ColumnStatsCollector +{ + List *leafFields; + List **dataFileStats; +} ColumnStatsCollector; + /* pg_lake_table.default_parquet_version */ extern PGDLLEXPORT int DefaultParquetVersion; @@ -44,8 +50,7 @@ extern PGDLLEXPORT void ConvertCSVFileTo(char *csvFilePath, CopyDataCompression destinationCompression, List *formatOptions, DataFileSchema * schema, - List *leafFields, - List **dataFileStats); + ColumnStatsCollector * statsCollector); extern PGDLLEXPORT int64 WriteQueryResultTo(char *query, char *destinationPath, CopyDataFormat destinationFormat, @@ -54,8 +59,7 @@ extern PGDLLEXPORT int64 WriteQueryResultTo(char *query, bool queryHasRowId, DataFileSchema * schema, TupleDesc queryTupleDesc, - List *leafFields, - List **dataFileStats); + ColumnStatsCollector * statsCollector); extern PGDLLEXPORT void AppendFields(StringInfo map, DataFileSchema * schema); extern PGDLLEXPORT List *GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index b5a6cfa4..0bd7dd52 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -81,8 +81,7 @@ ConvertCSVFileTo(char *csvFilePath, TupleDesc csvTupleDesc, int maxLineSize, CopyDataCompression destinationCompression, List *formatOptions, DataFileSchema * schema, - List *leafFields, - List **dataFileStats) + ColumnStatsCollector * statsCollector) { StringInfoData command; @@ -153,8 +152,7 @@ ConvertCSVFileTo(char 
*csvFilePath, TupleDesc csvTupleDesc, int maxLineSize, queryHasRowIds, schema, csvTupleDesc, - leafFields, - dataFileStats); + statsCollector); } @@ -172,8 +170,7 @@ WriteQueryResultTo(char *query, bool queryHasRowId, DataFileSchema * schema, TupleDesc queryTupleDesc, - List *leafFields, - List **dataFileStats) + ColumnStatsCollector * statsCollector) { StringInfoData command; @@ -422,10 +419,10 @@ WriteQueryResultTo(char *query, result = ExecuteQueryOnPGDuckConnection(pgDuckConn, command.data); CheckPGDuckResult(pgDuckConn, result); - if (destinationFormat == DATA_FORMAT_PARQUET && dataFileStats != NULL) + if (destinationFormat == DATA_FORMAT_PARQUET && statsCollector != NULL && statsCollector->dataFileStats != NULL) { /* DuckDB returns COPY 0 when return_stats is used. */ - *dataFileStats = GetDataFileStatsListFromPGResult(result, leafFields, schema, &rowsAffected); + *statsCollector->dataFileStats = GetDataFileStatsListFromPGResult(result, statsCollector->leafFields, schema, &rowsAffected); } else { diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c index fef3e5ee..98928f04 100644 --- a/pg_lake_table/src/fdw/writable_table.c +++ b/pg_lake_table/src/fdw/writable_table.c @@ -260,6 +260,12 @@ PrepareCSVInsertion(Oid relationId, char *insertCSV, int64 rowCount, InsertInProgressFileRecordExtended(dataFilePrefix, isPrefix, deferDeletion); List *dataFileStats = NIL; + List *leafFields = GetLeafFieldsForTable(relationId); + ColumnStatsCollector columnStatsCollector = (ColumnStatsCollector) + { + .leafFields = leafFields, + .dataFileStats = &dataFileStats + }; /* convert insert file to a new file in table format */ ConvertCSVFileTo(insertCSV, @@ -270,8 +276,7 @@ PrepareCSVInsertion(Oid relationId, char *insertCSV, int64 rowCount, compression, options, schema, - GetLeafFieldsForTable(relationId), - &dataFileStats); + &columnStatsCollector); ApplyColumnStatsModeForAllFileStats(relationId, dataFileStats); @@ -579,7 +584,7 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live /* write the deletion file */ ConvertCSVFileTo(deleteFile, deleteTupleDesc, -1, deletionFilePath, - DATA_FORMAT_PARQUET, compression, copyOptions, schema, NULL, NULL); + DATA_FORMAT_PARQUET, compression, copyOptions, schema, NULL); ereport(WriteLogLevel, (errmsg("adding deletion file %s with " INT64_FORMAT " rows ", deletionFilePath, deletedRowCount))); @@ -980,6 +985,12 @@ PrepareToAddQueryResultToTable(Oid relationId, char *readQuery, TupleDesc queryT /* perform compaction */ List *dataFileStats = NIL; + List *leafFields = GetLeafFieldsForTable(relationId); + ColumnStatsCollector columnStatsCollector = (ColumnStatsCollector) + { + .leafFields = leafFields, + .dataFileStats = &dataFileStats + }; int64 rowCount = WriteQueryResultTo(readQuery, newDataFilePath, @@ -989,8 +1000,7 @@ PrepareToAddQueryResultToTable(Oid relationId, char *readQuery, TupleDesc queryT queryHasRowId, schema, queryTupleDesc, - GetLeafFieldsForTable(relationId), - &dataFileStats); + &columnStatsCollector); if (rowCount == 0) { From 96e2162ccb89763ebeeb133996d89c37ef0ac416 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Tue, 6 Jan 2026 13:23:05 +0300 Subject: [PATCH 21/46] Rewrite GetDataFileColumnStatsList, add helpers and logs Signed-off-by: Ahmet Gedemenli --- pg_lake_engine/src/pgduck/write_data.c | 79 ++++++++++++++++++-------- 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index 
0bd7dd52..ffc503ec 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -58,6 +58,8 @@ static void ExtractMinMaxForAllColumns(Datum map, List **names, List **mins, Lis static void ExtractMinMaxForColumn(Datum map, const char *colName, List **names, List **mins, List **maxs); static char *UnescapeDoubleQuotes(const char *s); static List *GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields, DataFileSchema * schema); +static LeafField *FindLeafFieldWithId(List *leafFields, int fieldId); +static int FindIndexInStringList(List *names, const char *targetName); static bool ShouldSkipStatisticsForField(LeafField * leafField); static DuckDBTypeInfo VARCHAR_TYPE = @@ -718,37 +720,15 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields DataFileSchemaField *field = &schema->fields[fieldIndex]; const char *fieldName = field->name; int fieldId = field->id; - int nameIndexFound = -1; - - for (int nameIndex = 0; nameIndex < list_length(names); nameIndex++) - { - char *name = list_nth(names, nameIndex); - - if (strcmp(name, fieldName) == 0) - { - nameIndexFound = nameIndex; - break; - } - } + int nameIndexFound = FindIndexInStringList(names, fieldName); if (nameIndexFound == -1) { + ereport(DEBUG3, (errmsg("field with name %s not found in stats output, skipping", fieldName))); continue; } - LeafField *leafField = NULL; - ListCell *leafCell = NULL; - - foreach(leafCell, leafFields) - { - LeafField *lf = lfirst(leafCell); - - if (lf->fieldId == fieldId && !ShouldSkipStatisticsForField(lf)) - { - leafField = lf; - break; - } - } + LeafField *leafField = FindLeafFieldWithId(leafFields, fieldId); if (leafField != NULL) { @@ -768,6 +748,55 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields } +/* + * FindLeafFieldWithId finds the leaf field with given id in a list of leaf fields. + * Returns NULL if not found. + */ +static LeafField * +FindLeafFieldWithId(List *leafFields, int fieldId) +{ + ListCell *cell = NULL; + + foreach(cell, leafFields) + { + LeafField *lf = lfirst(cell); + + if (lf->fieldId == fieldId) + { + if (ShouldSkipStatisticsForField(lf)) + { + ereport(DEBUG3, (errmsg("skipping statistics for field id %d", fieldId))); + return NULL; + } + + return lf; + } + } + + ereport(DEBUG3, (errmsg("leaf field with id %d not found in leaf fields, skipping", fieldId))); + return NULL; +} + + +/* + * FindIndexInStringList finds the index of targetName in names list. + * Returns -1 if not found. + */ +static int +FindIndexInStringList(List *names, const char *targetName) +{ + for(int index = 0; index < list_length(names); index++) + { + if (strcmp(list_nth(names, index), targetName) == 0) + { + return index; + } + } + + return -1; +} + + /* * ShouldSkipStatisticsForField determines whether statistics should be * skipped for the given leaf field. 
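For reference, a rough sketch of the return_stats result that GetDataFileStatsListFromPGResult and ParseDuckdbColumnMinMaxFromText consume; the table, path, and values below are hypothetical and the exact rendering of the stats map can vary by DuckDB version:

    COPY (SELECT * FROM lineitem) TO 's3://bucket/data/part-0.parquet'
        (FORMAT parquet, return_stats);

    -- one result row per written data file, with (at least) these columns:
    --   filename            s3://bucket/data/part-0.parquet
    --   count               1000
    --   file_size_bytes     24567
    --   column_statistics   {'l_orderkey' => {'min' => '1', 'max' => '6000000', ...},
    --                        'l_comment'  => {'min' => 'a', 'max' => 'zzz', ...}, ...}

The column_statistics text is read back through the MAP(TEXT,MAP(TEXT,TEXT)) type registered in pg_lake_engine--3.0--3.1.sql, and the per-column min/max strings end up in DataFileColumnStats as lowerBoundText and upperBoundText.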
From 6fde32d50f8b97f54ca2a0b058edfa704c636168 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Tue, 6 Jan 2026 14:42:28 +0300 Subject: [PATCH 22/46] Use ColumnStatsCollector in PerformDeleteFromParquet Signed-off-by: Ahmet Gedemenli --- .../include/pg_lake/pgduck/delete_data.h | 4 ++-- pg_lake_engine/src/pgduck/delete_data.c | 8 ++------ pg_lake_table/src/fdw/writable_table.c | 15 +++++++++++---- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/pg_lake_engine/include/pg_lake/pgduck/delete_data.h b/pg_lake_engine/include/pg_lake/pgduck/delete_data.h index 9ec6b834..102c11f3 100644 --- a/pg_lake_engine/include/pg_lake/pgduck/delete_data.h +++ b/pg_lake_engine/include/pg_lake/pgduck/delete_data.h @@ -22,6 +22,7 @@ #include "pg_lake/copy/copy_format.h" #include "pg_lake/parquet/field.h" #include "pg_lake/pgduck/read_data.h" +#include "pg_lake/pgduck/write_data.h" #include "pg_lake/data_file/data_file_stats.h" extern PGDLLEXPORT void PerformDeleteFromParquet(char *sourceDataFilePath, @@ -31,5 +32,4 @@ extern PGDLLEXPORT void PerformDeleteFromParquet(char *sourceDataFilePath, CopyDataCompression destinationCompression, DataFileSchema * schema, ReadDataStats * stats, - List *leafFields, - DataFileStats * *newFileStats); + ColumnStatsCollector *statsCollector); diff --git a/pg_lake_engine/src/pgduck/delete_data.c b/pg_lake_engine/src/pgduck/delete_data.c index aff9e075..6b176fb0 100644 --- a/pg_lake_engine/src/pgduck/delete_data.c +++ b/pg_lake_engine/src/pgduck/delete_data.c @@ -55,8 +55,7 @@ PerformDeleteFromParquet(char *sourcePath, CopyDataCompression destinationCompression, DataFileSchema * schema, ReadDataStats * stats, - List *leafFields, - DataFileStats * *newFileStats) + ColumnStatsCollector *statsCollector) { const char *remainderQuery = DeleteFromParquetQuery(sourcePath, positionDeleteFiles, deletionFilePath, schema, stats); @@ -107,10 +106,7 @@ PerformDeleteFromParquet(char *sourcePath, CheckPGDuckResult(pgDuckConn, result); int64 rowsAffected; - List *dataFileStats = GetDataFileStatsListFromPGResult(result, leafFields, schema, &rowsAffected); - - Assert(dataFileStats != NIL); - *newFileStats = linitial(dataFileStats); + statsCollector->dataFileStats = GetDataFileStatsListFromPGResult(result, statsCollector->leafFields, schema, &rowsAffected); PQclear(result); } diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c index 98928f04..c6675ed5 100644 --- a/pg_lake_table/src/fdw/writable_table.c +++ b/pg_lake_table/src/fdw/writable_table.c @@ -531,13 +531,19 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live uint64 existingDeletedRowCount = sourceRowCount - liveRowCount; ReadDataStats stats = {sourceRowCount, existingDeletedRowCount}; - DataFileStats *newFileStats = NULL; + List *dataFileStats = NIL; + List *leafFields = GetLeafFieldsForTable(relationId); + ColumnStatsCollector columnStatsCollector = (ColumnStatsCollector) + { + .leafFields = leafFields, + .dataFileStats = &dataFileStats + }; PerformDeleteFromParquet(sourcePath, existingPositionDeletes, deleteFile, newDataFilePath, compression, - schema, &stats, GetLeafFieldsForTable(relationId), &newFileStats); + schema, &stats, &columnStatsCollector); - ApplyColumnStatsModeForAllFileStats(relationId, list_make1(newFileStats)); + ApplyColumnStatsModeForAllFileStats(relationId, columnStatsCollector.dataFileStats); int64 newRowCount = liveRowCount - deletedRowCount; @@ -553,9 +559,10 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 
sourceRowCount, int64 live Partition *partition = GetDataFilePartition(relationId, transforms, sourcePath, &partitionSpecId); + Assert(columnStatsCollector.dataFileStats != NIL); /* store the new file in the metadata */ TableMetadataOperation *addOperation = - AddDataFileOperation(newDataFilePath, CONTENT_DATA, newFileStats, partition, partitionSpecId); + AddDataFileOperation(newDataFilePath, CONTENT_DATA, linitial(columnStatsCollector.dataFileStats), partition, partitionSpecId); metadataOperations = lappend(metadataOperations, addOperation); } From 1b1bc05151d67c5e09611050ed741bc46605fe5a Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Tue, 6 Jan 2026 15:29:10 +0300 Subject: [PATCH 23/46] Minor rename variable Signed-off-by: Ahmet Gedemenli --- pg_lake_engine/src/pgduck/write_data.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index ffc503ec..42120c39 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -721,8 +721,8 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields const char *fieldName = field->name; int fieldId = field->id; - int nameIndexFound = FindIndexInStringList(names, fieldName); - if (nameIndexFound == -1) + int nameIndex = FindIndexInStringList(names, fieldName); + if (nameIndex == -1) { ereport(DEBUG3, (errmsg("field with name %s not found in stats output, skipping", fieldName))); continue; @@ -732,8 +732,8 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields if (leafField != NULL) { - char *minStr = list_nth(mins, nameIndexFound); - char *maxStr = list_nth(maxs, nameIndexFound); + char *minStr = list_nth(mins, nameIndex); + char *maxStr = list_nth(maxs, nameIndex); DataFileColumnStats *colStats = palloc0(sizeof(DataFileColumnStats)); From 8c0e688d8408f2f19129366677a4c6671cfc3ad1 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Wed, 7 Jan 2026 12:51:49 +0300 Subject: [PATCH 24/46] Move FindLeafField to engine Signed-off-by: Ahmet Gedemenli --- .../include/pg_lake/pgduck/write_data.h | 1 + pg_lake_engine/src/pgduck/write_data.c | 55 +++++++++---------- pg_lake_iceberg/src/iceberg/iceberg_field.c | 23 -------- 3 files changed, 26 insertions(+), 53 deletions(-) diff --git a/pg_lake_engine/include/pg_lake/pgduck/write_data.h b/pg_lake_engine/include/pg_lake/pgduck/write_data.h index 8c463fa0..60bda0f6 100644 --- a/pg_lake_engine/include/pg_lake/pgduck/write_data.h +++ b/pg_lake_engine/include/pg_lake/pgduck/write_data.h @@ -65,3 +65,4 @@ extern PGDLLEXPORT List *GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSchema * schema, int64 *totalRowCount); +extern PGDLLEXPORT LeafField *FindLeafField(List *leafFieldList, int fieldId); diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index 42120c39..4463b83f 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -58,7 +58,6 @@ static void ExtractMinMaxForAllColumns(Datum map, List **names, List **mins, Lis static void ExtractMinMaxForColumn(Datum map, const char *colName, List **names, List **mins, List **maxs); static char *UnescapeDoubleQuotes(const char *s); static List *GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields, DataFileSchema * schema); -static LeafField *FindLeafFieldWithId(List *leafFields, int fieldId); static int FindIndexInStringList(List *names, 
const char *targetName); static bool ShouldSkipStatisticsForField(LeafField * leafField); @@ -728,20 +727,28 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields continue; } - LeafField *leafField = FindLeafFieldWithId(leafFields, fieldId); + LeafField *leafField = FindLeafField(leafFields, fieldId); - if (leafField != NULL) + if (leafField == NULL) { - char *minStr = list_nth(mins, nameIndex); - char *maxStr = list_nth(maxs, nameIndex); + ereport(DEBUG3, (errmsg("leaf field with id %d not found in leaf fields, skipping", fieldId))); + continue; + } + else if(ShouldSkipStatisticsForField(leafField)) + { + ereport(DEBUG3, (errmsg("skipping statistics for field with id %d", fieldId))); + continue; + } - DataFileColumnStats *colStats = palloc0(sizeof(DataFileColumnStats)); + char *minStr = list_nth(mins, nameIndex); + char *maxStr = list_nth(maxs, nameIndex); - colStats->leafField = *leafField; - colStats->lowerBoundText = pstrdup(minStr); - colStats->upperBoundText = pstrdup(maxStr); - columnStatsList = lappend(columnStatsList, colStats); - } + DataFileColumnStats *colStats = palloc0(sizeof(DataFileColumnStats)); + + colStats->leafField = *leafField; + colStats->lowerBoundText = pstrdup(minStr); + colStats->upperBoundText = pstrdup(maxStr); + columnStatsList = lappend(columnStatsList, colStats); } return columnStatsList; @@ -749,31 +756,19 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields /* - * FindLeafFieldWithId finds the leaf field with given id in a list of leaf fields. - * Returns NULL if not found. - */ -static LeafField * -FindLeafFieldWithId(List *leafFields, int fieldId) +* FindLeafField finds the leaf field with the given fieldId. +*/ +LeafField * +FindLeafField(List *leafFieldList, int fieldId) { - ListCell *cell = NULL; - - foreach(cell, leafFields) + foreach_ptr(LeafField, leafField, leafFieldList) { - LeafField *lf = lfirst(cell); - - if (lf->fieldId == fieldId) + if (leafField->fieldId == fieldId) { - if (ShouldSkipStatisticsForField(lf)) - { - ereport(DEBUG3, (errmsg("skipping statistics for field id %d", fieldId))); - return NULL; - } - - return lf; + return leafField; } } - ereport(DEBUG3, (errmsg("leaf field with id %d not found in leaf fields, skipping", fieldId))); return NULL; } diff --git a/pg_lake_iceberg/src/iceberg/iceberg_field.c b/pg_lake_iceberg/src/iceberg/iceberg_field.c index 0b1d6cc3..8014ed73 100644 --- a/pg_lake_iceberg/src/iceberg/iceberg_field.c +++ b/pg_lake_iceberg/src/iceberg/iceberg_field.c @@ -178,7 +178,6 @@ static char *PostgresBaseTypeIdToIcebergTypeName(PGType pgType); static IcebergTypeInfo * GetIcebergTypeInfoFromTypeName(const char *typeName); static const char *GetIcebergJsonSerializedConstDefaultIfExists(const char *attrName, Field * field, Node *defaultExpr); static List *FetchRowGroupStats(PGDuckConnection * pgDuckConn, List *fieldIdList, char *path); -static LeafField * FindLeafField(List *leafFieldList, int fieldId); static char *PrepareRowGroupStatsMinMaxQuery(List *rowGroupStatList); static char *SerializeTextArrayTypeToPgDuck(ArrayType *array); static ArrayType *ReadArrayFromText(char *arrayText); @@ -1095,28 +1094,6 @@ FetchRowGroupStats(PGDuckConnection * pgDuckConn, List *fieldIdList, char *path) } -/* -* FindLeafField finds the leaf field with the given fieldId. 
-*/ -static LeafField * -FindLeafField(List *leafFieldList, int fieldId) -{ - ListCell *lc; - - foreach(lc, leafFieldList) - { - LeafField *leafField = lfirst(lc); - - if (leafField->fieldId == fieldId) - { - return leafField; - } - } - - return NULL; -} - - /* * For the given rowGroupStatList, prepare the query to get the min and max values * for each field. In the end, we will have a query like: From 994cc33cd8777bce43bc4550da37571d004129ad Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Wed, 7 Jan 2026 13:05:48 +0300 Subject: [PATCH 25/46] Move ShouldSkipStatistics to engine Signed-off-by: Ahmet Gedemenli --- .../include/pg_lake/pgduck/write_data.h | 2 + pg_lake_engine/src/pgduck/write_data.c | 72 ++++++++++++---- .../include/pg_lake/iceberg/iceberg_field.h | 1 - pg_lake_iceberg/src/iceberg/iceberg_field.c | 82 ------------------- .../src/iceberg/iceberg_type_binary_serde.c | 1 + .../src/iceberg/iceberg_type_json_serde.c | 1 + .../src/test/test_iceberg_binary_serde.c | 1 + 7 files changed, 60 insertions(+), 100 deletions(-) diff --git a/pg_lake_engine/include/pg_lake/pgduck/write_data.h b/pg_lake_engine/include/pg_lake/pgduck/write_data.h index 60bda0f6..6f548516 100644 --- a/pg_lake_engine/include/pg_lake/pgduck/write_data.h +++ b/pg_lake_engine/include/pg_lake/pgduck/write_data.h @@ -66,3 +66,5 @@ extern PGDLLEXPORT List *GetDataFileStatsListFromPGResult(PGresult *result, DataFileSchema * schema, int64 *totalRowCount); extern PGDLLEXPORT LeafField *FindLeafField(List *leafFieldList, int fieldId); +extern PGDLLEXPORT bool ShouldSkipStatistics(LeafField * leafField); +extern PGDLLEXPORT bool PGTypeRequiresConversionToIcebergString(Field * field, PGType pgType); diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index 4463b83f..0988df98 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -59,7 +59,6 @@ static void ExtractMinMaxForColumn(Datum map, const char *colName, List **names, static char *UnescapeDoubleQuotes(const char *s); static List *GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields, DataFileSchema * schema); static int FindIndexInStringList(List *names, const char *targetName); -static bool ShouldSkipStatisticsForField(LeafField * leafField); static DuckDBTypeInfo VARCHAR_TYPE = { @@ -734,7 +733,7 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields ereport(DEBUG3, (errmsg("leaf field with id %d not found in leaf fields, skipping", fieldId))); continue; } - else if(ShouldSkipStatisticsForField(leafField)) + else if(ShouldSkipStatistics(leafField)) { ereport(DEBUG3, (errmsg("skipping statistics for field with id %d", fieldId))); continue; @@ -793,39 +792,53 @@ FindIndexInStringList(List *names, const char *targetName) /* - * ShouldSkipStatisticsForField determines whether statistics should be - * skipped for the given leaf field. - */ -static bool -ShouldSkipStatisticsForField(LeafField * leafField) +* ShouldSkipStatistics returns true if the statistics should be skipped for the +* given leaf field. 
+*/ +bool +ShouldSkipStatistics(LeafField * leafField) { Field *field = leafField->field; PGType pgType = leafField->pgType; Oid pgTypeOid = pgType.postgresTypeOid; - if (IsGeometryTypeId(pgType.postgresTypeOid)) - { - return true; - } - else if (strcmp(field->field.scalar.typeName, "string") == 0 && - pgType.postgresTypeOid != TEXTOID && - pgTypeOid != VARCHAROID && - pgTypeOid != BPCHAROID && - pgTypeOid != CHAROID) + if (PGTypeRequiresConversionToIcebergString(field, pgType)) { - return true; + if (!(pgTypeOid == VARCHAROID || pgTypeOid == BPCHAROID || + pgTypeOid == CHAROID)) + { + /* + * Although there are no direct equivalents of these types on + * Iceberg, it is pretty safe to support pruning on these types. + */ + return true; + } } else if (pgTypeOid == BYTEAOID) { + /* + * parquet_metadata function sometimes returns a varchar repr of blob, + * which cannot be properly deserialized by Postgres. (when there is + * "\" or nonprintable chars in the blob ) See issue Old repo: + * issues/957 + */ return true; } else if (pgTypeOid == UUIDOID) { + /* + * DuckDB does not keep statistics for UUID type. We should skip + * statistics for UUID type. + */ return true; } else if (leafField->level != 1) { + /* + * We currently do not support pruning on array, map, and composite + * types. So there's no need to collect stats for them. + */ return true; } @@ -833,6 +846,31 @@ ShouldSkipStatisticsForField(LeafField * leafField) } +/* + * PGTypeRequiresConversionToIcebergString returns true if the given Postgres type + * requires conversion to Iceberg string. + * Some of the Postgres types cannot be directly mapped to an Iceberg type. + * e.g. custom types like hstore + */ +bool +PGTypeRequiresConversionToIcebergString(Field * field, PGType pgType) +{ + /* + * We treat geometry as binary within the Iceberg schema, which is encoded + * as a hexadecimal string according to the spec. As it happens, the + * Postgres output function of geometry produces a hexadecimal WKB string, + * so we can use the regular text output function to convert to an Iceberg + * value. + */ + if (IsGeometryTypeId(pgType.postgresTypeOid)) + { + return true; + } + + return strcmp(field->field.scalar.typeName, "string") == 0 && pgType.postgresTypeOid != TEXTOID; +} + + /* * TupleDescToProjectionList converts a PostgreSQL tuple descriptor to * projection list in string form that can be used for writes. 
diff --git a/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h b/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h index a856a06f..e0599c9f 100644 --- a/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h +++ b/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h @@ -31,7 +31,6 @@ extern PGDLLEXPORT Field * PostgresTypeToIcebergField(PGType pgType, int *subFieldIndex); extern PGDLLEXPORT void EnsureIcebergField(Field * field); extern PGDLLEXPORT const char *IcebergTypeNameToDuckdbTypeName(const char *icebergTypeName); -extern PGDLLEXPORT bool PGTypeRequiresConversionToIcebergString(Field * field, PGType pgType); extern PGDLLEXPORT DataFileSchema * CreatePositionDeleteDataFileSchema(void); extern PGDLLEXPORT const char *GetIcebergJsonSerializedDefaultExpr(TupleDesc tupdesc, AttrNumber attnum, FieldStructElement * structElementField); diff --git a/pg_lake_iceberg/src/iceberg/iceberg_field.c b/pg_lake_iceberg/src/iceberg/iceberg_field.c index 8014ed73..e16e208a 100644 --- a/pg_lake_iceberg/src/iceberg/iceberg_field.c +++ b/pg_lake_iceberg/src/iceberg/iceberg_field.c @@ -182,7 +182,6 @@ static char *PrepareRowGroupStatsMinMaxQuery(List *rowGroupStatList); static char *SerializeTextArrayTypeToPgDuck(ArrayType *array); static ArrayType *ReadArrayFromText(char *arrayText); static List *GetFieldMinMaxStats(PGDuckConnection * pgDuckConn, List *rowGroupStatsList); -static bool ShouldSkipStatistics(LeafField * leafField); /* @@ -440,33 +439,6 @@ IcebergFieldToPostgresType(Field * field) } -/* - * PGTypeRequiresConversionToIcebergString returns true if the given Postgres type - * requires conversion to Iceberg string. - * Some of the Postgres types cannot be directly mapped to an Iceberg type. - * e.g. custom types like hstore - */ -bool -PGTypeRequiresConversionToIcebergString(Field * field, PGType pgType) -{ - EnsureIcebergField(field); - - /* - * We treat geometry as binary within the Iceberg schema, which is encoded - * as a hexadecimal string according to the spec. As it happens, the - * Postgres output function of geometry produces a hexadecimal WKB string, - * so we can use the regular text output function to convert to an Iceberg - * value. - */ - if (IsGeometryTypeId(pgType.postgresTypeOid)) - { - return true; - } - - return strcmp(field->field.scalar.typeName, "string") == 0 && pgType.postgresTypeOid != TEXTOID; -} - - /* * GetDuckDBTypeNameFromIcebergTypeName returns corresponding DuckDB type for * given Iceberg type. @@ -1287,57 +1259,3 @@ GetFieldMinMaxStats(PGDuckConnection * pgDuckConn, List *rowGroupStatList) return columnStatsList; } - -/* -* ShouldSkipStatistics returns true if the statistics should be skipped for the -* given leaf field. -*/ -static bool -ShouldSkipStatistics(LeafField * leafField) -{ - Field *field = leafField->field; - PGType pgType = leafField->pgType; - - Oid pgTypeOid = pgType.postgresTypeOid; - - if (PGTypeRequiresConversionToIcebergString(field, pgType)) - { - if (!(pgTypeOid == VARCHAROID || pgTypeOid == BPCHAROID || - pgTypeOid == CHAROID)) - { - /* - * Although there are no direct equivalents of these types on - * Iceberg, it is pretty safe to support pruning on these types. - */ - return true; - } - } - else if (pgTypeOid == BYTEAOID) - { - /* - * parquet_metadata function sometimes returns a varchar repr of blob, - * which cannot be properly deserialized by Postgres. 
(when there is - * "\" or nonprintable chars in the blob ) See issue Old repo: - * issues/957 - */ - return true; - } - else if (pgTypeOid == UUIDOID) - { - /* - * DuckDB does not keep statistics for UUID type. We should skip - * statistics for UUID type. - */ - return true; - } - else if (leafField->level != 1) - { - /* - * We currently do not support pruning on array, map, and composite - * types. So there's no need to collect stats for them. - */ - return true; - } - - return false; -} diff --git a/pg_lake_iceberg/src/iceberg/iceberg_type_binary_serde.c b/pg_lake_iceberg/src/iceberg/iceberg_type_binary_serde.c index 73111828..a3d5755e 100644 --- a/pg_lake_iceberg/src/iceberg/iceberg_type_binary_serde.c +++ b/pg_lake_iceberg/src/iceberg/iceberg_type_binary_serde.c @@ -26,6 +26,7 @@ #include "pg_lake/iceberg/iceberg_type_binary_serde.h" #include "pg_lake/iceberg/iceberg_type_numeric_binary_serde.h" #include "pg_lake/iceberg/utils.h" +#include "pg_lake/pgduck/write_data.h" #include "port/pg_bswap.h" #include "utils/builtins.h" diff --git a/pg_lake_iceberg/src/iceberg/iceberg_type_json_serde.c b/pg_lake_iceberg/src/iceberg/iceberg_type_json_serde.c index e2c0b951..5110f08c 100644 --- a/pg_lake_iceberg/src/iceberg/iceberg_type_json_serde.c +++ b/pg_lake_iceberg/src/iceberg/iceberg_type_json_serde.c @@ -27,6 +27,7 @@ #include "pg_lake/iceberg/iceberg_type_json_serde.h" #include "pg_lake/json/json_utils.h" #include "pg_lake/pgduck/map.h" +#include "pg_lake/pgduck/write_data.h" #include "pg_lake/util/spi_helpers.h" #include "access/tupdesc.h" diff --git a/pg_lake_iceberg/src/test/test_iceberg_binary_serde.c b/pg_lake_iceberg/src/test/test_iceberg_binary_serde.c index 616d61f5..12cd559f 100644 --- a/pg_lake_iceberg/src/test/test_iceberg_binary_serde.c +++ b/pg_lake_iceberg/src/test/test_iceberg_binary_serde.c @@ -24,6 +24,7 @@ #include "pg_lake/iceberg/iceberg_type_binary_serde.h" #include "pg_lake/parquet/leaf_field.h" #include "pg_lake/pgduck/type.h" +#include "pg_lake/pgduck/write_data.h" #include "pg_lake/util/numeric.h" #include "utils/builtins.h" From 78c536aec46386f703ab8ce732d6f79e65393b86 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Wed, 7 Jan 2026 13:23:13 +0300 Subject: [PATCH 26/46] Add leaf_field.c Signed-off-by: Ahmet Gedemenli --- .../include/pg_lake/parquet/leaf_field.h | 1 + .../include/pg_lake/pgduck/write_data.h | 1 + pg_lake_engine/src/data_file/data_files.c | 21 +--------- pg_lake_engine/src/parquet/leaf_field.c | 41 +++++++++++++++++++ 4 files changed, 44 insertions(+), 20 deletions(-) create mode 100644 pg_lake_engine/src/parquet/leaf_field.c diff --git a/pg_lake_engine/include/pg_lake/parquet/leaf_field.h b/pg_lake_engine/include/pg_lake/parquet/leaf_field.h index e185f397..624a41e7 100644 --- a/pg_lake_engine/include/pg_lake/parquet/leaf_field.h +++ b/pg_lake_engine/include/pg_lake/parquet/leaf_field.h @@ -51,6 +51,7 @@ typedef struct LeafField extern PGDLLEXPORT int LeafFieldCompare(const ListCell *a, const ListCell *b); extern PGDLLEXPORT bool SchemaFieldsEquivalent(DataFileSchemaField * fieldA, DataFileSchemaField * fieldB); +extern PGDLLEXPORT LeafField DeepCopyLeafField(const LeafField * leafField); #if PG_VERSION_NUM < 170000 extern PGDLLEXPORT int pg_cmp_s32(int32 a, int32 b); #endif diff --git a/pg_lake_engine/include/pg_lake/pgduck/write_data.h b/pg_lake_engine/include/pg_lake/pgduck/write_data.h index 6f548516..9b0b945d 100644 --- a/pg_lake_engine/include/pg_lake/pgduck/write_data.h +++ b/pg_lake_engine/include/pg_lake/pgduck/write_data.h @@ -21,6 +21,7 
@@ #include "libpq-fe.h" #include "pg_lake/copy/copy_format.h" #include "pg_lake/parquet/field.h" +#include "pg_lake/parquet/leaf_field.h" #include "nodes/pg_list.h" /* pg_lake_table.target_row_group_size_mb */ diff --git a/pg_lake_engine/src/data_file/data_files.c b/pg_lake_engine/src/data_file/data_files.c index 4ddbdd75..526dd11f 100644 --- a/pg_lake_engine/src/data_file/data_files.c +++ b/pg_lake_engine/src/data_file/data_files.c @@ -17,10 +17,9 @@ #include "postgres.h" #include "pg_lake/data_file/data_files.h" +#include "pg_lake/parquet/leaf_field.h" #include "pg_lake/util/string_utils.h" -static LeafField DeepCopyLeafField(const LeafField * leafField); - /* * AddDataFileOperation creates a TableMetadataOperation for adding a new data * file. @@ -149,21 +148,3 @@ DeepCopyDataFileStats(const DataFileStats * stats) return copiedStats; } - - -/* - * DeepCopyLeafField deep copies a LeafField. - */ -static LeafField -DeepCopyLeafField(const LeafField * leafField) -{ - LeafField *copiedLeafField = palloc0(sizeof(LeafField)); - - copiedLeafField->fieldId = leafField->fieldId; - copiedLeafField->field = DeepCopyField(leafField->field); - copiedLeafField->duckTypeName = pstrdup(leafField->duckTypeName); - copiedLeafField->level = leafField->level; - copiedLeafField->pgType = leafField->pgType; - - return *copiedLeafField; -} diff --git a/pg_lake_engine/src/parquet/leaf_field.c b/pg_lake_engine/src/parquet/leaf_field.c new file mode 100644 index 00000000..9e7e350c --- /dev/null +++ b/pg_lake_engine/src/parquet/leaf_field.c @@ -0,0 +1,41 @@ +/* + * Copyright 2025 Snowflake Inc. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "postgres.h" + +#include "common/int.h" + +#include "pg_lake/parquet/field.h" +#include "pg_lake/parquet/leaf_field.h" +#include "pg_lake/util/string_utils.h" + +/* + * DeepCopyLeafField deep copies a LeafField. 
+ */ +LeafField +DeepCopyLeafField(const LeafField * leafField) +{ + LeafField *copiedLeafField = palloc0(sizeof(LeafField)); + + copiedLeafField->fieldId = leafField->fieldId; + copiedLeafField->field = DeepCopyField(leafField->field); + copiedLeafField->duckTypeName = pstrdup(leafField->duckTypeName); + copiedLeafField->level = leafField->level; + copiedLeafField->pgType = leafField->pgType; + + return *copiedLeafField; +} From 40ec78560a741699fae41ee3c1deb8bac3794603 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Wed, 7 Jan 2026 13:29:25 +0300 Subject: [PATCH 27/46] Comment Signed-off-by: Ahmet Gedemenli --- pg_lake_engine/pg_lake_engine--3.0--3.1.sql | 4 +++- pg_lake_engine/src/pgduck/write_data.c | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pg_lake_engine/pg_lake_engine--3.0--3.1.sql b/pg_lake_engine/pg_lake_engine--3.0--3.1.sql index 31cdffb3..1c178da7 100644 --- a/pg_lake_engine/pg_lake_engine--3.0--3.1.sql +++ b/pg_lake_engine/pg_lake_engine--3.0--3.1.sql @@ -34,6 +34,8 @@ CREATE FUNCTION __lake__internal__nsp__.from_hex(text) IMMUTABLE PARALLEL SAFE STRICT AS 'MODULE_PATHNAME', $function$pg_lake_internal_dummy_function$function$; --- Register map types, will be used for parsing DuckDB maps +-- Register map types, will be used for parsing DuckDB maps for COPY .. (return_stats) +-- we prefer to create in the extension script to avoid concurrent attempts to create +-- the same map, which may throw errors SELECT map_type.create('TEXT','TEXT'); SELECT map_type.create('TEXT','map_type.key_text_val_text'); diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index 0988df98..1aa3306b 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -760,8 +760,11 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields LeafField * FindLeafField(List *leafFieldList, int fieldId) { - foreach_ptr(LeafField, leafField, leafFieldList) + ListCell *cell = NULL; + foreach(cell, leafFieldList) { + LeafField *leafField = (LeafField *) lfirst(cell); + if (leafField->fieldId == fieldId) { return leafField; From 181001333ef814b185acc8db488db0300d8598e5 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Wed, 7 Jan 2026 14:43:33 +0300 Subject: [PATCH 28/46] fix reference stats list Signed-off-by: Ahmet Gedemenli --- pg_lake_engine/src/pgduck/delete_data.c | 2 +- pg_lake_engine/src/pgduck/write_data.c | 6 ++++-- pg_lake_table/src/fdw/writable_table.c | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pg_lake_engine/src/pgduck/delete_data.c b/pg_lake_engine/src/pgduck/delete_data.c index 6b176fb0..2bd1382a 100644 --- a/pg_lake_engine/src/pgduck/delete_data.c +++ b/pg_lake_engine/src/pgduck/delete_data.c @@ -106,7 +106,7 @@ PerformDeleteFromParquet(char *sourcePath, CheckPGDuckResult(pgDuckConn, result); int64 rowsAffected; - statsCollector->dataFileStats = GetDataFileStatsListFromPGResult(result, statsCollector->leafFields, schema, &rowsAffected); + *(statsCollector->dataFileStats) = GetDataFileStatsListFromPGResult(result, statsCollector->leafFields, schema, &rowsAffected); PQclear(result); } diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index 1aa3306b..cabff321 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -839,8 +839,10 @@ ShouldSkipStatistics(LeafField * leafField) else if (leafField->level != 1) { /* - * We currently do not support pruning on 
array, map, and composite - * types. So there's no need to collect stats for them. + * We currently do not support pruning on array, map and composite + * types. So there's no need to collect stats for them. Note that + * in the past we did collect, and have some tests commented out, + * such as skippedtest_pg_lake_iceberg_table_complex_values. */ return true; } diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c index c6675ed5..0e01368f 100644 --- a/pg_lake_table/src/fdw/writable_table.c +++ b/pg_lake_table/src/fdw/writable_table.c @@ -543,7 +543,7 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live deleteFile, newDataFilePath, compression, schema, &stats, &columnStatsCollector); - ApplyColumnStatsModeForAllFileStats(relationId, columnStatsCollector.dataFileStats); + ApplyColumnStatsModeForAllFileStats(relationId, *(columnStatsCollector.dataFileStats)); int64 newRowCount = liveRowCount - deletedRowCount; @@ -562,7 +562,7 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live Assert(columnStatsCollector.dataFileStats != NIL); /* store the new file in the metadata */ TableMetadataOperation *addOperation = - AddDataFileOperation(newDataFilePath, CONTENT_DATA, linitial(columnStatsCollector.dataFileStats), partition, partitionSpecId); + AddDataFileOperation(newDataFilePath, CONTENT_DATA, linitial(*(columnStatsCollector.dataFileStats)), partition, partitionSpecId); metadataOperations = lappend(metadataOperations, addOperation); } From 27212624bf701ac38d386e6e9f075c6c1811b757 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Wed, 7 Jan 2026 14:54:43 +0300 Subject: [PATCH 29/46] fixup Signed-off-by: Ahmet Gedemenli --- pg_lake_engine/src/pgduck/delete_data.c | 2 +- pg_lake_table/src/fdw/writable_table.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pg_lake_engine/src/pgduck/delete_data.c b/pg_lake_engine/src/pgduck/delete_data.c index 2bd1382a..5b4d8fdd 100644 --- a/pg_lake_engine/src/pgduck/delete_data.c +++ b/pg_lake_engine/src/pgduck/delete_data.c @@ -106,7 +106,7 @@ PerformDeleteFromParquet(char *sourcePath, CheckPGDuckResult(pgDuckConn, result); int64 rowsAffected; - *(statsCollector->dataFileStats) = GetDataFileStatsListFromPGResult(result, statsCollector->leafFields, schema, &rowsAffected); + *statsCollector->dataFileStats = GetDataFileStatsListFromPGResult(result, statsCollector->leafFields, schema, &rowsAffected); PQclear(result); } diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c index 0e01368f..103b294f 100644 --- a/pg_lake_table/src/fdw/writable_table.c +++ b/pg_lake_table/src/fdw/writable_table.c @@ -543,7 +543,7 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live deleteFile, newDataFilePath, compression, schema, &stats, &columnStatsCollector); - ApplyColumnStatsModeForAllFileStats(relationId, *(columnStatsCollector.dataFileStats)); + ApplyColumnStatsModeForAllFileStats(relationId, *columnStatsCollector.dataFileStats); int64 newRowCount = liveRowCount - deletedRowCount; @@ -559,10 +559,10 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live Partition *partition = GetDataFilePartition(relationId, transforms, sourcePath, &partitionSpecId); - Assert(columnStatsCollector.dataFileStats != NIL); + Assert(*columnStatsCollector.dataFileStats != NIL); /* store the new file in the metadata */ TableMetadataOperation *addOperation = - 
AddDataFileOperation(newDataFilePath, CONTENT_DATA, linitial(*(columnStatsCollector.dataFileStats)), partition, partitionSpecId); + AddDataFileOperation(newDataFilePath, CONTENT_DATA, linitial(*columnStatsCollector.dataFileStats), partition, partitionSpecId); metadataOperations = lappend(metadataOperations, addOperation); } From 076e76179669a8b453479b5406a4ac04822b168d Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Wed, 7 Jan 2026 15:58:29 +0300 Subject: [PATCH 30/46] Comment Signed-off-by: Ahmet Gedemenli --- pg_lake_table/src/fdw/writable_table.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c index 103b294f..2eff066f 100644 --- a/pg_lake_table/src/fdw/writable_table.c +++ b/pg_lake_table/src/fdw/writable_table.c @@ -559,10 +559,18 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live Partition *partition = GetDataFilePartition(relationId, transforms, sourcePath, &partitionSpecId); - Assert(*columnStatsCollector.dataFileStats != NIL); + List *newFileStatsList = *columnStatsCollector.dataFileStats; + Assert(newFileStatsList != NIL); + + /* + * while deleting from parquet, we do not add file_size_bytes option to COPY command, + * so we can assume that we'll have only a single file. + */ + DataFileStats *newFileStats = linitial(newFileStatsList); + /* store the new file in the metadata */ TableMetadataOperation *addOperation = - AddDataFileOperation(newDataFilePath, CONTENT_DATA, linitial(*columnStatsCollector.dataFileStats), partition, partitionSpecId); + AddDataFileOperation(newDataFilePath, CONTENT_DATA, newFileStats, partition, partitionSpecId); metadataOperations = lappend(metadataOperations, addOperation); } From 0db1368e3e4e2aa75cadfd43e21229c5f0e3eff3 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Wed, 7 Jan 2026 18:41:11 +0300 Subject: [PATCH 31/46] Get rid of redundant string duplication Signed-off-by: Ahmet Gedemenli --- pg_lake_engine/src/pgduck/write_data.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index cabff321..e7e9502e 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -53,10 +53,10 @@ static DuckDBTypeInfo ChooseDuckDBEngineTypeForWrite(PGType postgresType, CopyDataFormat destinationFormat); static void AppendFieldIdValue(StringInfo map, Field * field, int fieldId); static const char *ParquetVersionToString(ParquetVersion version); -static void ParseDuckdbColumnMinMaxFromText(const char *input, List **names, List **mins, List **maxs); +static void ParseDuckdbColumnMinMaxFromText(char *input, List **names, List **mins, List **maxs); static void ExtractMinMaxForAllColumns(Datum map, List **names, List **mins, List **maxs); static void ExtractMinMaxForColumn(Datum map, const char *colName, List **names, List **mins, List **maxs); -static char *UnescapeDoubleQuotes(const char *s); +static const char *UnescapeDoubleQuotes(const char *s); static List *GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields, DataFileSchema * schema); static int FindIndexInStringList(List *names, const char *targetName); @@ -575,7 +575,7 @@ ExtractMinMaxForColumn(Datum map, const char *colName, List **names, List **mins * UnescapeDoubleQuotes unescapes any doubled quotes. * e.g. 
"ab\"\"cd\"\"ee" becomes "ab\"cd\"ee" */ -static char * +static const char * UnescapeDoubleQuotes(const char *s) { if (s == NULL) @@ -656,7 +656,7 @@ ExtractMinMaxForAllColumns(Datum map, List **names, List **mins, List **maxs) * pg_map text key is escaped for double quotes. We need to unescape * them. */ - char *unescapedColName = UnescapeDoubleQuotes(colName); + const char *unescapedColName = UnescapeDoubleQuotes(colName); ExtractMinMaxForColumn(colStatsDatum, unescapedColName, names, mins, maxs); } @@ -674,7 +674,7 @@ ExtractMinMaxForAllColumns(Datum map, List **names, List **mins, List **maxs) * } */ static void -ParseDuckdbColumnMinMaxFromText(const char *input, List **names, List **mins, List **maxs) +ParseDuckdbColumnMinMaxFromText(char *input, List **names, List **mins, List **maxs) { /* * e.g. { 'id_col' => {'min' => '12', 'max' => 23, ...}, 'name_col' => @@ -692,7 +692,7 @@ ParseDuckdbColumnMinMaxFromText(const char *input, List **names, List **mins, Li getTypeInputInfo(returnStatsMapId, &typinput, &typioparam); - Datum statsMapDatum = OidInputFunctionCall(typinput, pstrdup(input), typioparam, -1); + Datum statsMapDatum = OidInputFunctionCall(typinput, input, typioparam, -1); /* * extract min and max for each column: iterate the underlying map datum From 5276ba3f2cfbe7574cbb6cfe9715b4bde75978c4 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Wed, 7 Jan 2026 19:22:29 +0300 Subject: [PATCH 32/46] Use the collector as return type Signed-off-by: Ahmet Gedemenli --- pg_lake_copy/src/copy/copy.c | 2 +- .../include/pg_lake/pgduck/delete_data.h | 16 ++-- .../include/pg_lake/pgduck/write_data.h | 47 ++++++------ pg_lake_engine/src/pgduck/delete_data.c | 10 ++- pg_lake_engine/src/pgduck/write_data.c | 54 +++++++------- pg_lake_table/src/fdw/writable_table.c | 73 +++++++------------ 6 files changed, 95 insertions(+), 107 deletions(-) diff --git a/pg_lake_copy/src/copy/copy.c b/pg_lake_copy/src/copy/copy.c index 99a073e3..ce4b609e 100644 --- a/pg_lake_copy/src/copy/copy.c +++ b/pg_lake_copy/src/copy/copy.c @@ -916,7 +916,7 @@ ProcessPgLakeCopyTo(CopyStmt *copyStmt, ParseState *pstate, Relation relation, */ ConvertCSVFileTo(tempCSVPath, tupleDesc, maximumLineLength, destinationPath, destinationFormat, destinationCompression, - copyStmt->options, schema, NULL); + copyStmt->options, schema, NIL); if (IsCopyToStdout(copyStmt)) { diff --git a/pg_lake_engine/include/pg_lake/pgduck/delete_data.h b/pg_lake_engine/include/pg_lake/pgduck/delete_data.h index 102c11f3..da5170e5 100644 --- a/pg_lake_engine/include/pg_lake/pgduck/delete_data.h +++ b/pg_lake_engine/include/pg_lake/pgduck/delete_data.h @@ -25,11 +25,11 @@ #include "pg_lake/pgduck/write_data.h" #include "pg_lake/data_file/data_file_stats.h" -extern PGDLLEXPORT void PerformDeleteFromParquet(char *sourceDataFilePath, - List *positionDeleteFiles, - char *deletionFilePath, - char *destinationPath, - CopyDataCompression destinationCompression, - DataFileSchema * schema, - ReadDataStats * stats, - ColumnStatsCollector *statsCollector); +extern PGDLLEXPORT ColumnStatsCollector *PerformDeleteFromParquet(char *sourceDataFilePath, + List *positionDeleteFiles, + char *deletionFilePath, + char *destinationPath, + CopyDataCompression destinationCompression, + DataFileSchema * schema, + ReadDataStats * stats, + List *leafFields); diff --git a/pg_lake_engine/include/pg_lake/pgduck/write_data.h b/pg_lake_engine/include/pg_lake/pgduck/write_data.h index 9b0b945d..4cf38d41 100644 --- a/pg_lake_engine/include/pg_lake/pgduck/write_data.h +++ 
b/pg_lake_engine/include/pg_lake/pgduck/write_data.h @@ -36,36 +36,35 @@ typedef enum ParquetVersion typedef struct ColumnStatsCollector { - List *leafFields; - List **dataFileStats; + int64 totalRowCount; + List *dataFileStats; } ColumnStatsCollector; /* pg_lake_table.default_parquet_version */ extern PGDLLEXPORT int DefaultParquetVersion; -extern PGDLLEXPORT void ConvertCSVFileTo(char *csvFilePath, - TupleDesc tupleDesc, - int maxLineSize, - char *destinationPath, - CopyDataFormat destinationFormat, - CopyDataCompression destinationCompression, - List *formatOptions, - DataFileSchema * schema, - ColumnStatsCollector * statsCollector); -extern PGDLLEXPORT int64 WriteQueryResultTo(char *query, - char *destinationPath, - CopyDataFormat destinationFormat, - CopyDataCompression destinationCompression, - List *formatOptions, - bool queryHasRowId, - DataFileSchema * schema, - TupleDesc queryTupleDesc, - ColumnStatsCollector * statsCollector); -extern PGDLLEXPORT void AppendFields(StringInfo map, DataFileSchema * schema); -extern PGDLLEXPORT List *GetDataFileStatsListFromPGResult(PGresult *result, - List *leafFields, +extern PGDLLEXPORT ColumnStatsCollector *ConvertCSVFileTo(char *csvFilePath, + TupleDesc tupleDesc, + int maxLineSize, + char *destinationPath, + CopyDataFormat destinationFormat, + CopyDataCompression destinationCompression, + List *formatOptions, DataFileSchema * schema, - int64 *totalRowCount); + List *leafFields); +extern PGDLLEXPORT ColumnStatsCollector *WriteQueryResultTo(char *query, + char *destinationPath, + CopyDataFormat destinationFormat, + CopyDataCompression destinationCompression, + List *formatOptions, + bool queryHasRowId, + DataFileSchema * schema, + TupleDesc queryTupleDesc, + List *leafFields); +extern PGDLLEXPORT void AppendFields(StringInfo map, DataFileSchema * schema); +extern PGDLLEXPORT ColumnStatsCollector *GetDataFileStatsListFromPGResult(PGresult *result, + List *leafFields, + DataFileSchema * schema); extern PGDLLEXPORT LeafField *FindLeafField(List *leafFieldList, int fieldId); extern PGDLLEXPORT bool ShouldSkipStatistics(LeafField * leafField); extern PGDLLEXPORT bool PGTypeRequiresConversionToIcebergString(Field * field, PGType pgType); diff --git a/pg_lake_engine/src/pgduck/delete_data.c b/pg_lake_engine/src/pgduck/delete_data.c index 5b4d8fdd..ee703fb8 100644 --- a/pg_lake_engine/src/pgduck/delete_data.c +++ b/pg_lake_engine/src/pgduck/delete_data.c @@ -47,7 +47,7 @@ static char *DeleteFromParquetQuery(char *sourceDataFilePath, * PerformDeleteFromParquet applies a deletion CSV file to a Parquet file * and writes the new Parquet file to destinationPath. 
*/ -void +ColumnStatsCollector * PerformDeleteFromParquet(char *sourcePath, List *positionDeleteFiles, char *deletionFilePath, @@ -55,7 +55,7 @@ PerformDeleteFromParquet(char *sourcePath, CopyDataCompression destinationCompression, DataFileSchema * schema, ReadDataStats * stats, - ColumnStatsCollector *statsCollector) + List *leafFields) { const char *remainderQuery = DeleteFromParquetQuery(sourcePath, positionDeleteFiles, deletionFilePath, schema, stats); @@ -98,6 +98,7 @@ PerformDeleteFromParquet(char *sourcePath, appendStringInfoString(&command, ")"); PGDuckConnection *pgDuckConn = GetPGDuckConnection(); + ColumnStatsCollector *result = NULL; PG_TRY(); { @@ -105,8 +106,7 @@ PerformDeleteFromParquet(char *sourcePath, CheckPGDuckResult(pgDuckConn, result); - int64 rowsAffected; - *statsCollector->dataFileStats = GetDataFileStatsListFromPGResult(result, statsCollector->leafFields, schema, &rowsAffected); + result = GetDataFileStatsListFromPGResult(result, leafFields, schema); PQclear(result); } @@ -115,6 +115,8 @@ PerformDeleteFromParquet(char *sourcePath, ReleasePGDuckConnection(pgDuckConn); } PG_END_TRY(); + + return result; } diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index e7e9502e..95d836c6 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -74,14 +74,14 @@ int DefaultParquetVersion = PARQUET_VERSION_V1; * * The CSV was generated using COPY ... TO '' */ -void +ColumnStatsCollector * ConvertCSVFileTo(char *csvFilePath, TupleDesc csvTupleDesc, int maxLineSize, char *destinationPath, CopyDataFormat destinationFormat, CopyDataCompression destinationCompression, List *formatOptions, DataFileSchema * schema, - ColumnStatsCollector * statsCollector) + List *leafFields) { StringInfoData command; @@ -144,15 +144,15 @@ ConvertCSVFileTo(char *csvFilePath, TupleDesc csvTupleDesc, int maxLineSize, bool queryHasRowIds = false; - WriteQueryResultTo(command.data, - destinationPath, - destinationFormat, - destinationCompression, - formatOptions, - queryHasRowIds, - schema, - csvTupleDesc, - statsCollector); + return WriteQueryResultTo(command.data, + destinationPath, + destinationFormat, + destinationCompression, + formatOptions, + queryHasRowIds, + schema, + csvTupleDesc, + leafFields); } @@ -161,7 +161,7 @@ ConvertCSVFileTo(char *csvFilePath, TupleDesc csvTupleDesc, int maxLineSize, * destinationPath. There may be multiple files if file_size_bytes * is specified in formatOptions. */ -int64 +ColumnStatsCollector * WriteQueryResultTo(char *query, char *destinationPath, CopyDataFormat destinationFormat, @@ -170,7 +170,7 @@ WriteQueryResultTo(char *query, bool queryHasRowId, DataFileSchema * schema, TupleDesc queryTupleDesc, - ColumnStatsCollector * statsCollector) + List *leafFields) { StringInfoData command; @@ -403,9 +403,9 @@ WriteQueryResultTo(char *query, appendStringInfoString(&command, ")"); PGDuckConnection *pgDuckConn = GetPGDuckConnection(); - int64 rowsAffected = -1; PGresult *result; bool disablePreserveInsertionOrder = TargetRowGroupSizeMB > 0; + ColumnStatsCollector *statsCollector = NULL; PG_TRY(); { @@ -419,16 +419,17 @@ WriteQueryResultTo(char *query, result = ExecuteQueryOnPGDuckConnection(pgDuckConn, command.data); CheckPGDuckResult(pgDuckConn, result); - if (destinationFormat == DATA_FORMAT_PARQUET && statsCollector != NULL && statsCollector->dataFileStats != NULL) + if (destinationFormat == DATA_FORMAT_PARQUET && leafFields != NIL) { /* DuckDB returns COPY 0 when return_stats is used. 
*/ - *statsCollector->dataFileStats = GetDataFileStatsListFromPGResult(result, statsCollector->leafFields, schema, &rowsAffected); + statsCollector = GetDataFileStatsListFromPGResult(result, leafFields, schema); } else { char *commandTuples = PQcmdTuples(result); - - rowsAffected = atol(commandTuples); + statsCollector = palloc0(sizeof(ColumnStatsCollector)); + statsCollector->totalRowCount = atoll(commandTuples); + statsCollector->dataFileStats = NIL; } PQclear(result); @@ -446,7 +447,7 @@ WriteQueryResultTo(char *query, } PG_END_TRY(); - return rowsAffected; + return statsCollector; } @@ -456,15 +457,14 @@ WriteQueryResultTo(char *query, * * It also returns the total row count via totalRowCount output parameter. */ -List * -GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSchema * schema, int64 *totalRowCount) +ColumnStatsCollector * +GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSchema * schema) { List *statsList = NIL; int resultRowCount = PQntuples(result); int resultColumnCount = PQnfields(result); - - *totalRowCount = 0; + int64 totalRowCount = 0; for (int resultRowIndex = 0; resultRowIndex < resultRowCount; resultRowIndex++) { @@ -491,7 +491,7 @@ GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSch else if (strcmp(resultColName, "count") == 0) { fileStats->rowCount = atoll(resultValue); - *totalRowCount += fileStats->rowCount; + totalRowCount += fileStats->rowCount; } else if (strcmp(resultColName, "filename") == 0) { @@ -502,7 +502,11 @@ GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSch statsList = lappend(statsList, fileStats); } - return statsList; + ColumnStatsCollector *statsResult = palloc0(sizeof(ColumnStatsCollector)); + statsResult->totalRowCount = totalRowCount; + statsResult->dataFileStats = statsList; + + return statsResult; } diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c index 2eff066f..a72a5017 100644 --- a/pg_lake_table/src/fdw/writable_table.c +++ b/pg_lake_table/src/fdw/writable_table.c @@ -259,26 +259,21 @@ PrepareCSVInsertion(Oid relationId, char *insertCSV, int64 rowCount, InsertInProgressFileRecordExtended(dataFilePrefix, isPrefix, deferDeletion); - List *dataFileStats = NIL; List *leafFields = GetLeafFieldsForTable(relationId); - ColumnStatsCollector columnStatsCollector = (ColumnStatsCollector) - { - .leafFields = leafFields, - .dataFileStats = &dataFileStats - }; /* convert insert file to a new file in table format */ - ConvertCSVFileTo(insertCSV, - tupleDescriptor, - maximumLineSize, - dataFilePrefix, - format, - compression, - options, - schema, - &columnStatsCollector); - - ApplyColumnStatsModeForAllFileStats(relationId, dataFileStats); + ColumnStatsCollector *statsCollector = + ConvertCSVFileTo(insertCSV, + tupleDescriptor, + maximumLineSize, + dataFilePrefix, + format, + compression, + options, + schema, + leafFields); + + ApplyColumnStatsModeForAllFileStats(relationId, statsCollector->dataFileStats); if (!splitFilesBySize) { @@ -298,13 +293,13 @@ PrepareCSVInsertion(Oid relationId, char *insertCSV, int64 rowCount, * files from in-progress */ if (isPrefix && deferDeletion) - ReplaceInProgressPrefixPathWithFullPaths(dataFilePrefix, GetDataFilePathsFromStatsList(dataFileStats)); + ReplaceInProgressPrefixPathWithFullPaths(dataFilePrefix, GetDataFilePathsFromStatsList(statsCollector->dataFileStats)); /* build a DataFileModification for each new data file */ List *modifications = NIL; ListCell 
*dataFileStatsCell = NULL; - foreach(dataFileStatsCell, dataFileStats) + foreach(dataFileStatsCell, statsCollector->dataFileStats) { DataFileStats *stats = lfirst(dataFileStatsCell); @@ -532,18 +527,12 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live ReadDataStats stats = {sourceRowCount, existingDeletedRowCount}; - List *dataFileStats = NIL; List *leafFields = GetLeafFieldsForTable(relationId); - ColumnStatsCollector columnStatsCollector = (ColumnStatsCollector) - { - .leafFields = leafFields, - .dataFileStats = &dataFileStats - }; - PerformDeleteFromParquet(sourcePath, existingPositionDeletes, - deleteFile, newDataFilePath, compression, - schema, &stats, &columnStatsCollector); + ColumnStatsCollector *statsCollector = PerformDeleteFromParquet(sourcePath, existingPositionDeletes, + deleteFile, newDataFilePath, compression, + schema, &stats, leafFields); - ApplyColumnStatsModeForAllFileStats(relationId, *columnStatsCollector.dataFileStats); + ApplyColumnStatsModeForAllFileStats(relationId, statsCollector->dataFileStats); int64 newRowCount = liveRowCount - deletedRowCount; @@ -559,7 +548,7 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live Partition *partition = GetDataFilePartition(relationId, transforms, sourcePath, &partitionSpecId); - List *newFileStatsList = *columnStatsCollector.dataFileStats; + List *newFileStatsList = statsCollector->dataFileStats; Assert(newFileStatsList != NIL); /* @@ -599,7 +588,7 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live /* write the deletion file */ ConvertCSVFileTo(deleteFile, deleteTupleDesc, -1, deletionFilePath, - DATA_FORMAT_PARQUET, compression, copyOptions, schema, NULL); + DATA_FORMAT_PARQUET, compression, copyOptions, schema, NIL); ereport(WriteLogLevel, (errmsg("adding deletion file %s with " INT64_FORMAT " rows ", deletionFilePath, deletedRowCount))); @@ -999,14 +988,8 @@ PrepareToAddQueryResultToTable(Oid relationId, char *readQuery, TupleDesc queryT InsertInProgressFileRecordExtended(newDataFilePath, isPrefix, deferDeletion); /* perform compaction */ - List *dataFileStats = NIL; List *leafFields = GetLeafFieldsForTable(relationId); - ColumnStatsCollector columnStatsCollector = (ColumnStatsCollector) - { - .leafFields = leafFields, - .dataFileStats = &dataFileStats - }; - int64 rowCount = + ColumnStatsCollector *statsCollector = WriteQueryResultTo(readQuery, newDataFilePath, properties.format, @@ -1015,9 +998,9 @@ PrepareToAddQueryResultToTable(Oid relationId, char *readQuery, TupleDesc queryT queryHasRowId, schema, queryTupleDesc, - &columnStatsCollector); + leafFields); - if (rowCount == 0) + if (statsCollector->totalRowCount == 0) { TimestampTz orphanedAt = GetCurrentTransactionStartTimestamp(); @@ -1028,14 +1011,14 @@ PrepareToAddQueryResultToTable(Oid relationId, char *readQuery, TupleDesc queryT return NIL; } - ApplyColumnStatsModeForAllFileStats(relationId, dataFileStats); + ApplyColumnStatsModeForAllFileStats(relationId, statsCollector->dataFileStats); /* find which files were generated */ List *newFiles = NIL; - List *newFileOps = GetNewFileOpsFromFileStats(relationId, dataFileStats, - partitionSpecId, partition, - rowCount, - isVerbose, &newFiles); + List *newFileOps = GetNewFileOpsFromFileStats(relationId, statsCollector->dataFileStats, + partitionSpecId, partition, + statsCollector->totalRowCount, + isVerbose, &newFiles); /* * when we defer deletion of in-progress files, we need to replace the From 
4c8a83ee31fa7856d93270b0ee4e7eaac76ecbae Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Wed, 7 Jan 2026 19:40:02 +0300 Subject: [PATCH 33/46] fixup Signed-off-by: Ahmet Gedemenli --- pg_lake_engine/src/pgduck/delete_data.c | 6 +++--- pg_lake_engine/src/pgduck/write_data.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pg_lake_engine/src/pgduck/delete_data.c b/pg_lake_engine/src/pgduck/delete_data.c index ee703fb8..5cca93d7 100644 --- a/pg_lake_engine/src/pgduck/delete_data.c +++ b/pg_lake_engine/src/pgduck/delete_data.c @@ -98,7 +98,7 @@ PerformDeleteFromParquet(char *sourcePath, appendStringInfoString(&command, ")"); PGDuckConnection *pgDuckConn = GetPGDuckConnection(); - ColumnStatsCollector *result = NULL; + ColumnStatsCollector *statsCollector = NULL; PG_TRY(); { @@ -106,7 +106,7 @@ PerformDeleteFromParquet(char *sourcePath, CheckPGDuckResult(pgDuckConn, result); - result = GetDataFileStatsListFromPGResult(result, leafFields, schema); + statsCollector = GetDataFileStatsListFromPGResult(result, leafFields, schema); PQclear(result); } @@ -116,7 +116,7 @@ PerformDeleteFromParquet(char *sourcePath, } PG_END_TRY(); - return result; + return statsCollector; } diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index 95d836c6..7b46a4f7 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -455,7 +455,7 @@ WriteQueryResultTo(char *query, * GetDataFileStatsListFromPGResult extracts DataFileStats list from the * given PGresult of COPY .. TO ... WITH (return_stats). * - * It also returns the total row count via totalRowCount output parameter. + * It returns the collector object that contains the total row count and data file statistics. */ ColumnStatsCollector * GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSchema * schema) From 3b9a2dfe11aa37dc32db68cfa28e099c2228b85c Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Thu, 8 Jan 2026 14:00:05 +0300 Subject: [PATCH 34/46] fixup Signed-off-by: Ahmet Gedemenli --- pg_lake_engine/src/pgduck/write_data.c | 10 +++++----- pg_lake_table/src/fdw/writable_table.c | 5 ++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index 7b46a4f7..a0d99188 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -419,7 +419,7 @@ WriteQueryResultTo(char *query, result = ExecuteQueryOnPGDuckConnection(pgDuckConn, command.data); CheckPGDuckResult(pgDuckConn, result); - if (destinationFormat == DATA_FORMAT_PARQUET && leafFields != NIL) + if (destinationFormat == DATA_FORMAT_PARQUET) { /* DuckDB returns COPY 0 when return_stats is used. 
*/ statsCollector = GetDataFileStatsListFromPGResult(result, leafFields, schema); @@ -502,11 +502,11 @@ GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSch statsList = lappend(statsList, fileStats); } - ColumnStatsCollector *statsResult = palloc0(sizeof(ColumnStatsCollector)); - statsResult->totalRowCount = totalRowCount; - statsResult->dataFileStats = statsList; + ColumnStatsCollector *statsCollector = palloc0(sizeof(ColumnStatsCollector)); + statsCollector->totalRowCount = totalRowCount; + statsCollector->dataFileStats = statsList; - return statsResult; + return statsCollector; } diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c index a72a5017..13c666f7 100644 --- a/pg_lake_table/src/fdw/writable_table.c +++ b/pg_lake_table/src/fdw/writable_table.c @@ -548,14 +548,13 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live Partition *partition = GetDataFilePartition(relationId, transforms, sourcePath, &partitionSpecId); - List *newFileStatsList = statsCollector->dataFileStats; - Assert(newFileStatsList != NIL); + Assert(statsCollector->dataFileStats != NIL); /* * while deleting from parquet, we do not add file_size_bytes option to COPY command, * so we can assume that we'll have only a single file. */ - DataFileStats *newFileStats = linitial(newFileStatsList); + DataFileStats *newFileStats = linitial(statsCollector->dataFileStats); /* store the new file in the metadata */ TableMetadataOperation *addOperation = From fad3ea544707ec753673906f46f879e1d5c9ad46 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Thu, 8 Jan 2026 14:30:20 +0300 Subject: [PATCH 35/46] Move CreateDataFileStatsForTable Signed-off-by: Ahmet Gedemenli --- pg_lake_table/src/fdw/multi_data_file_dest.c | 6 ++++++ pg_lake_table/src/fdw/writable_table.c | 5 +---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pg_lake_table/src/fdw/multi_data_file_dest.c b/pg_lake_table/src/fdw/multi_data_file_dest.c index eede316c..16e88880 100644 --- a/pg_lake_table/src/fdw/multi_data_file_dest.c +++ b/pg_lake_table/src/fdw/multi_data_file_dest.c @@ -24,6 +24,7 @@ #include "pg_lake/copy/copy_format.h" #include "pg_lake/csv/csv_options.h" #include "pg_lake/csv/csv_writer.h" +#include "pg_lake/fdw/data_file_stats.h" #include "pg_lake/fdw/data_files_catalog.h" #include "pg_lake/fdw/multi_data_file_dest.h" #include "pg_lake/fdw/writable_table.h" @@ -238,6 +239,11 @@ FlushChildDestReceiver(MultiDataFileUploadDestReceiver * self) { copyModification->fileStats = DeepCopyDataFileStats(modification->fileStats); } + else + { + copyModification->fileStats = + CreateDataFileStatsForTable(self->relationId, copyModification->insertFile, copyModification->insertedRowCount, 0, CONTENT_DATA); + } /* * If caller of dest receiver is assigning rowids itself, diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c index 13c666f7..f39c6744 100644 --- a/pg_lake_table/src/fdw/writable_table.c +++ b/pg_lake_table/src/fdw/writable_table.c @@ -172,10 +172,7 @@ ApplyInsertFile(Relation rel, char *insertFile, int64 rowCount, List *options = foreignTable->options; bool hasRowIds = GetBoolOption(options, "row_ids", false); - if (dataFileStats == NULL) - { - dataFileStats = CreateDataFileStatsForTable(relationId, insertFile, rowCount, 0, CONTENT_DATA); - } + Assert(dataFileStats != NULL); List *metadataOperations = NIL; From f7b993ae68efedcf79e553b84d302ae5073c2e59 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: 
Thu, 8 Jan 2026 15:10:56 +0300 Subject: [PATCH 36/46] Add ExecuteCopyCommandOnPGDuckConnection Signed-off-by: Ahmet Gedemenli --- .../include/pg_lake/pgduck/write_data.h | 5 ++++ pg_lake_engine/src/pgduck/delete_data.c | 25 ++++--------------- pg_lake_engine/src/pgduck/write_data.c | 23 +++++++++++++++-- 3 files changed, 31 insertions(+), 22 deletions(-) diff --git a/pg_lake_engine/include/pg_lake/pgduck/write_data.h b/pg_lake_engine/include/pg_lake/pgduck/write_data.h index 4cf38d41..9602b271 100644 --- a/pg_lake_engine/include/pg_lake/pgduck/write_data.h +++ b/pg_lake_engine/include/pg_lake/pgduck/write_data.h @@ -65,6 +65,11 @@ extern PGDLLEXPORT void AppendFields(StringInfo map, DataFileSchema * schema); extern PGDLLEXPORT ColumnStatsCollector *GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSchema * schema); +extern PGDLLEXPORT ColumnStatsCollector *ExecuteCopyCommandOnPGDuckConnection(char *copyCommand, + List *leafFields, + DataFileSchema * schema, + bool disablePreserveInsertionOrder, + CopyDataFormat destinationFormat); extern PGDLLEXPORT LeafField *FindLeafField(List *leafFieldList, int fieldId); extern PGDLLEXPORT bool ShouldSkipStatistics(LeafField * leafField); extern PGDLLEXPORT bool PGTypeRequiresConversionToIcebergString(Field * field, PGType pgType); diff --git a/pg_lake_engine/src/pgduck/delete_data.c b/pg_lake_engine/src/pgduck/delete_data.c index 5cca93d7..fa11be44 100644 --- a/pg_lake_engine/src/pgduck/delete_data.c +++ b/pg_lake_engine/src/pgduck/delete_data.c @@ -97,26 +97,11 @@ PerformDeleteFromParquet(char *sourcePath, /* end WITH options */ appendStringInfoString(&command, ")"); - PGDuckConnection *pgDuckConn = GetPGDuckConnection(); - ColumnStatsCollector *statsCollector = NULL; - - PG_TRY(); - { - PGresult *result = ExecuteQueryOnPGDuckConnection(pgDuckConn, command.data); - - CheckPGDuckResult(pgDuckConn, result); - - statsCollector = GetDataFileStatsListFromPGResult(result, leafFields, schema); - - PQclear(result); - } - PG_FINALLY(); - { - ReleasePGDuckConnection(pgDuckConn); - } - PG_END_TRY(); - - return statsCollector; + return ExecuteCopyCommandOnPGDuckConnection(command.data, + leafFields, + schema, + false, + DATA_FORMAT_PARQUET); } diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index a0d99188..734cf143 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -402,9 +402,28 @@ WriteQueryResultTo(char *query, /* end WITH options */ appendStringInfoString(&command, ")"); + bool disablePreserveInsertionOrder = TargetRowGroupSizeMB > 0; + return ExecuteCopyCommandOnPGDuckConnection(command.data, + leafFields, + schema, + disablePreserveInsertionOrder, + destinationFormat); +} + + +/* + * ExecuteCopyCommandOnPGDuckConnection executes the given COPY command on + * a PGDuck connection and returns a ColumnStatsCollector. 
+ */ +ColumnStatsCollector * +ExecuteCopyCommandOnPGDuckConnection(char *copyCommand, + List *leafFields, + DataFileSchema * schema, + bool disablePreserveInsertionOrder, + CopyDataFormat destinationFormat) +{ PGDuckConnection *pgDuckConn = GetPGDuckConnection(); PGresult *result; - bool disablePreserveInsertionOrder = TargetRowGroupSizeMB > 0; ColumnStatsCollector *statsCollector = NULL; PG_TRY(); @@ -416,7 +435,7 @@ WriteQueryResultTo(char *query, PQclear(result); } - result = ExecuteQueryOnPGDuckConnection(pgDuckConn, command.data); + result = ExecuteQueryOnPGDuckConnection(pgDuckConn, copyCommand); CheckPGDuckResult(pgDuckConn, result); if (destinationFormat == DATA_FORMAT_PARQUET) From cfb770c60558b9e781a22294d2408ff3cb647d03 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Thu, 8 Jan 2026 15:58:23 +0300 Subject: [PATCH 37/46] Handle modification in case stats is empty Signed-off-by: Ahmet Gedemenli --- pg_lake_table/src/fdw/writable_table.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c index f39c6744..438f2c48 100644 --- a/pg_lake_table/src/fdw/writable_table.c +++ b/pg_lake_table/src/fdw/writable_table.c @@ -272,16 +272,12 @@ PrepareCSVInsertion(Oid relationId, char *insertCSV, int64 rowCount, ApplyColumnStatsModeForAllFileStats(relationId, statsCollector->dataFileStats); - if (!splitFilesBySize) + if (!splitFilesBySize && statsCollector->dataFileStats == NIL) { - /* early return a single modification if not splitting files by size */ - DataFileModification *modification = palloc0(sizeof(DataFileModification)); - modification->type = ADD_DATA_FILE; - modification->insertFile = dataFilePrefix; - modification->insertedRowCount = rowCount; - modification->reservedRowIdStart = reservedRowIdStart; - - return list_make1(modification); + DataFileStats *stats = palloc0(sizeof(DataFileStats)); + stats->dataFilePath = dataFilePrefix; + stats->rowCount = rowCount; + statsCollector->dataFileStats = list_make1(stats); } /* From 5b297b12ee2b22d9e351c158fc285d7f50c5f961 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Thu, 8 Jan 2026 17:19:14 +0300 Subject: [PATCH 38/46] Move stats related logic to new file: data_file_stats.c Signed-off-by: Ahmet Gedemenli --- .../pg_lake/data_file/data_file_stats.h | 19 + .../include/pg_lake/pgduck/write_data.h | 18 +- .../src/data_file/data_file_stats.c | 522 ++++++++++++++++++ pg_lake_engine/src/pgduck/write_data.c | 494 ----------------- 4 files changed, 542 insertions(+), 511 deletions(-) create mode 100644 pg_lake_engine/src/data_file/data_file_stats.c diff --git a/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h b/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h index 355e58a9..4036f816 100644 --- a/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h +++ b/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h @@ -22,6 +22,7 @@ #include "datatype/timestamp.h" #include "pg_lake/parquet/leaf_field.h" +#include "pg_lake/pgduck/client.h" /* * DataFileColumnStats stores column statistics for a data file. 
@@ -64,4 +65,22 @@ typedef struct DataFileStats int64 rowIdStart; } DataFileStats; +typedef struct ColumnStatsCollector +{ + int64 totalRowCount; + List *dataFileStats; +} ColumnStatsCollector; + extern PGDLLEXPORT DataFileStats * DeepCopyDataFileStats(const DataFileStats * stats); +extern PGDLLEXPORT ColumnStatsCollector *GetDataFileStatsListFromPGResult(PGresult *result, + List *leafFields, + DataFileSchema * schema); +extern PGDLLEXPORT ColumnStatsCollector *ExecuteCopyCommandOnPGDuckConnection(char *copyCommand, + List *leafFields, + DataFileSchema * schema, + bool disablePreserveInsertionOrder, + CopyDataFormat destinationFormat); +extern PGDLLEXPORT LeafField *FindLeafField(List *leafFieldList, int fieldId); +extern PGDLLEXPORT bool ShouldSkipStatistics(LeafField * leafField); +extern PGDLLEXPORT bool PGTypeRequiresConversionToIcebergString(Field * field, PGType pgType); + diff --git a/pg_lake_engine/include/pg_lake/pgduck/write_data.h b/pg_lake_engine/include/pg_lake/pgduck/write_data.h index 9602b271..2b456805 100644 --- a/pg_lake_engine/include/pg_lake/pgduck/write_data.h +++ b/pg_lake_engine/include/pg_lake/pgduck/write_data.h @@ -20,6 +20,7 @@ #include "access/tupdesc.h" #include "libpq-fe.h" #include "pg_lake/copy/copy_format.h" +#include "pg_lake/data_file/data_file_stats.h" #include "pg_lake/parquet/field.h" #include "pg_lake/parquet/leaf_field.h" #include "nodes/pg_list.h" @@ -34,12 +35,6 @@ typedef enum ParquetVersion PARQUET_VERSION_V2 = 2 } ParquetVersion; -typedef struct ColumnStatsCollector -{ - int64 totalRowCount; - List *dataFileStats; -} ColumnStatsCollector; - /* pg_lake_table.default_parquet_version */ extern PGDLLEXPORT int DefaultParquetVersion; @@ -62,14 +57,3 @@ extern PGDLLEXPORT ColumnStatsCollector *WriteQueryResultTo(char *query, TupleDesc queryTupleDesc, List *leafFields); extern PGDLLEXPORT void AppendFields(StringInfo map, DataFileSchema * schema); -extern PGDLLEXPORT ColumnStatsCollector *GetDataFileStatsListFromPGResult(PGresult *result, - List *leafFields, - DataFileSchema * schema); -extern PGDLLEXPORT ColumnStatsCollector *ExecuteCopyCommandOnPGDuckConnection(char *copyCommand, - List *leafFields, - DataFileSchema * schema, - bool disablePreserveInsertionOrder, - CopyDataFormat destinationFormat); -extern PGDLLEXPORT LeafField *FindLeafField(List *leafFieldList, int fieldId); -extern PGDLLEXPORT bool ShouldSkipStatistics(LeafField * leafField); -extern PGDLLEXPORT bool PGTypeRequiresConversionToIcebergString(Field * field, PGType pgType); diff --git a/pg_lake_engine/src/data_file/data_file_stats.c b/pg_lake_engine/src/data_file/data_file_stats.c new file mode 100644 index 00000000..9f57d5ea --- /dev/null +++ b/pg_lake_engine/src/data_file/data_file_stats.c @@ -0,0 +1,522 @@ +/* + * Copyright 2025 Snowflake Inc. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+
+#include "postgres.h"
+
+#include "executor/executor.h"
+#include "pg_lake/data_file/data_files.h"
+#include "pg_lake/extensions/postgis.h"
+#include "pg_lake/pgduck/client.h"
+#include "pg_lake/pgduck/map.h"
+#include "utils/array.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+
+static void ParseDuckdbColumnMinMaxFromText(char *input, List **names, List **mins, List **maxs);
+static void ExtractMinMaxForAllColumns(Datum map, List **names, List **mins, List **maxs);
+static void ExtractMinMaxForColumn(Datum map, const char *colName, List **names, List **mins, List **maxs);
+static const char *UnescapeDoubleQuotes(const char *s);
+static List *GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields, DataFileSchema * schema);
+static int FindIndexInStringList(List *names, const char *targetName);
+
+/*
+ * ExecuteCopyCommandOnPGDuckConnection executes the given COPY command on
+ * a PGDuck connection and returns a ColumnStatsCollector.
+ */
+ColumnStatsCollector *
+ExecuteCopyCommandOnPGDuckConnection(char *copyCommand,
+		List *leafFields,
+		DataFileSchema * schema,
+		bool disablePreserveInsertionOrder,
+		CopyDataFormat destinationFormat)
+{
+	PGDuckConnection *pgDuckConn = GetPGDuckConnection();
+	PGresult *result;
+	ColumnStatsCollector *statsCollector = NULL;
+
+	PG_TRY();
+	{
+		if (disablePreserveInsertionOrder)
+		{
+			result = ExecuteQueryOnPGDuckConnection(pgDuckConn, "SET preserve_insertion_order TO 'false';");
+			CheckPGDuckResult(pgDuckConn, result);
+			PQclear(result);
+		}
+
+		result = ExecuteQueryOnPGDuckConnection(pgDuckConn, copyCommand);
+		CheckPGDuckResult(pgDuckConn, result);
+
+		if (destinationFormat == DATA_FORMAT_PARQUET)
+		{
+			/* DuckDB returns COPY 0 when return_stats is used. */
+			statsCollector = GetDataFileStatsListFromPGResult(result, leafFields, schema);
+		}
+		else
+		{
+			char *commandTuples = PQcmdTuples(result);
+			statsCollector = palloc0(sizeof(ColumnStatsCollector));
+			statsCollector->totalRowCount = atoll(commandTuples);
+			statsCollector->dataFileStats = NIL;
+		}
+
+		PQclear(result);
+
+		if (disablePreserveInsertionOrder)
+		{
+			result = ExecuteQueryOnPGDuckConnection(pgDuckConn, "RESET preserve_insertion_order;");
+			CheckPGDuckResult(pgDuckConn, result);
+			PQclear(result);
+		}
+	}
+	PG_FINALLY();
+	{
+		ReleasePGDuckConnection(pgDuckConn);
+	}
+	PG_END_TRY();
+
+	return statsCollector;
+}
+
+
+/*
+ * GetDataFileStatsListFromPGResult extracts DataFileStats list from the
+ * given PGresult of COPY .. TO ... WITH (return_stats).
+ *
+ * It returns the collector object that contains the total row count and data file statistics.
+ */
+ColumnStatsCollector *
+GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSchema * schema)
+{
+	List *statsList = NIL;
+
+	int resultRowCount = PQntuples(result);
+	int resultColumnCount = PQnfields(result);
+	int64 totalRowCount = 0;
+
+	for (int resultRowIndex = 0; resultRowIndex < resultRowCount; resultRowIndex++)
+	{
+		DataFileStats *fileStats = palloc0(sizeof(DataFileStats));
+
+		for (int resultColIndex = 0; resultColIndex < resultColumnCount; resultColIndex++)
+		{
+			char *resultColName = PQfname(result, resultColIndex);
+			char *resultValue = PQgetvalue(result, resultRowIndex, resultColIndex);
+
+			if (schema != NULL && strcmp(resultColName, "column_statistics") == 0)
+			{
+				List *names = NIL;
+				List *mins = NIL;
+				List *maxs = NIL;
+
+				ParseDuckdbColumnMinMaxFromText(resultValue, &names, &mins, &maxs);
+				fileStats->columnStats = GetDataFileColumnStatsList(names, mins, maxs, leafFields, schema);
+			}
+			else if (strcmp(resultColName, "file_size_bytes") == 0)
+			{
+				fileStats->fileSize = atoll(resultValue);
+			}
+			else if (strcmp(resultColName, "count") == 0)
+			{
+				fileStats->rowCount = atoll(resultValue);
+				totalRowCount += fileStats->rowCount;
+			}
+			else if (strcmp(resultColName, "filename") == 0)
+			{
+				fileStats->dataFilePath = pstrdup(resultValue);
+			}
+		}
+
+		statsList = lappend(statsList, fileStats);
+	}
+
+	ColumnStatsCollector *statsCollector = palloc0(sizeof(ColumnStatsCollector));
+	statsCollector->totalRowCount = totalRowCount;
+	statsCollector->dataFileStats = statsList;
+
+	return statsCollector;
+}
+
+
+/*
+ * ExtractMinMaxForColumn extracts the min and max values for a single column
+ * from the given stats map of type map(varchar,varchar).
+ */
+static void
+ExtractMinMaxForColumn(Datum map, const char *colName, List **names, List **mins, List **maxs)
+{
+	ArrayType *elementsArray = DatumGetArrayTypeP(map);
+
+	if (elementsArray == NULL)
+		return;
+
+	uint32 numElements = ArrayGetNItems(ARR_NDIM(elementsArray), ARR_DIMS(elementsArray));
+
+	if (numElements == 0)
+		return;
+
+	char *minText = NULL;
+	char *maxText = NULL;
+
+	ArrayIterator arrayIterator = array_create_iterator(elementsArray, 0, NULL);
+	Datum elemDatum;
+	bool isNull = false;
+
+	while (array_iterate(arrayIterator, &elemDatum, &isNull))
+	{
+		if (isNull)
+			continue;
+
+		HeapTupleHeader tupleHeader = DatumGetHeapTupleHeader(elemDatum);
+		bool statsKeyIsNull = false;
+		bool statsValIsNull = false;
+
+		Datum statsKeyDatum = GetAttributeByNum(tupleHeader, 1, &statsKeyIsNull);
+		Datum statsValDatum = GetAttributeByNum(tupleHeader, 2, &statsValIsNull);
+
+		/* skip entries without a key or value */
+		if (statsKeyIsNull || statsValIsNull)
+			continue;
+
+		char *statsKey = TextDatumGetCString(statsKeyDatum);
+
+		if (strcmp(statsKey, "min") == 0)
+		{
+			Assert(minText == NULL);
+			minText = TextDatumGetCString(statsValDatum);
+		}
+		else if (strcmp(statsKey, "max") == 0)
+		{
+			Assert(maxText == NULL);
+			maxText = TextDatumGetCString(statsValDatum);
+		}
+	}
+
+	if (minText != NULL && maxText != NULL)
+	{
+		*names = lappend(*names, pstrdup(colName));
+		*mins = lappend(*mins, minText);
+		*maxs = lappend(*maxs, maxText);
+	}
+
+	array_free_iterator(arrayIterator);
+}
+
+
+/*
+ * UnescapeDoubleQuotes unescapes any doubled quotes.
+ * e.g. "ab\"\"cd\"\"ee" becomes "ab\"cd\"ee"
"ab\"\"cd\"\"ee" becomes "ab\"cd\"ee" + */ +static const char * +UnescapeDoubleQuotes(const char *s) +{ + if (s == NULL) + return NULL; + + char doubleQuote = '"'; + + int len = strlen(s); + + if (len >= 2 && (s[0] == doubleQuote && s[len - 1] == doubleQuote)) + { + /* Allocate worst-case length (without surrounding quotes) + 1 */ + char *out = palloc((len - 1) * sizeof(char)); + int oi = 0; + + for (int i = 1; i < len - 1; i++) + { + /* Handle "" */ + if (s[i] == doubleQuote && i + 1 < len - 1 && s[i + 1] == doubleQuote) + { + out[oi++] = doubleQuote; + i++; /* skip the doubled quote */ + } + else + { + out[oi++] = s[i]; + } + } + + out[oi] = '\0'; + return out; + } + + return s; +} + + +/* + * ExtractMinMaxFromStatsMapDatum extracts min and max values from given stats map + * of type map(text,text). + */ +static void +ExtractMinMaxForAllColumns(Datum map, List **names, List **mins, List **maxs) +{ + ArrayType *elementsArray = DatumGetArrayTypeP(map); + + if (elementsArray == NULL) + return; + + uint32 numElements = ArrayGetNItems(ARR_NDIM(elementsArray), ARR_DIMS(elementsArray)); + + if (numElements == 0) + return; + + ArrayIterator arrayIterator = array_create_iterator(elementsArray, 0, NULL); + Datum elemDatum; + bool isNull = false; + + while (array_iterate(arrayIterator, &elemDatum, &isNull)) + { + if (isNull) + continue; + + HeapTupleHeader tupleHeader = DatumGetHeapTupleHeader(elemDatum); + bool colNameIsNull = false; + bool colStatsIsNull = false; + + Datum colNameDatum = GetAttributeByNum(tupleHeader, 1, &colNameIsNull); + Datum colStatsDatum = GetAttributeByNum(tupleHeader, 2, &colStatsIsNull); + + /* skip entries without a key or value */ + if (colNameIsNull || colStatsIsNull) + continue; + + char *colName = TextDatumGetCString(colNameDatum); + + /* + * pg_map text key is escaped for double quotes. We need to unescape + * them. + */ + const char *unescapedColName = UnescapeDoubleQuotes(colName); + + ExtractMinMaxForColumn(colStatsDatum, unescapedColName, names, mins, maxs); + } + + array_free_iterator(arrayIterator); +} + + +/* + * ParseDuckdbColumnMinMaxFromText parses COPY .. TO .parquet WITH (return_stats) + * output text to map(text, map(text,text)). + * e.g. { 'id_col' => {'min' => '12', 'max' => 23, ...}, + * 'name_col' => {'min' => 'aykut', 'max' => 'onder', ...}, + * ... + * } + */ +static void +ParseDuckdbColumnMinMaxFromText(char *input, List **names, List **mins, List **maxs) +{ + /* + * e.g. { 'id_col' => {'min' => '12', 'max' => 23, ...}, 'name_col' => + * {'min' => 'aykut', 'max' => 'onder', ...}, ... } + */ + Oid returnStatsMapId = GetOrCreatePGMapType("MAP(TEXT,MAP(TEXT,TEXT))"); + + if (returnStatsMapId == InvalidOid) + ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("unexpected return_stats result %s", input))); + + /* parse result into map above */ + Oid typinput; + Oid typioparam; + + getTypeInputInfo(returnStatsMapId, &typinput, &typioparam); + + Datum statsMapDatum = OidInputFunctionCall(typinput, input, typioparam, -1); + + /* + * extract min and max for each column: iterate the underlying map datum + * directly to avoid invoking the set-returning `entries()` function in a + * non-SRF context. + */ + ExtractMinMaxForAllColumns(statsMapDatum, names, mins, maxs); +} + + +/* + * GetDataFileColumnStatsList builds DataFileColumnStats list from given + * names, mins, maxs lists and schema. 
+ */ +static List * +GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields, DataFileSchema * schema) +{ + List *columnStatsList = NIL; + + Assert(schema != NULL); + for (int fieldIndex = 0; fieldIndex < schema->nfields; fieldIndex++) + { + DataFileSchemaField *field = &schema->fields[fieldIndex]; + const char *fieldName = field->name; + int fieldId = field->id; + + int nameIndex = FindIndexInStringList(names, fieldName); + if (nameIndex == -1) + { + ereport(DEBUG3, (errmsg("field with name %s not found in stats output, skipping", fieldName))); + continue; + } + + LeafField *leafField = FindLeafField(leafFields, fieldId); + + if (leafField == NULL) + { + ereport(DEBUG3, (errmsg("leaf field with id %d not found in leaf fields, skipping", fieldId))); + continue; + } + else if(ShouldSkipStatistics(leafField)) + { + ereport(DEBUG3, (errmsg("skipping statistics for field with id %d", fieldId))); + continue; + } + + char *minStr = list_nth(mins, nameIndex); + char *maxStr = list_nth(maxs, nameIndex); + + DataFileColumnStats *colStats = palloc0(sizeof(DataFileColumnStats)); + + colStats->leafField = *leafField; + colStats->lowerBoundText = pstrdup(minStr); + colStats->upperBoundText = pstrdup(maxStr); + columnStatsList = lappend(columnStatsList, colStats); + } + + return columnStatsList; +} + + +/* +* FindLeafField finds the leaf field with the given fieldId. +*/ +LeafField * +FindLeafField(List *leafFieldList, int fieldId) +{ + ListCell *cell = NULL; + foreach(cell, leafFieldList) + { + LeafField *leafField = (LeafField *) lfirst(cell); + + if (leafField->fieldId == fieldId) + { + return leafField; + } + } + + return NULL; +} + + +/* + * FindIndexInStringList finds the index of targetName in names list. + * Returns -1 if not found. + */ +static int +FindIndexInStringList(List *names, const char *targetName) +{ + for(int index = 0; index < list_length(names); index++) + { + if (strcmp(list_nth(names, index), targetName) == 0) + { + return index; + } + } + + return -1; +} + + +/* +* ShouldSkipStatistics returns true if the statistics should be skipped for the +* given leaf field. +*/ +bool +ShouldSkipStatistics(LeafField * leafField) +{ + Field *field = leafField->field; + PGType pgType = leafField->pgType; + + Oid pgTypeOid = pgType.postgresTypeOid; + + if (PGTypeRequiresConversionToIcebergString(field, pgType)) + { + if (!(pgTypeOid == VARCHAROID || pgTypeOid == BPCHAROID || + pgTypeOid == CHAROID)) + { + /* + * Although there are no direct equivalents of these types on + * Iceberg, it is pretty safe to support pruning on these types. + */ + return true; + } + } + else if (pgTypeOid == BYTEAOID) + { + /* + * parquet_metadata function sometimes returns a varchar repr of blob, + * which cannot be properly deserialized by Postgres. (when there is + * "\" or nonprintable chars in the blob ) See issue Old repo: + * issues/957 + */ + return true; + } + else if (pgTypeOid == UUIDOID) + { + /* + * DuckDB does not keep statistics for UUID type. We should skip + * statistics for UUID type. + */ + return true; + } + else if (leafField->level != 1) + { + /* + * We currently do not support pruning on array, map and composite + * types. So there's no need to collect stats for them. Note that + * in the past we did collect, and have some tests commented out, + * such as skippedtest_pg_lake_iceberg_table_complex_values. 
+ */ + return true; + } + + return false; +} + + +/* + * PGTypeRequiresConversionToIcebergString returns true if the given Postgres type + * requires conversion to Iceberg string. + * Some of the Postgres types cannot be directly mapped to an Iceberg type. + * e.g. custom types like hstore + */ +bool +PGTypeRequiresConversionToIcebergString(Field * field, PGType pgType) +{ + /* + * We treat geometry as binary within the Iceberg schema, which is encoded + * as a hexadecimal string according to the spec. As it happens, the + * Postgres output function of geometry produces a hexadecimal WKB string, + * so we can use the regular text output function to convert to an Iceberg + * value. + */ + if (IsGeometryTypeId(pgType.postgresTypeOid)) + { + return true; + } + + return strcmp(field->field.scalar.typeName, "string") == 0 && pgType.postgresTypeOid != TEXTOID; +} diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index 734cf143..b52834b2 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -53,12 +53,6 @@ static DuckDBTypeInfo ChooseDuckDBEngineTypeForWrite(PGType postgresType, CopyDataFormat destinationFormat); static void AppendFieldIdValue(StringInfo map, Field * field, int fieldId); static const char *ParquetVersionToString(ParquetVersion version); -static void ParseDuckdbColumnMinMaxFromText(char *input, List **names, List **mins, List **maxs); -static void ExtractMinMaxForAllColumns(Datum map, List **names, List **mins, List **maxs); -static void ExtractMinMaxForColumn(Datum map, const char *colName, List **names, List **mins, List **maxs); -static const char *UnescapeDoubleQuotes(const char *s); -static List *GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields, DataFileSchema * schema); -static int FindIndexInStringList(List *names, const char *targetName); static DuckDBTypeInfo VARCHAR_TYPE = { @@ -411,494 +405,6 @@ WriteQueryResultTo(char *query, } -/* - * ExecuteCopyCommandOnPGDuckConnection executes the given COPY command on - * a PGDuck connection and returns a ColumnStatsCollector. - */ -ColumnStatsCollector * -ExecuteCopyCommandOnPGDuckConnection(char *copyCommand, - List *leafFields, - DataFileSchema * schema, - bool disablePreserveInsertionOrder, - CopyDataFormat destinationFormat) -{ - PGDuckConnection *pgDuckConn = GetPGDuckConnection(); - PGresult *result; - ColumnStatsCollector *statsCollector = NULL; - - PG_TRY(); - { - if (disablePreserveInsertionOrder) - { - result = ExecuteQueryOnPGDuckConnection(pgDuckConn, "SET preserve_insertion_order TO 'false';"); - CheckPGDuckResult(pgDuckConn, result); - PQclear(result); - } - - result = ExecuteQueryOnPGDuckConnection(pgDuckConn, copyCommand); - CheckPGDuckResult(pgDuckConn, result); - - if (destinationFormat == DATA_FORMAT_PARQUET) - { - /* DuckDB returns COPY 0 when return_stats is used. 
*/ - statsCollector = GetDataFileStatsListFromPGResult(result, leafFields, schema); - } - else - { - char *commandTuples = PQcmdTuples(result); - statsCollector = palloc0(sizeof(ColumnStatsCollector)); - statsCollector->totalRowCount = atoll(commandTuples); - statsCollector->dataFileStats = NIL; - } - - PQclear(result); - - if (disablePreserveInsertionOrder) - { - result = ExecuteQueryOnPGDuckConnection(pgDuckConn, "RESET preserve_insertion_order;"); - CheckPGDuckResult(pgDuckConn, result); - PQclear(result); - } - } - PG_FINALLY(); - { - ReleasePGDuckConnection(pgDuckConn); - } - PG_END_TRY(); - - return statsCollector; -} - - -/* - * GetDataFileStatsListFromPGResult extracts DataFileStats list from the - * given PGresult of COPY .. TO ... WITH (return_stats). - * - * It returns the collector object that contains the total row count and data file statistics. - */ -ColumnStatsCollector * -GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSchema * schema) -{ - List *statsList = NIL; - - int resultRowCount = PQntuples(result); - int resultColumnCount = PQnfields(result); - int64 totalRowCount = 0; - - for (int resultRowIndex = 0; resultRowIndex < resultRowCount; resultRowIndex++) - { - DataFileStats *fileStats = palloc0(sizeof(DataFileStats)); - - for (int resultColIndex = 0; resultColIndex < resultColumnCount; resultColIndex++) - { - char *resultColName = PQfname(result, resultColIndex); - char *resultValue = PQgetvalue(result, resultRowIndex, resultColIndex); - - if (schema != NULL && strcmp(resultColName, "column_statistics") == 0) - { - List *names = NIL; - List *mins = NIL; - List *maxs = NIL; - - ParseDuckdbColumnMinMaxFromText(resultValue, &names, &mins, &maxs); - fileStats->columnStats = GetDataFileColumnStatsList(names, mins, maxs, leafFields, schema); - } - else if (strcmp(resultColName, "file_size_bytes") == 0) - { - fileStats->fileSize = atoll(resultValue); - } - else if (strcmp(resultColName, "count") == 0) - { - fileStats->rowCount = atoll(resultValue); - totalRowCount += fileStats->rowCount; - } - else if (strcmp(resultColName, "filename") == 0) - { - fileStats->dataFilePath = pstrdup(resultValue); - } - } - - statsList = lappend(statsList, fileStats); - } - - ColumnStatsCollector *statsCollector = palloc0(sizeof(ColumnStatsCollector)); - statsCollector->totalRowCount = totalRowCount; - statsCollector->dataFileStats = statsList; - - return statsCollector; -} - - -/* - * ExtractMinMaxFromStatsMapDatum extracts min and max values from given stats map - * of type map(varchar,varchar). 
- */ -static void -ExtractMinMaxForColumn(Datum map, const char *colName, List **names, List **mins, List **maxs) -{ - ArrayType *elementsArray = DatumGetArrayTypeP(map); - - if (elementsArray == NULL) - return; - - uint32 numElements = ArrayGetNItems(ARR_NDIM(elementsArray), ARR_DIMS(elementsArray)); - - if (numElements == 0) - return; - - char *minText = NULL; - char *maxText = NULL; - - ArrayIterator arrayIterator = array_create_iterator(elementsArray, 0, NULL); - Datum elemDatum; - bool isNull = false; - - while (array_iterate(arrayIterator, &elemDatum, &isNull)) - { - if (isNull) - continue; - - HeapTupleHeader tupleHeader = DatumGetHeapTupleHeader(elemDatum); - bool statsKeyIsNull = false; - bool statsValIsNull = false; - - Datum statsKeyDatum = GetAttributeByNum(tupleHeader, 1, &statsKeyIsNull); - Datum statsValDatum = GetAttributeByNum(tupleHeader, 2, &statsValIsNull); - - /* skip entries without a key or value */ - if (statsKeyIsNull || statsValIsNull) - continue; - - char *statsKey = TextDatumGetCString(statsKeyDatum); - - if (strcmp(statsKey, "min") == 0) - { - Assert(minText == NULL); - minText = TextDatumGetCString(statsValDatum); - } - else if (strcmp(statsKey, "max") == 0) - { - Assert(maxText == NULL); - maxText = TextDatumGetCString(statsValDatum); - } - } - - if (minText != NULL && maxText != NULL) - { - *names = lappend(*names, pstrdup(colName)); - *mins = lappend(*mins, minText); - *maxs = lappend(*maxs, maxText); - } - - array_free_iterator(arrayIterator); -} - - -/* - * UnescapeDoubleQuotes unescapes any doubled quotes. - * e.g. "ab\"\"cd\"\"ee" becomes "ab\"cd\"ee" - */ -static const char * -UnescapeDoubleQuotes(const char *s) -{ - if (s == NULL) - return NULL; - - char doubleQuote = '"'; - - int len = strlen(s); - - if (len >= 2 && (s[0] == doubleQuote && s[len - 1] == doubleQuote)) - { - /* Allocate worst-case length (without surrounding quotes) + 1 */ - char *out = palloc((len - 1) * sizeof(char)); - int oi = 0; - - for (int i = 1; i < len - 1; i++) - { - /* Handle "" */ - if (s[i] == doubleQuote && i + 1 < len - 1 && s[i + 1] == doubleQuote) - { - out[oi++] = doubleQuote; - i++; /* skip the doubled quote */ - } - else - { - out[oi++] = s[i]; - } - } - - out[oi] = '\0'; - return out; - } - - return s; -} - - -/* - * ExtractMinMaxFromStatsMapDatum extracts min and max values from given stats map - * of type map(text,text). - */ -static void -ExtractMinMaxForAllColumns(Datum map, List **names, List **mins, List **maxs) -{ - ArrayType *elementsArray = DatumGetArrayTypeP(map); - - if (elementsArray == NULL) - return; - - uint32 numElements = ArrayGetNItems(ARR_NDIM(elementsArray), ARR_DIMS(elementsArray)); - - if (numElements == 0) - return; - - ArrayIterator arrayIterator = array_create_iterator(elementsArray, 0, NULL); - Datum elemDatum; - bool isNull = false; - - while (array_iterate(arrayIterator, &elemDatum, &isNull)) - { - if (isNull) - continue; - - HeapTupleHeader tupleHeader = DatumGetHeapTupleHeader(elemDatum); - bool colNameIsNull = false; - bool colStatsIsNull = false; - - Datum colNameDatum = GetAttributeByNum(tupleHeader, 1, &colNameIsNull); - Datum colStatsDatum = GetAttributeByNum(tupleHeader, 2, &colStatsIsNull); - - /* skip entries without a key or value */ - if (colNameIsNull || colStatsIsNull) - continue; - - char *colName = TextDatumGetCString(colNameDatum); - - /* - * pg_map text key is escaped for double quotes. We need to unescape - * them. 
- */ - const char *unescapedColName = UnescapeDoubleQuotes(colName); - - ExtractMinMaxForColumn(colStatsDatum, unescapedColName, names, mins, maxs); - } - - array_free_iterator(arrayIterator); -} - - -/* - * ParseDuckdbColumnMinMaxFromText parses COPY .. TO .parquet WITH (return_stats) - * output text to map(text, map(text,text)). - * e.g. { 'id_col' => {'min' => '12', 'max' => 23, ...}, - * 'name_col' => {'min' => 'aykut', 'max' => 'onder', ...}, - * ... - * } - */ -static void -ParseDuckdbColumnMinMaxFromText(char *input, List **names, List **mins, List **maxs) -{ - /* - * e.g. { 'id_col' => {'min' => '12', 'max' => 23, ...}, 'name_col' => - * {'min' => 'aykut', 'max' => 'onder', ...}, ... } - */ - Oid returnStatsMapId = GetOrCreatePGMapType("MAP(TEXT,MAP(TEXT,TEXT))"); - - if (returnStatsMapId == InvalidOid) - ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("unexpected return_stats result %s", input))); - - /* parse result into map above */ - Oid typinput; - Oid typioparam; - - getTypeInputInfo(returnStatsMapId, &typinput, &typioparam); - - Datum statsMapDatum = OidInputFunctionCall(typinput, input, typioparam, -1); - - /* - * extract min and max for each column: iterate the underlying map datum - * directly to avoid invoking the set-returning `entries()` function in a - * non-SRF context. - */ - ExtractMinMaxForAllColumns(statsMapDatum, names, mins, maxs); -} - - -/* - * GetDataFileColumnStatsList builds DataFileColumnStats list from given - * names, mins, maxs lists and schema. - */ -static List * -GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields, DataFileSchema * schema) -{ - List *columnStatsList = NIL; - - Assert(schema != NULL); - for (int fieldIndex = 0; fieldIndex < schema->nfields; fieldIndex++) - { - DataFileSchemaField *field = &schema->fields[fieldIndex]; - const char *fieldName = field->name; - int fieldId = field->id; - - int nameIndex = FindIndexInStringList(names, fieldName); - if (nameIndex == -1) - { - ereport(DEBUG3, (errmsg("field with name %s not found in stats output, skipping", fieldName))); - continue; - } - - LeafField *leafField = FindLeafField(leafFields, fieldId); - - if (leafField == NULL) - { - ereport(DEBUG3, (errmsg("leaf field with id %d not found in leaf fields, skipping", fieldId))); - continue; - } - else if(ShouldSkipStatistics(leafField)) - { - ereport(DEBUG3, (errmsg("skipping statistics for field with id %d", fieldId))); - continue; - } - - char *minStr = list_nth(mins, nameIndex); - char *maxStr = list_nth(maxs, nameIndex); - - DataFileColumnStats *colStats = palloc0(sizeof(DataFileColumnStats)); - - colStats->leafField = *leafField; - colStats->lowerBoundText = pstrdup(minStr); - colStats->upperBoundText = pstrdup(maxStr); - columnStatsList = lappend(columnStatsList, colStats); - } - - return columnStatsList; -} - - -/* -* FindLeafField finds the leaf field with the given fieldId. -*/ -LeafField * -FindLeafField(List *leafFieldList, int fieldId) -{ - ListCell *cell = NULL; - foreach(cell, leafFieldList) - { - LeafField *leafField = (LeafField *) lfirst(cell); - - if (leafField->fieldId == fieldId) - { - return leafField; - } - } - - return NULL; -} - - -/* - * FindIndexInStringList finds the index of targetName in names list. - * Returns -1 if not found. 
- */ -static int -FindIndexInStringList(List *names, const char *targetName) -{ - for(int index = 0; index < list_length(names); index++) - { - if (strcmp(list_nth(names, index), targetName) == 0) - { - return index; - } - } - - return -1; -} - - -/* -* ShouldSkipStatistics returns true if the statistics should be skipped for the -* given leaf field. -*/ -bool -ShouldSkipStatistics(LeafField * leafField) -{ - Field *field = leafField->field; - PGType pgType = leafField->pgType; - - Oid pgTypeOid = pgType.postgresTypeOid; - - if (PGTypeRequiresConversionToIcebergString(field, pgType)) - { - if (!(pgTypeOid == VARCHAROID || pgTypeOid == BPCHAROID || - pgTypeOid == CHAROID)) - { - /* - * Although there are no direct equivalents of these types on - * Iceberg, it is pretty safe to support pruning on these types. - */ - return true; - } - } - else if (pgTypeOid == BYTEAOID) - { - /* - * parquet_metadata function sometimes returns a varchar repr of blob, - * which cannot be properly deserialized by Postgres. (when there is - * "\" or nonprintable chars in the blob ) See issue Old repo: - * issues/957 - */ - return true; - } - else if (pgTypeOid == UUIDOID) - { - /* - * DuckDB does not keep statistics for UUID type. We should skip - * statistics for UUID type. - */ - return true; - } - else if (leafField->level != 1) - { - /* - * We currently do not support pruning on array, map and composite - * types. So there's no need to collect stats for them. Note that - * in the past we did collect, and have some tests commented out, - * such as skippedtest_pg_lake_iceberg_table_complex_values. - */ - return true; - } - - return false; -} - - -/* - * PGTypeRequiresConversionToIcebergString returns true if the given Postgres type - * requires conversion to Iceberg string. - * Some of the Postgres types cannot be directly mapped to an Iceberg type. - * e.g. custom types like hstore - */ -bool -PGTypeRequiresConversionToIcebergString(Field * field, PGType pgType) -{ - /* - * We treat geometry as binary within the Iceberg schema, which is encoded - * as a hexadecimal string according to the spec. As it happens, the - * Postgres output function of geometry produces a hexadecimal WKB string, - * so we can use the regular text output function to convert to an Iceberg - * value. - */ - if (IsGeometryTypeId(pgType.postgresTypeOid)) - { - return true; - } - - return strcmp(field->field.scalar.typeName, "string") == 0 && pgType.postgresTypeOid != TEXTOID; -} - - /* * TupleDescToProjectionList converts a PostgreSQL tuple descriptor to * projection list in string form that can be used for writes. 
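Note on the return_stats flow that the functions above implement: when a Parquet file is written with COPY .. TO .. WITH (FORMAT parquet, return_stats), DuckDB reports COPY 0 and instead returns one result row per written file. GetDataFileStatsListFromPGResult reads the filename, count, file_size_bytes and column_statistics columns of that result, and ParseDuckdbColumnMinMaxFromText parses the column_statistics text via pg_map as MAP(TEXT,MAP(TEXT,TEXT)), keeping only the per-column 'min'/'max' entries. A rough sketch of the shape being parsed (the SQL text and values below are illustrative only, not taken from the patch; the real COPY command is assembled in write_data.c):

    COPY (SELECT ...) TO 'data.parquet' WITH (FORMAT parquet, return_stats);

    filename     | count | file_size_bytes | column_statistics
    -------------+-------+-----------------+----------------------------------------------------
    data.parquet |  1000 |          123456 | { 'id_col' => {'min' => '1', 'max' => '1000', ...},
                 |       |                 |   'name_col' => {'min' => 'aaa', 'max' => 'zzz', ...} }

GetDataFileColumnStatsList then matches each reported column name against the table schema by field name, attaches the min/max text to the corresponding leaf field, and skips any field for which ShouldSkipStatistics returns true.
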
From 887ac716bfbf505928f1a32cbc6cfaccecdfc7b6 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Thu, 8 Jan 2026 17:33:51 +0300 Subject: [PATCH 39/46] Move field&leaf field functions Signed-off-by: Ahmet Gedemenli --- .../pg_lake/data_file/data_file_stats.h | 3 - .../include/pg_lake/parquet/field.h | 2 + .../include/pg_lake/parquet/leaf_field.h | 1 + .../src/data_file/data_file_stats.c | 57 ++----------------- pg_lake_engine/src/parquet/field.c | 26 +++++++++ pg_lake_engine/src/parquet/leaf_field.c | 21 +++++++ 6 files changed, 56 insertions(+), 54 deletions(-) diff --git a/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h b/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h index 4036f816..2915e378 100644 --- a/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h +++ b/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h @@ -80,7 +80,4 @@ extern PGDLLEXPORT ColumnStatsCollector *ExecuteCopyCommandOnPGDuckConnection(ch DataFileSchema * schema, bool disablePreserveInsertionOrder, CopyDataFormat destinationFormat); -extern PGDLLEXPORT LeafField *FindLeafField(List *leafFieldList, int fieldId); extern PGDLLEXPORT bool ShouldSkipStatistics(LeafField * leafField); -extern PGDLLEXPORT bool PGTypeRequiresConversionToIcebergString(Field * field, PGType pgType); - diff --git a/pg_lake_engine/include/pg_lake/parquet/field.h b/pg_lake_engine/include/pg_lake/parquet/field.h index ee23b47f..9e544e9d 100644 --- a/pg_lake_engine/include/pg_lake/parquet/field.h +++ b/pg_lake_engine/include/pg_lake/parquet/field.h @@ -34,6 +34,7 @@ #pragma once #include "nodes/pg_list.h" +#include "pg_lake/pgduck/type.h" /* * Reserved _row_id field ID used for Iceberg @@ -155,3 +156,4 @@ typedef FieldStructElement DataFileSchemaField; extern PGDLLEXPORT DataFileSchema * DeepCopyDataFileSchema(const DataFileSchema * schema); extern PGDLLEXPORT Field * DeepCopyField(const Field * field); +extern PGDLLEXPORT bool PGTypeRequiresConversionToIcebergString(Field * field, PGType pgType); diff --git a/pg_lake_engine/include/pg_lake/parquet/leaf_field.h b/pg_lake_engine/include/pg_lake/parquet/leaf_field.h index 624a41e7..d77e5b69 100644 --- a/pg_lake_engine/include/pg_lake/parquet/leaf_field.h +++ b/pg_lake_engine/include/pg_lake/parquet/leaf_field.h @@ -52,6 +52,7 @@ typedef struct LeafField extern PGDLLEXPORT int LeafFieldCompare(const ListCell *a, const ListCell *b); extern PGDLLEXPORT bool SchemaFieldsEquivalent(DataFileSchemaField * fieldA, DataFileSchemaField * fieldB); extern PGDLLEXPORT LeafField DeepCopyLeafField(const LeafField * leafField); +extern PGDLLEXPORT LeafField *FindLeafField(List *leafFieldList, int fieldId); #if PG_VERSION_NUM < 170000 extern PGDLLEXPORT int pg_cmp_s32(int32 a, int32 b); #endif diff --git a/pg_lake_engine/src/data_file/data_file_stats.c b/pg_lake_engine/src/data_file/data_file_stats.c index 9f57d5ea..094d799b 100644 --- a/pg_lake_engine/src/data_file/data_file_stats.c +++ b/pg_lake_engine/src/data_file/data_file_stats.c @@ -15,15 +15,11 @@ * limitations under the License. */ - /* - * ExecuteCopyCommandOnPGDuckConnection executes the given COPY command on - * a PGDuck connection and returns a ColumnStatsCollector. 
- */ - #include "postgres.h" #include "executor/executor.h" #include "pg_lake/data_file/data_files.h" +#include "pg_lake/data_file/data_file_stats.h" #include "pg_lake/extensions/postgis.h" #include "pg_lake/pgduck/client.h" #include "pg_lake/pgduck/map.h" @@ -38,6 +34,11 @@ static const char *UnescapeDoubleQuotes(const char *s); static List *GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields, DataFileSchema * schema); static int FindIndexInStringList(List *names, const char *targetName); + +/* + * ExecuteCopyCommandOnPGDuckConnection executes the given COPY command on + * a PGDuck connection and returns a ColumnStatsCollector. + */ ColumnStatsCollector * ExecuteCopyCommandOnPGDuckConnection(char *copyCommand, List *leafFields, @@ -400,27 +401,6 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields } -/* -* FindLeafField finds the leaf field with the given fieldId. -*/ -LeafField * -FindLeafField(List *leafFieldList, int fieldId) -{ - ListCell *cell = NULL; - foreach(cell, leafFieldList) - { - LeafField *leafField = (LeafField *) lfirst(cell); - - if (leafField->fieldId == fieldId) - { - return leafField; - } - } - - return NULL; -} - - /* * FindIndexInStringList finds the index of targetName in names list. * Returns -1 if not found. @@ -495,28 +475,3 @@ ShouldSkipStatistics(LeafField * leafField) return false; } - - -/* - * PGTypeRequiresConversionToIcebergString returns true if the given Postgres type - * requires conversion to Iceberg string. - * Some of the Postgres types cannot be directly mapped to an Iceberg type. - * e.g. custom types like hstore - */ -bool -PGTypeRequiresConversionToIcebergString(Field * field, PGType pgType) -{ - /* - * We treat geometry as binary within the Iceberg schema, which is encoded - * as a hexadecimal string according to the spec. As it happens, the - * Postgres output function of geometry produces a hexadecimal WKB string, - * so we can use the regular text output function to convert to an Iceberg - * value. - */ - if (IsGeometryTypeId(pgType.postgresTypeOid)) - { - return true; - } - - return strcmp(field->field.scalar.typeName, "string") == 0 && pgType.postgresTypeOid != TEXTOID; -} diff --git a/pg_lake_engine/src/parquet/field.c b/pg_lake_engine/src/parquet/field.c index b377f430..8acdfb45 100644 --- a/pg_lake_engine/src/parquet/field.c +++ b/pg_lake_engine/src/parquet/field.c @@ -19,6 +19,7 @@ #include "common/int.h" +#include "pg_lake/extensions/postgis.h" #include "pg_lake/parquet/field.h" #include "pg_lake/parquet/leaf_field.h" #include "pg_lake/util/string_utils.h" @@ -182,3 +183,28 @@ SchemaFieldsEquivalent(DataFileSchemaField * fieldA, DataFileSchemaField * field */ return true; } + + +/* + * PGTypeRequiresConversionToIcebergString returns true if the given Postgres type + * requires conversion to Iceberg string. + * Some of the Postgres types cannot be directly mapped to an Iceberg type. + * e.g. custom types like hstore + */ +bool +PGTypeRequiresConversionToIcebergString(Field * field, PGType pgType) +{ + /* + * We treat geometry as binary within the Iceberg schema, which is encoded + * as a hexadecimal string according to the spec. As it happens, the + * Postgres output function of geometry produces a hexadecimal WKB string, + * so we can use the regular text output function to convert to an Iceberg + * value. 
+ */ + if (IsGeometryTypeId(pgType.postgresTypeOid)) + { + return true; + } + + return strcmp(field->field.scalar.typeName, "string") == 0 && pgType.postgresTypeOid != TEXTOID; +} diff --git a/pg_lake_engine/src/parquet/leaf_field.c b/pg_lake_engine/src/parquet/leaf_field.c index 9e7e350c..c387b37b 100644 --- a/pg_lake_engine/src/parquet/leaf_field.c +++ b/pg_lake_engine/src/parquet/leaf_field.c @@ -39,3 +39,24 @@ DeepCopyLeafField(const LeafField * leafField) return *copiedLeafField; } + + +/* +* FindLeafField finds the leaf field with the given fieldId. +*/ +LeafField * +FindLeafField(List *leafFieldList, int fieldId) +{ + ListCell *cell = NULL; + foreach(cell, leafFieldList) + { + LeafField *leafField = (LeafField *) lfirst(cell); + + if (leafField->fieldId == fieldId) + { + return leafField; + } + } + + return NULL; +} From 2db85e2b80d2019fd724c5e9f6ce39af6f3887a1 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Thu, 8 Jan 2026 18:07:14 +0300 Subject: [PATCH 40/46] Remove unnecessary includes and whitespaces Signed-off-by: Ahmet Gedemenli --- pg_lake_engine/include/pg_lake/pgduck/delete_data.h | 1 - pg_lake_engine/include/pg_lake/pgduck/write_data.h | 2 -- pg_lake_engine/src/init.c | 1 - pg_lake_engine/src/pgduck/write_data.c | 6 +----- pg_lake_iceberg/src/iceberg/iceberg_field.c | 1 - pg_lake_iceberg/src/iceberg/iceberg_type_binary_serde.c | 1 - pg_lake_iceberg/src/iceberg/iceberg_type_json_serde.c | 1 - pg_lake_iceberg/src/init.c | 1 + pg_lake_iceberg/src/test/test_iceberg_binary_serde.c | 1 - 9 files changed, 2 insertions(+), 13 deletions(-) diff --git a/pg_lake_engine/include/pg_lake/pgduck/delete_data.h b/pg_lake_engine/include/pg_lake/pgduck/delete_data.h index da5170e5..9795f52e 100644 --- a/pg_lake_engine/include/pg_lake/pgduck/delete_data.h +++ b/pg_lake_engine/include/pg_lake/pgduck/delete_data.h @@ -22,7 +22,6 @@ #include "pg_lake/copy/copy_format.h" #include "pg_lake/parquet/field.h" #include "pg_lake/pgduck/read_data.h" -#include "pg_lake/pgduck/write_data.h" #include "pg_lake/data_file/data_file_stats.h" extern PGDLLEXPORT ColumnStatsCollector *PerformDeleteFromParquet(char *sourceDataFilePath, diff --git a/pg_lake_engine/include/pg_lake/pgduck/write_data.h b/pg_lake_engine/include/pg_lake/pgduck/write_data.h index 2b456805..d6a49019 100644 --- a/pg_lake_engine/include/pg_lake/pgduck/write_data.h +++ b/pg_lake_engine/include/pg_lake/pgduck/write_data.h @@ -18,11 +18,9 @@ #pragma once #include "access/tupdesc.h" -#include "libpq-fe.h" #include "pg_lake/copy/copy_format.h" #include "pg_lake/data_file/data_file_stats.h" #include "pg_lake/parquet/field.h" -#include "pg_lake/parquet/leaf_field.h" #include "nodes/pg_list.h" /* pg_lake_table.target_row_group_size_mb */ diff --git a/pg_lake_engine/src/init.c b/pg_lake_engine/src/init.c index c0c8f587..4e0e41fe 100644 --- a/pg_lake_engine/src/init.c +++ b/pg_lake_engine/src/init.c @@ -42,7 +42,6 @@ #include "pg_lake/extensions/extension_ids.h" #include "pg_lake/pgduck/cache_worker.h" #include "pg_lake/pgduck/client.h" -#include "pg_lake/pgduck/write_data.h" #include "utils/guc.h" PG_MODULE_MAGIC; diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index b52834b2..8cf7eb2d 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -19,23 +19,18 @@ * Functions for generating query for writing data via pgduck server. 
*/ #include "postgres.h" -#include "fmgr.h" #include "access/tupdesc.h" -#include "catalog/pg_type.h" #include "commands/defrem.h" #include "common/string.h" -#include "executor/executor.h" #include "pg_lake/csv/csv_options.h" #include "pg_lake/copy/copy_format.h" #include "pg_lake/data_file/data_file_stats.h" -#include "pg_lake/extensions/pg_map.h" #include "pg_lake/extensions/postgis.h" #include "pg_lake/parquet/field.h" #include "pg_lake/parquet/geoparquet.h" #include "pg_lake/parsetree/options.h" #include "pg_lake/pgduck/client.h" -#include "pg_lake/pgduck/map.h" #include "pg_lake/pgduck/numeric.h" #include "pg_lake/pgduck/read_data.h" #include "pg_lake/pgduck/type.h" @@ -62,6 +57,7 @@ static DuckDBTypeInfo VARCHAR_TYPE = int TargetRowGroupSizeMB = DEFAULT_TARGET_ROW_GROUP_SIZE_MB; int DefaultParquetVersion = PARQUET_VERSION_V1; + /* * ConvertCSVFileTo copies and converts a CSV file at source path to * the destinationPath. diff --git a/pg_lake_iceberg/src/iceberg/iceberg_field.c b/pg_lake_iceberg/src/iceberg/iceberg_field.c index e16e208a..58e2e23f 100644 --- a/pg_lake_iceberg/src/iceberg/iceberg_field.c +++ b/pg_lake_iceberg/src/iceberg/iceberg_field.c @@ -46,7 +46,6 @@ #include "pg_lake/pgduck/numeric.h" #include "pg_lake/pgduck/serialize.h" #include "pg_lake/pgduck/type.h" -#include "pg_lake/pgduck/write_data.h" #include "pg_lake/util/string_utils.h" #include "access/table.h" diff --git a/pg_lake_iceberg/src/iceberg/iceberg_type_binary_serde.c b/pg_lake_iceberg/src/iceberg/iceberg_type_binary_serde.c index a3d5755e..73111828 100644 --- a/pg_lake_iceberg/src/iceberg/iceberg_type_binary_serde.c +++ b/pg_lake_iceberg/src/iceberg/iceberg_type_binary_serde.c @@ -26,7 +26,6 @@ #include "pg_lake/iceberg/iceberg_type_binary_serde.h" #include "pg_lake/iceberg/iceberg_type_numeric_binary_serde.h" #include "pg_lake/iceberg/utils.h" -#include "pg_lake/pgduck/write_data.h" #include "port/pg_bswap.h" #include "utils/builtins.h" diff --git a/pg_lake_iceberg/src/iceberg/iceberg_type_json_serde.c b/pg_lake_iceberg/src/iceberg/iceberg_type_json_serde.c index 5110f08c..e2c0b951 100644 --- a/pg_lake_iceberg/src/iceberg/iceberg_type_json_serde.c +++ b/pg_lake_iceberg/src/iceberg/iceberg_type_json_serde.c @@ -27,7 +27,6 @@ #include "pg_lake/iceberg/iceberg_type_json_serde.h" #include "pg_lake/json/json_utils.h" #include "pg_lake/pgduck/map.h" -#include "pg_lake/pgduck/write_data.h" #include "pg_lake/util/spi_helpers.h" #include "access/tupdesc.h" diff --git a/pg_lake_iceberg/src/init.c b/pg_lake_iceberg/src/init.c index 5bd95bf6..a929d6ca 100644 --- a/pg_lake_iceberg/src/init.c +++ b/pg_lake_iceberg/src/init.c @@ -139,6 +139,7 @@ _PG_init(void) GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, NULL, NULL, NULL); + DefineCustomBoolVariable( "pg_lake_iceberg.http_client_trace_traffic", gettext_noop("When set to true, HTTP client logging is enabled."), diff --git a/pg_lake_iceberg/src/test/test_iceberg_binary_serde.c b/pg_lake_iceberg/src/test/test_iceberg_binary_serde.c index 12cd559f..616d61f5 100644 --- a/pg_lake_iceberg/src/test/test_iceberg_binary_serde.c +++ b/pg_lake_iceberg/src/test/test_iceberg_binary_serde.c @@ -24,7 +24,6 @@ #include "pg_lake/iceberg/iceberg_type_binary_serde.h" #include "pg_lake/parquet/leaf_field.h" #include "pg_lake/pgduck/type.h" -#include "pg_lake/pgduck/write_data.h" #include "pg_lake/util/numeric.h" #include "utils/builtins.h" From 5647b47ac25c953359a1b7f5c4e9ec76409cb750 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Fri, 9 Jan 2026 12:34:22 +0300 Subject: [PATCH 41/46] Use 
returned stats for deleted files Signed-off-by: Ahmet Gedemenli --- pg_lake_table/src/fdw/writable_table.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c index 438f2c48..dc9dafdd 100644 --- a/pg_lake_table/src/fdw/writable_table.c +++ b/pg_lake_table/src/fdw/writable_table.c @@ -578,15 +578,20 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live InsertInProgressFileRecordExtended(deletionFilePath, isPrefix, deferDeletion); + List *leafFields = GetLeafFieldsForTable(relationId); /* write the deletion file */ - ConvertCSVFileTo(deleteFile, deleteTupleDesc, -1, deletionFilePath, - DATA_FORMAT_PARQUET, compression, copyOptions, schema, NIL); + ColumnStatsCollector *statsCollector = + ConvertCSVFileTo(deleteFile, deleteTupleDesc, -1, deletionFilePath, + DATA_FORMAT_PARQUET, compression, copyOptions, schema, leafFields); ereport(WriteLogLevel, (errmsg("adding deletion file %s with " INT64_FORMAT " rows ", deletionFilePath, deletedRowCount))); - DataFileStats *deletionFileStats = CreateDataFileStatsForTable(relationId, deletionFilePath, - deletedRowCount, 0, CONTENT_POSITION_DELETES); + /* + * ConvertCSVFileTo() does not use file_bytes_size so we can assume single file + */ + Assert(list_length(statsCollector->dataFileStats) == 1); + DataFileStats *deletionFileStats = linitial(statsCollector->dataFileStats); /* * We are adding position delete file with the same partition From e3b51e7191d8222d7534a5a09fb73ab2062bd745 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Fri, 9 Jan 2026 13:38:52 +0300 Subject: [PATCH 42/46] Rename ColumnStatsCollector to StatsCollector Signed-off-by: Ahmet Gedemenli --- .../include/pg_lake/data_file/data_file_stats.h | 8 ++++---- pg_lake_engine/include/pg_lake/pgduck/delete_data.h | 2 +- pg_lake_engine/include/pg_lake/pgduck/write_data.h | 4 ++-- pg_lake_engine/src/data_file/data_file_stats.c | 12 ++++++------ pg_lake_engine/src/pgduck/delete_data.c | 2 +- pg_lake_engine/src/pgduck/write_data.c | 4 ++-- pg_lake_table/src/fdw/writable_table.c | 8 ++++---- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h b/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h index 2915e378..0497ee91 100644 --- a/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h +++ b/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h @@ -65,17 +65,17 @@ typedef struct DataFileStats int64 rowIdStart; } DataFileStats; -typedef struct ColumnStatsCollector +typedef struct StatsCollector { int64 totalRowCount; List *dataFileStats; -} ColumnStatsCollector; +} StatsCollector; extern PGDLLEXPORT DataFileStats * DeepCopyDataFileStats(const DataFileStats * stats); -extern PGDLLEXPORT ColumnStatsCollector *GetDataFileStatsListFromPGResult(PGresult *result, +extern PGDLLEXPORT StatsCollector *GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSchema * schema); -extern PGDLLEXPORT ColumnStatsCollector *ExecuteCopyCommandOnPGDuckConnection(char *copyCommand, +extern PGDLLEXPORT StatsCollector *ExecuteCopyCommandOnPGDuckConnection(char *copyCommand, List *leafFields, DataFileSchema * schema, bool disablePreserveInsertionOrder, diff --git a/pg_lake_engine/include/pg_lake/pgduck/delete_data.h b/pg_lake_engine/include/pg_lake/pgduck/delete_data.h index 9795f52e..a21d284e 100644 --- a/pg_lake_engine/include/pg_lake/pgduck/delete_data.h +++ 
b/pg_lake_engine/include/pg_lake/pgduck/delete_data.h @@ -24,7 +24,7 @@ #include "pg_lake/pgduck/read_data.h" #include "pg_lake/data_file/data_file_stats.h" -extern PGDLLEXPORT ColumnStatsCollector *PerformDeleteFromParquet(char *sourceDataFilePath, +extern PGDLLEXPORT StatsCollector *PerformDeleteFromParquet(char *sourceDataFilePath, List *positionDeleteFiles, char *deletionFilePath, char *destinationPath, diff --git a/pg_lake_engine/include/pg_lake/pgduck/write_data.h b/pg_lake_engine/include/pg_lake/pgduck/write_data.h index d6a49019..7084340b 100644 --- a/pg_lake_engine/include/pg_lake/pgduck/write_data.h +++ b/pg_lake_engine/include/pg_lake/pgduck/write_data.h @@ -36,7 +36,7 @@ typedef enum ParquetVersion /* pg_lake_table.default_parquet_version */ extern PGDLLEXPORT int DefaultParquetVersion; -extern PGDLLEXPORT ColumnStatsCollector *ConvertCSVFileTo(char *csvFilePath, +extern PGDLLEXPORT StatsCollector *ConvertCSVFileTo(char *csvFilePath, TupleDesc tupleDesc, int maxLineSize, char *destinationPath, @@ -45,7 +45,7 @@ extern PGDLLEXPORT ColumnStatsCollector *ConvertCSVFileTo(char *csvFilePath, List *formatOptions, DataFileSchema * schema, List *leafFields); -extern PGDLLEXPORT ColumnStatsCollector *WriteQueryResultTo(char *query, +extern PGDLLEXPORT StatsCollector *WriteQueryResultTo(char *query, char *destinationPath, CopyDataFormat destinationFormat, CopyDataCompression destinationCompression, diff --git a/pg_lake_engine/src/data_file/data_file_stats.c b/pg_lake_engine/src/data_file/data_file_stats.c index 094d799b..960e64f5 100644 --- a/pg_lake_engine/src/data_file/data_file_stats.c +++ b/pg_lake_engine/src/data_file/data_file_stats.c @@ -37,9 +37,9 @@ static int FindIndexInStringList(List *names, const char *targetName); /* * ExecuteCopyCommandOnPGDuckConnection executes the given COPY command on - * a PGDuck connection and returns a ColumnStatsCollector. + * a PGDuck connection and returns a StatsCollector. */ -ColumnStatsCollector * +StatsCollector * ExecuteCopyCommandOnPGDuckConnection(char *copyCommand, List *leafFields, DataFileSchema * schema, @@ -48,7 +48,7 @@ ExecuteCopyCommandOnPGDuckConnection(char *copyCommand, { PGDuckConnection *pgDuckConn = GetPGDuckConnection(); PGresult *result; - ColumnStatsCollector *statsCollector = NULL; + StatsCollector *statsCollector = NULL; PG_TRY(); { @@ -70,7 +70,7 @@ ExecuteCopyCommandOnPGDuckConnection(char *copyCommand, else { char *commandTuples = PQcmdTuples(result); - statsCollector = palloc0(sizeof(ColumnStatsCollector)); + statsCollector = palloc0(sizeof(StatsCollector)); statsCollector->totalRowCount = atoll(commandTuples); statsCollector->dataFileStats = NIL; } @@ -100,7 +100,7 @@ ExecuteCopyCommandOnPGDuckConnection(char *copyCommand, * * It returns the collector object that contains the total row count and data file statistics. 
*/ -ColumnStatsCollector * +StatsCollector * GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSchema * schema) { List *statsList = NIL; @@ -145,7 +145,7 @@ GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSch statsList = lappend(statsList, fileStats); } - ColumnStatsCollector *statsCollector = palloc0(sizeof(ColumnStatsCollector)); + StatsCollector *statsCollector = palloc0(sizeof(StatsCollector)); statsCollector->totalRowCount = totalRowCount; statsCollector->dataFileStats = statsList; diff --git a/pg_lake_engine/src/pgduck/delete_data.c b/pg_lake_engine/src/pgduck/delete_data.c index fa11be44..0e4071c9 100644 --- a/pg_lake_engine/src/pgduck/delete_data.c +++ b/pg_lake_engine/src/pgduck/delete_data.c @@ -47,7 +47,7 @@ static char *DeleteFromParquetQuery(char *sourceDataFilePath, * PerformDeleteFromParquet applies a deletion CSV file to a Parquet file * and writes the new Parquet file to destinationPath. */ -ColumnStatsCollector * +StatsCollector * PerformDeleteFromParquet(char *sourcePath, List *positionDeleteFiles, char *deletionFilePath, diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index 8cf7eb2d..8fb997a5 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -64,7 +64,7 @@ int DefaultParquetVersion = PARQUET_VERSION_V1; * * The CSV was generated using COPY ... TO '' */ -ColumnStatsCollector * +StatsCollector * ConvertCSVFileTo(char *csvFilePath, TupleDesc csvTupleDesc, int maxLineSize, char *destinationPath, CopyDataFormat destinationFormat, @@ -151,7 +151,7 @@ ConvertCSVFileTo(char *csvFilePath, TupleDesc csvTupleDesc, int maxLineSize, * destinationPath. There may be multiple files if file_size_bytes * is specified in formatOptions. 
*/ -ColumnStatsCollector * +StatsCollector * WriteQueryResultTo(char *query, char *destinationPath, CopyDataFormat destinationFormat, diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c index dc9dafdd..bd589620 100644 --- a/pg_lake_table/src/fdw/writable_table.c +++ b/pg_lake_table/src/fdw/writable_table.c @@ -259,7 +259,7 @@ PrepareCSVInsertion(Oid relationId, char *insertCSV, int64 rowCount, List *leafFields = GetLeafFieldsForTable(relationId); /* convert insert file to a new file in table format */ - ColumnStatsCollector *statsCollector = + StatsCollector *statsCollector = ConvertCSVFileTo(insertCSV, tupleDescriptor, maximumLineSize, @@ -521,7 +521,7 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live ReadDataStats stats = {sourceRowCount, existingDeletedRowCount}; List *leafFields = GetLeafFieldsForTable(relationId); - ColumnStatsCollector *statsCollector = PerformDeleteFromParquet(sourcePath, existingPositionDeletes, + StatsCollector *statsCollector = PerformDeleteFromParquet(sourcePath, existingPositionDeletes, deleteFile, newDataFilePath, compression, schema, &stats, leafFields); @@ -580,7 +580,7 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live List *leafFields = GetLeafFieldsForTable(relationId); /* write the deletion file */ - ColumnStatsCollector *statsCollector = + StatsCollector *statsCollector = ConvertCSVFileTo(deleteFile, deleteTupleDesc, -1, deletionFilePath, DATA_FORMAT_PARQUET, compression, copyOptions, schema, leafFields); @@ -986,7 +986,7 @@ PrepareToAddQueryResultToTable(Oid relationId, char *readQuery, TupleDesc queryT /* perform compaction */ List *leafFields = GetLeafFieldsForTable(relationId); - ColumnStatsCollector *statsCollector = + StatsCollector *statsCollector = WriteQueryResultTo(readQuery, newDataFilePath, properties.format, From fa8ae419d9cf67cf25ef98f51a4c7d8f5882248c Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Fri, 9 Jan 2026 15:35:17 +0300 Subject: [PATCH 43/46] Reindent Signed-off-by: Ahmet Gedemenli --- .../pg_lake/data_file/data_file_stats.h | 22 ++++++------ .../include/pg_lake/parquet/leaf_field.h | 2 +- .../include/pg_lake/pgduck/delete_data.h | 16 ++++----- .../include/pg_lake/pgduck/write_data.h | 36 +++++++++---------- .../src/data_file/data_file_stats.c | 17 +++++---- pg_lake_engine/src/data_file/data_files.c | 4 +-- pg_lake_engine/src/parquet/leaf_field.c | 3 +- pg_lake_engine/src/pgduck/write_data.c | 1 + .../include/pg_lake/iceberg/data_file_stats.h | 4 +-- pg_lake_iceberg/src/iceberg/data_file_stats.c | 4 +-- pg_lake_iceberg/src/iceberg/iceberg_field.c | 1 - pg_lake_iceberg/src/init.c | 2 +- pg_lake_table/src/fdw/data_file_stats.c | 1 + pg_lake_table/src/fdw/writable_table.c | 25 +++++++------ 14 files changed, 74 insertions(+), 64 deletions(-) diff --git a/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h b/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h index 0497ee91..b41305e8 100644 --- a/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h +++ b/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h @@ -67,17 +67,17 @@ typedef struct DataFileStats typedef struct StatsCollector { - int64 totalRowCount; - List *dataFileStats; -} StatsCollector; + int64 totalRowCount; + List *dataFileStats; +} StatsCollector; extern PGDLLEXPORT DataFileStats * DeepCopyDataFileStats(const DataFileStats * stats); -extern PGDLLEXPORT StatsCollector *GetDataFileStatsListFromPGResult(PGresult *result, - List 
*leafFields, - DataFileSchema * schema); -extern PGDLLEXPORT StatsCollector *ExecuteCopyCommandOnPGDuckConnection(char *copyCommand, - List *leafFields, - DataFileSchema * schema, - bool disablePreserveInsertionOrder, - CopyDataFormat destinationFormat); +extern PGDLLEXPORT StatsCollector * GetDataFileStatsListFromPGResult(PGresult *result, + List *leafFields, + DataFileSchema * schema); +extern PGDLLEXPORT StatsCollector * ExecuteCopyCommandOnPGDuckConnection(char *copyCommand, + List *leafFields, + DataFileSchema * schema, + bool disablePreserveInsertionOrder, + CopyDataFormat destinationFormat); extern PGDLLEXPORT bool ShouldSkipStatistics(LeafField * leafField); diff --git a/pg_lake_engine/include/pg_lake/parquet/leaf_field.h b/pg_lake_engine/include/pg_lake/parquet/leaf_field.h index d77e5b69..085ee52f 100644 --- a/pg_lake_engine/include/pg_lake/parquet/leaf_field.h +++ b/pg_lake_engine/include/pg_lake/parquet/leaf_field.h @@ -52,7 +52,7 @@ typedef struct LeafField extern PGDLLEXPORT int LeafFieldCompare(const ListCell *a, const ListCell *b); extern PGDLLEXPORT bool SchemaFieldsEquivalent(DataFileSchemaField * fieldA, DataFileSchemaField * fieldB); extern PGDLLEXPORT LeafField DeepCopyLeafField(const LeafField * leafField); -extern PGDLLEXPORT LeafField *FindLeafField(List *leafFieldList, int fieldId); +extern PGDLLEXPORT LeafField * FindLeafField(List *leafFieldList, int fieldId); #if PG_VERSION_NUM < 170000 extern PGDLLEXPORT int pg_cmp_s32(int32 a, int32 b); #endif diff --git a/pg_lake_engine/include/pg_lake/pgduck/delete_data.h b/pg_lake_engine/include/pg_lake/pgduck/delete_data.h index a21d284e..9841286f 100644 --- a/pg_lake_engine/include/pg_lake/pgduck/delete_data.h +++ b/pg_lake_engine/include/pg_lake/pgduck/delete_data.h @@ -24,11 +24,11 @@ #include "pg_lake/pgduck/read_data.h" #include "pg_lake/data_file/data_file_stats.h" -extern PGDLLEXPORT StatsCollector *PerformDeleteFromParquet(char *sourceDataFilePath, - List *positionDeleteFiles, - char *deletionFilePath, - char *destinationPath, - CopyDataCompression destinationCompression, - DataFileSchema * schema, - ReadDataStats * stats, - List *leafFields); +extern PGDLLEXPORT StatsCollector * PerformDeleteFromParquet(char *sourceDataFilePath, + List *positionDeleteFiles, + char *deletionFilePath, + char *destinationPath, + CopyDataCompression destinationCompression, + DataFileSchema * schema, + ReadDataStats * stats, + List *leafFields); diff --git a/pg_lake_engine/include/pg_lake/pgduck/write_data.h b/pg_lake_engine/include/pg_lake/pgduck/write_data.h index 7084340b..c6354f4b 100644 --- a/pg_lake_engine/include/pg_lake/pgduck/write_data.h +++ b/pg_lake_engine/include/pg_lake/pgduck/write_data.h @@ -36,22 +36,22 @@ typedef enum ParquetVersion /* pg_lake_table.default_parquet_version */ extern PGDLLEXPORT int DefaultParquetVersion; -extern PGDLLEXPORT StatsCollector *ConvertCSVFileTo(char *csvFilePath, - TupleDesc tupleDesc, - int maxLineSize, - char *destinationPath, - CopyDataFormat destinationFormat, - CopyDataCompression destinationCompression, - List *formatOptions, - DataFileSchema * schema, - List *leafFields); -extern PGDLLEXPORT StatsCollector *WriteQueryResultTo(char *query, - char *destinationPath, - CopyDataFormat destinationFormat, - CopyDataCompression destinationCompression, - List *formatOptions, - bool queryHasRowId, - DataFileSchema * schema, - TupleDesc queryTupleDesc, - List *leafFields); +extern PGDLLEXPORT StatsCollector * ConvertCSVFileTo(char *csvFilePath, + TupleDesc tupleDesc, + int maxLineSize, + char 
*destinationPath, + CopyDataFormat destinationFormat, + CopyDataCompression destinationCompression, + List *formatOptions, + DataFileSchema * schema, + List *leafFields); +extern PGDLLEXPORT StatsCollector * WriteQueryResultTo(char *query, + char *destinationPath, + CopyDataFormat destinationFormat, + CopyDataCompression destinationCompression, + List *formatOptions, + bool queryHasRowId, + DataFileSchema * schema, + TupleDesc queryTupleDesc, + List *leafFields); extern PGDLLEXPORT void AppendFields(StringInfo map, DataFileSchema * schema); diff --git a/pg_lake_engine/src/data_file/data_file_stats.c b/pg_lake_engine/src/data_file/data_file_stats.c index 960e64f5..5ff787c0 100644 --- a/pg_lake_engine/src/data_file/data_file_stats.c +++ b/pg_lake_engine/src/data_file/data_file_stats.c @@ -32,7 +32,7 @@ static void ExtractMinMaxForAllColumns(Datum map, List **names, List **mins, Lis static void ExtractMinMaxForColumn(Datum map, const char *colName, List **names, List **mins, List **maxs); static const char *UnescapeDoubleQuotes(const char *s); static List *GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields, DataFileSchema * schema); -static int FindIndexInStringList(List *names, const char *targetName); +static int FindIndexInStringList(List *names, const char *targetName); /* @@ -70,6 +70,7 @@ ExecuteCopyCommandOnPGDuckConnection(char *copyCommand, else { char *commandTuples = PQcmdTuples(result); + statsCollector = palloc0(sizeof(StatsCollector)); statsCollector->totalRowCount = atoll(commandTuples); statsCollector->dataFileStats = NIL; @@ -107,7 +108,7 @@ GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSch int resultRowCount = PQntuples(result); int resultColumnCount = PQnfields(result); - int64 totalRowCount = 0; + int64 totalRowCount = 0; for (int resultRowIndex = 0; resultRowIndex < resultRowCount; resultRowIndex++) { @@ -146,6 +147,7 @@ GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSch } StatsCollector *statsCollector = palloc0(sizeof(StatsCollector)); + statsCollector->totalRowCount = totalRowCount; statsCollector->dataFileStats = statsList; @@ -367,6 +369,7 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields int fieldId = field->id; int nameIndex = FindIndexInStringList(names, fieldName); + if (nameIndex == -1) { ereport(DEBUG3, (errmsg("field with name %s not found in stats output, skipping", fieldName))); @@ -380,7 +383,7 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields ereport(DEBUG3, (errmsg("leaf field with id %d not found in leaf fields, skipping", fieldId))); continue; } - else if(ShouldSkipStatistics(leafField)) + else if (ShouldSkipStatistics(leafField)) { ereport(DEBUG3, (errmsg("skipping statistics for field with id %d", fieldId))); continue; @@ -408,7 +411,7 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields static int FindIndexInStringList(List *names, const char *targetName) { - for(int index = 0; index < list_length(names); index++) + for (int index = 0; index < list_length(names); index++) { if (strcmp(list_nth(names, index), targetName) == 0) { @@ -466,9 +469,9 @@ ShouldSkipStatistics(LeafField * leafField) { /* * We currently do not support pruning on array, map and composite - * types. So there's no need to collect stats for them. Note that - * in the past we did collect, and have some tests commented out, - * such as skippedtest_pg_lake_iceberg_table_complex_values. + * types. 
So there's no need to collect stats for them. Note that in + * the past we did collect, and have some tests commented out, such as + * skippedtest_pg_lake_iceberg_table_complex_values. */ return true; } diff --git a/pg_lake_engine/src/data_file/data_files.c b/pg_lake_engine/src/data_file/data_files.c index 526dd11f..189667af 100644 --- a/pg_lake_engine/src/data_file/data_files.c +++ b/pg_lake_engine/src/data_file/data_files.c @@ -139,8 +139,8 @@ DeepCopyDataFileStats(const DataFileStats * stats) DataFileColumnStats *copiedColStats = palloc0(sizeof(DataFileColumnStats)); copiedColStats->leafField = DeepCopyLeafField(&colStats->leafField); - copiedColStats->lowerBoundText = colStats->lowerBoundText ? pstrdup(colStats->lowerBoundText): NULL; - copiedColStats->upperBoundText = colStats->upperBoundText ? pstrdup(colStats->upperBoundText): NULL; + copiedColStats->lowerBoundText = colStats->lowerBoundText ? pstrdup(colStats->lowerBoundText) : NULL; + copiedColStats->upperBoundText = colStats->upperBoundText ? pstrdup(colStats->upperBoundText) : NULL; copiedStats->columnStats = lappend(copiedStats->columnStats, copiedColStats); } diff --git a/pg_lake_engine/src/parquet/leaf_field.c b/pg_lake_engine/src/parquet/leaf_field.c index c387b37b..38c81f1f 100644 --- a/pg_lake_engine/src/parquet/leaf_field.c +++ b/pg_lake_engine/src/parquet/leaf_field.c @@ -30,7 +30,7 @@ LeafField DeepCopyLeafField(const LeafField * leafField) { LeafField *copiedLeafField = palloc0(sizeof(LeafField)); - + copiedLeafField->fieldId = leafField->fieldId; copiedLeafField->field = DeepCopyField(leafField->field); copiedLeafField->duckTypeName = pstrdup(leafField->duckTypeName); @@ -48,6 +48,7 @@ LeafField * FindLeafField(List *leafFieldList, int fieldId) { ListCell *cell = NULL; + foreach(cell, leafFieldList) { LeafField *leafField = (LeafField *) lfirst(cell); diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index 8fb997a5..0bd01eae 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -393,6 +393,7 @@ WriteQueryResultTo(char *query, appendStringInfoString(&command, ")"); bool disablePreserveInsertionOrder = TargetRowGroupSizeMB > 0; + return ExecuteCopyCommandOnPGDuckConnection(command.data, leafFields, schema, diff --git a/pg_lake_iceberg/include/pg_lake/iceberg/data_file_stats.h b/pg_lake_iceberg/include/pg_lake/iceberg/data_file_stats.h index 8346c3bc..917bff93 100644 --- a/pg_lake_iceberg/include/pg_lake/iceberg/data_file_stats.h +++ b/pg_lake_iceberg/include/pg_lake/iceberg/data_file_stats.h @@ -23,8 +23,8 @@ #include "pg_lake/iceberg/api.h" extern PGDLLEXPORT void SetIcebergDataFileStats(const DataFileStats * dataFileStats, - int64_t * recordCount, - int64_t * fileSizeInBytes, + int64_t *recordCount, + int64_t *fileSizeInBytes, ColumnBound * *lowerBounds, size_t *nLowerBounds, ColumnBound * *upperBounds, diff --git a/pg_lake_iceberg/src/iceberg/data_file_stats.c b/pg_lake_iceberg/src/iceberg/data_file_stats.c index 0f8ea24c..00b8556c 100644 --- a/pg_lake_iceberg/src/iceberg/data_file_stats.c +++ b/pg_lake_iceberg/src/iceberg/data_file_stats.c @@ -38,8 +38,8 @@ static ColumnBound * CreateColumnBoundForLeafField(LeafField * leafField, char * */ void SetIcebergDataFileStats(const DataFileStats * dataFileStats, - int64_t * recordCount, - int64_t * fileSizeInBytes, + int64_t *recordCount, + int64_t *fileSizeInBytes, ColumnBound * *lowerBounds, size_t *nLowerBounds, ColumnBound * *upperBounds, diff --git 
a/pg_lake_iceberg/src/iceberg/iceberg_field.c b/pg_lake_iceberg/src/iceberg/iceberg_field.c index 58e2e23f..d792ceef 100644 --- a/pg_lake_iceberg/src/iceberg/iceberg_field.c +++ b/pg_lake_iceberg/src/iceberg/iceberg_field.c @@ -1257,4 +1257,3 @@ GetFieldMinMaxStats(PGDuckConnection * pgDuckConn, List *rowGroupStatList) PQclear(result); return columnStatsList; } - diff --git a/pg_lake_iceberg/src/init.c b/pg_lake_iceberg/src/init.c index a929d6ca..c2c81be6 100644 --- a/pg_lake_iceberg/src/init.c +++ b/pg_lake_iceberg/src/init.c @@ -237,7 +237,7 @@ _PG_init(void) GUC_SUPERUSER_ONLY | GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, NULL, NULL, NULL); - DefineCustomStringVariable("pg_lake_iceberg.rest_catalog_oauth_host_path", + DefineCustomStringVariable("pg_lake_iceberg.rest_catalog_oauth_host_path", NULL, NULL, &RestCatalogOauthHostPath, diff --git a/pg_lake_table/src/fdw/data_file_stats.c b/pg_lake_table/src/fdw/data_file_stats.c index 5767b78b..174074b2 100644 --- a/pg_lake_table/src/fdw/data_file_stats.c +++ b/pg_lake_table/src/fdw/data_file_stats.c @@ -156,6 +156,7 @@ ApplyColumnStatsModeForAllFileStats(Oid relationId, List *dataFileStats) DataFileStats *dataFileStats = lfirst(dataFileStatsCell); ListCell *columnStatsCell = NULL; + foreach(columnStatsCell, dataFileStats->columnStats) { DataFileColumnStats *columnStats = lfirst(columnStatsCell); diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c index bd589620..b314e248 100644 --- a/pg_lake_table/src/fdw/writable_table.c +++ b/pg_lake_table/src/fdw/writable_table.c @@ -101,8 +101,8 @@ static List *ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCoun int64 liveRowCount, char *deleteFile, int64 deletedRowCount); static List *GetDataFilePathsFromStatsList(List *dataFileStats); static List *GetNewFileOpsFromFileStats(Oid relationId, List *dataFileStats, - int32 partitionSpecId, Partition * partition, int64 rowCount, - bool isVerbose, List **newFiles); + int32 partitionSpecId, Partition * partition, int64 rowCount, + bool isVerbose, List **newFiles); static bool ShouldRewriteAfterDeletions(int64 sourceRowCount, uint64 totalDeletedRowCount); static CompactionDataFileHashEntry * GetPartitionWithMostEligibleFiles(Oid relationId, TimestampTz compactionStartTime, bool forceMerge); @@ -275,6 +275,7 @@ PrepareCSVInsertion(Oid relationId, char *insertCSV, int64 rowCount, if (!splitFilesBySize && statsCollector->dataFileStats == NIL) { DataFileStats *stats = palloc0(sizeof(DataFileStats)); + stats->dataFilePath = dataFilePrefix; stats->rowCount = rowCount; statsCollector->dataFileStats = list_make1(stats); @@ -297,6 +298,7 @@ PrepareCSVInsertion(Oid relationId, char *insertCSV, int64 rowCount, DataFileStats *stats = lfirst(dataFileStatsCell); DataFileModification *modification = palloc0(sizeof(DataFileModification)); + modification->type = ADD_DATA_FILE; modification->insertFile = stats->dataFilePath; modification->insertedRowCount = stats->rowCount; @@ -340,7 +342,7 @@ GetDataFilePathsFromStatsList(List *dataFileStats) */ static List * GetNewFileOpsFromFileStats(Oid relationId, List *dataFileStats, int32 partitionSpecId, Partition * partition, - int64 rowCount, bool isVerbose, List **newFiles) + int64 rowCount, bool isVerbose, List **newFiles) { *newFiles = NIL; @@ -522,8 +524,8 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live List *leafFields = GetLeafFieldsForTable(relationId); StatsCollector *statsCollector = PerformDeleteFromParquet(sourcePath, 
existingPositionDeletes, - deleteFile, newDataFilePath, compression, - schema, &stats, leafFields); + deleteFile, newDataFilePath, compression, + schema, &stats, leafFields); ApplyColumnStatsModeForAllFileStats(relationId, statsCollector->dataFileStats); @@ -544,8 +546,9 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live Assert(statsCollector->dataFileStats != NIL); /* - * while deleting from parquet, we do not add file_size_bytes option to COPY command, - * so we can assume that we'll have only a single file. + * while deleting from parquet, we do not add file_size_bytes + * option to COPY command, so we can assume that we'll have only a + * single file. */ DataFileStats *newFileStats = linitial(statsCollector->dataFileStats); @@ -578,7 +581,8 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live InsertInProgressFileRecordExtended(deletionFilePath, isPrefix, deferDeletion); - List *leafFields = GetLeafFieldsForTable(relationId); + List *leafFields = GetLeafFieldsForTable(relationId); + /* write the deletion file */ StatsCollector *statsCollector = ConvertCSVFileTo(deleteFile, deleteTupleDesc, -1, deletionFilePath, @@ -588,9 +592,10 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live deletionFilePath, deletedRowCount))); /* - * ConvertCSVFileTo() does not use file_bytes_size so we can assume single file + * ConvertCSVFileTo() does not use file_bytes_size so we can + * assume single file */ - Assert(list_length(statsCollector->dataFileStats) == 1); + Assert(list_length(statsCollector->dataFileStats) == 1); DataFileStats *deletionFileStats = linitial(statsCollector->dataFileStats); /* From 699eb3f04bb98ad68a27d3c782d68b6dbf562901 Mon Sep 17 00:00:00 2001 From: Aykut Bozkurt Date: Fri, 9 Jan 2026 15:40:07 +0300 Subject: [PATCH 44/46] generate stats for all files at WriteQueryResultTo Signed-off-by: Aykut Bozkurt --- .../pg_lake/data_file/data_file_stats.h | 45 +- .../src/data_file/data_file_stats.c | 756 +++++++++++++++++- pg_lake_engine/src/pgduck/delete_data.c | 11 +- pg_lake_engine/src/pgduck/write_data.c | 11 +- .../include/pg_lake/iceberg/iceberg_field.h | 1 - pg_lake_iceberg/src/iceberg/iceberg_field.c | 384 --------- .../include/pg_lake/fdw/data_file_stats.h | 57 -- pg_lake_table/src/fdw/data_file_stats.c | 429 ---------- .../src/fdw/data_file_stats_catalog.c | 2 +- pg_lake_table/src/fdw/data_files_catalog.c | 37 +- pg_lake_table/src/fdw/multi_data_file_dest.c | 12 +- pg_lake_table/src/fdw/writable_table.c | 2 +- pg_lake_table/src/test/add_files_to_table.c | 5 +- 13 files changed, 840 insertions(+), 912 deletions(-) delete mode 100644 pg_lake_table/include/pg_lake/fdw/data_file_stats.h delete mode 100644 pg_lake_table/src/fdw/data_file_stats.c diff --git a/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h b/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h index b41305e8..65736932 100644 --- a/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h +++ b/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h @@ -24,6 +24,35 @@ #include "pg_lake/parquet/leaf_field.h" #include "pg_lake/pgduck/client.h" + +/* + * ColumnStatsMode describes the mode of column stats. + * - When truncate mode (default) is used, the column stats are truncated + * to the given length. + * - When none mode is used, the column stats are not collected. 
+ */ +typedef enum ColumnStatsMode +{ + COLUMN_STATS_MODE_TRUNCATE = 0, + COLUMN_STATS_MODE_NONE = 1, +} ColumnStatsMode; + +/* + * ColumnStatsConfig describes the configuration for column stats. + * - mode: the mode of column stats. + * - truncateLen: the length to truncate the column stats in truncate mode. + */ +typedef struct ColumnStatsConfig +{ + ColumnStatsMode mode; + + /* used for truncate mode */ + size_t truncateLen; +} ColumnStatsConfig; + + + + /* * DataFileColumnStats stores column statistics for a data file. */ @@ -75,9 +104,15 @@ extern PGDLLEXPORT DataFileStats * DeepCopyDataFileStats(const DataFileStats * s extern PGDLLEXPORT StatsCollector * GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSchema * schema); -extern PGDLLEXPORT StatsCollector * ExecuteCopyCommandOnPGDuckConnection(char *copyCommand, - List *leafFields, - DataFileSchema * schema, - bool disablePreserveInsertionOrder, - CopyDataFormat destinationFormat); +extern PGDLLEXPORT StatsCollector * ExecuteCopyToCommandOnPGDuckConnection(char *copyCommand, + List *leafFields, + DataFileSchema * schema, + bool disablePreserveInsertionOrder, + char *destinationPath, + CopyDataFormat destinationFormat); extern PGDLLEXPORT bool ShouldSkipStatistics(LeafField * leafField); +extern PGDLLEXPORT DataFileStats * CreateDataFileStatsForDataFile(char *dataFilePath, + int64 rowCount, int64 deletedRowCount, + List *leafFields); +extern PGDLLEXPORT void ApplyColumnStatsModeForAllFileStats(Oid relationId, List *dataFileStats); +extern PGDLLEXPORT List *GetRemoteParquetColumnStats(char *path, List *leafFields); diff --git a/pg_lake_engine/src/data_file/data_file_stats.c b/pg_lake_engine/src/data_file/data_file_stats.c index 5ff787c0..e7a25b1e 100644 --- a/pg_lake_engine/src/data_file/data_file_stats.c +++ b/pg_lake_engine/src/data_file/data_file_stats.c @@ -21,10 +21,16 @@ #include "pg_lake/data_file/data_files.h" #include "pg_lake/data_file/data_file_stats.h" #include "pg_lake/extensions/postgis.h" +#include "pg_lake/parsetree/options.h" #include "pg_lake/pgduck/client.h" #include "pg_lake/pgduck/map.h" +#include "pg_lake/pgduck/remote_storage.h" +#include "pg_lake/pgduck/serialize.h" +#include "commands/defrem.h" +#include "foreign/foreign.h" #include "utils/array.h" #include "utils/builtins.h" +#include "utils/fmgroids.h" #include "utils/lsyscache.h" static void ParseDuckdbColumnMinMaxFromText(char *input, List **names, List **mins, List **maxs); @@ -33,18 +39,52 @@ static void ExtractMinMaxForColumn(Datum map, const char *colName, List **names, static const char *UnescapeDoubleQuotes(const char *s); static List *GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields, DataFileSchema * schema); static int FindIndexInStringList(List *names, const char *targetName); +static List *FetchRowGroupStats(PGDuckConnection * pgDuckConn, List *fieldIdList, char *path); +static char *PrepareRowGroupStatsMinMaxQuery(List *rowGroupStatList); +static char *SerializeTextArrayTypeToPgDuck(ArrayType *array); +static ArrayType *ReadArrayFromText(char *arrayText); +static List *GetFieldMinMaxStats(PGDuckConnection * pgDuckConn, List *rowGroupStatsList); +static ColumnStatsConfig GetColumnStatsConfig(Oid relationId); +static void ApplyColumnStatsModeForType(ColumnStatsConfig columnStatsConfig, + PGType pgType, char **lowerBoundText, + char **upperBoundText); +static char *TruncateStatsMinForText(char *lowerBound, size_t truncateLen); +static char *TruncateStatsMaxForText(char *upperBound, size_t 
truncateLen); +static bytea *TruncateStatsMinForBinary(bytea *lowerBound, size_t truncateLen); +static bytea *TruncateStatsMaxForBinary(bytea *upperBound, size_t truncateLen); +static Datum ColumnStatsTextToDatum(char *text, PGType pgType); +static char *DatumToColumnStatsText(Datum datum, PGType pgType, bool isNull); /* - * ExecuteCopyCommandOnPGDuckConnection executes the given COPY command on +* The output is in the format of: +* field_id, ARRAY[val1, val2, val3.., valN] +* +* The array values are NOT yet sorted, they are the stats_min and stats_max values +* from the parquet metadata. We put min and max values in the same array to because +* we want the global ordering of the values, not per row group. +* +* Also note that the values are in string format, and need to be converted to the +* appropriate type before being sorted. +*/ +typedef struct RowGroupStats +{ + LeafField *leafField; + ArrayType *minMaxArray; +} RowGroupStats; + + +/* + * ExecuteCopyToCommandOnPGDuckConnection executes the given COPY TO command on * a PGDuck connection and returns a StatsCollector. */ StatsCollector * -ExecuteCopyCommandOnPGDuckConnection(char *copyCommand, - List *leafFields, - DataFileSchema * schema, - bool disablePreserveInsertionOrder, - CopyDataFormat destinationFormat) +ExecuteCopyToCommandOnPGDuckConnection(char *copyCommand, + List *leafFields, + DataFileSchema * schema, + bool disablePreserveInsertionOrder, + char *destinationPath, + CopyDataFormat destinationFormat) { PGDuckConnection *pgDuckConn = GetPGDuckConnection(); PGresult *result; @@ -70,10 +110,16 @@ ExecuteCopyCommandOnPGDuckConnection(char *copyCommand, else { char *commandTuples = PQcmdTuples(result); + int64 totalRowCount = atoll(commandTuples); + + DataFileStats *fileStats = CreateDataFileStatsForDataFile(destinationPath, + totalRowCount, + 0, + leafFields); statsCollector = palloc0(sizeof(StatsCollector)); - statsCollector->totalRowCount = atoll(commandTuples); - statsCollector->dataFileStats = NIL; + statsCollector->totalRowCount = totalRowCount; + statsCollector->dataFileStats = list_make1(fileStats); } PQclear(result); @@ -478,3 +524,697 @@ ShouldSkipStatistics(LeafField * leafField) return false; } + + +/* + * GetRemoteParquetColumnStats gets the stats for each leaf field + * in a remote Parquet file. + */ +List * +GetRemoteParquetColumnStats(char *path, List *leafFields) +{ + if (list_length(leafFields) == 0) + { + /* + * short circuit for empty list, otherwise need to adjust the below + * query + */ + return NIL; + } + + /* + * Sort the leaf fields by fieldId, and then use ORDER BY in the query to + * ensure that the results are in the same order as the input list. + */ + List *leafFieldsCopy = list_copy(leafFields); + + list_sort(leafFieldsCopy, LeafFieldCompare); + + PGDuckConnection *pgDuckConn = GetPGDuckConnection(); + + List *rowGroupStatsList = FetchRowGroupStats(pgDuckConn, leafFieldsCopy, path); + + if (list_length(rowGroupStatsList) == 0) + { + /* no stats available */ + ReleasePGDuckConnection(pgDuckConn); + return NIL; + } + + List *columnStatsList = GetFieldMinMaxStats(pgDuckConn, rowGroupStatsList); + + ReleasePGDuckConnection(pgDuckConn); + return columnStatsList; +} + + +/* +* FetchRowGroupStats fetches the statistics for the given leaf fields. +* The output is in the format of: +* field_id, ARRAY[val1, val2, val3.., valN] +* field_id, ARRAY[val1, val2, val3.., valN] +* ... +* The array values are NOT yet sorted, they are the stats_min and stats_max values +* from the parquet metadata. 
We put min and max values in the same array to because +* we want the global ordering of the values, not per row group. +* +* Also note that the values are in string format, and need to be converted to the +* appropriate type before being sorted. +* +* The output is sorted by the input fieldIdList. +*/ +static List * +FetchRowGroupStats(PGDuckConnection * pgDuckConn, List *fieldIdList, char *path) +{ + List *rowGroupStatsList = NIL; + + StringInfo query = makeStringInfo(); + + appendStringInfo(query, + + /* + * column_id_field_id_mapping: maps the column_id to the field_id for all + * the leaf fields. We come up with this mapping by checking the DuckDB + * source code, we should be careful if they ever break this assumption. + */ + "WITH column_id_field_id_mapping AS ( " + " SELECT row_number() OVER () - 1 AS column_id, field_id " + " FROM parquet_schema(%s) " + " WHERE num_children IS NULL and field_id <> " + PG_LAKE_TOSTRING(ICEBERG_ROWID_FIELD_ID) + "), " + + /* + * Fetch the parquet metadata per column_id. For each column_id, we may + * get multiple row groups, and we need to aggregate the stats_min and + * stats_max values for each column_id. + */ + "parquet_metadata AS ( " + " SELECT column_id, stats_min, stats_min_value, stats_max, stats_max_value " + " FROM parquet_metadata(%s)), " + + /* + * Now, we aggregate the stats_min and stats_max values for each + * column_id. Note that we use the coalesce function to handle the case + * where stats_min is NULL, and we use the stats_min_value instead. We + * currently don't have a good grasp on when DuckDB uses stats_min vs + * stats_min_value, so we use both. Typically both is set to the same + * value, but we want to be safe. We use the array_agg function to collect + * all the min/max values into an array, and values are not casted to the + * appropriate type yet, we create a text array. Finding min/max values + * for different data types in the same query is tricky as there is no + * support for casting to a type with a dynamic type name. So, doing it in + * two queries is easier to understand/maintain. 
+ */ + "row_group_aggs AS ( " + "SELECT c.field_id, " + " array_agg(CAST(coalesce(m.stats_min, m.stats_min_value) AS TEXT)) " + " FILTER (WHERE m.stats_min IS NOT NULL OR m.stats_min_value IS NOT NULL) || " + " array_agg(CAST(coalesce(m.stats_max, m.stats_max_value) AS TEXT)) " + " FILTER (WHERE m.stats_max IS NOT NULL OR m.stats_max_value IS NOT NULL) AS values " + "FROM column_id_field_id_mapping c " + "JOIN parquet_metadata m USING (column_id) " + "GROUP BY c.field_id) " + "SELECT field_id, values FROM row_group_aggs ORDER BY field_id;", + quote_literal_cstr(path), quote_literal_cstr(path)); + + PGresult *result = ExecuteQueryOnPGDuckConnection(pgDuckConn, query->data); + + /* throw error if anything failed */ + CheckPGDuckResult(pgDuckConn, result); + + /* make sure we PQclear the result */ + PG_TRY(); + { + int rowCount = PQntuples(result); + + for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) + { + if (PQgetisnull(result, rowIndex, 0)) + { + /* the data file doesn't have field id */ + continue; + } + + int fieldId = atoi(PQgetvalue(result, rowIndex, 0)); + LeafField *leafField = FindLeafField(fieldIdList, fieldId); + + if (leafField == NULL) + /* dropped column for external iceberg tables */ + continue; + + if (ShouldSkipStatistics(leafField)) + continue; + + char *minMaxArrayText = NULL; + + if (!PQgetisnull(result, rowIndex, 1)) + { + minMaxArrayText = pstrdup(PQgetvalue(result, rowIndex, 1)); + } + + RowGroupStats *rowGroupStats = palloc0(sizeof(RowGroupStats)); + + rowGroupStats->leafField = leafField; + rowGroupStats->minMaxArray = minMaxArrayText ? ReadArrayFromText(minMaxArrayText) : NULL; + + rowGroupStatsList = lappend(rowGroupStatsList, rowGroupStats); + } + } + PG_CATCH(); + { + PQclear(result); + PG_RE_THROW(); + } + PG_END_TRY(); + + PQclear(result); + + return rowGroupStatsList; +} + + +/* +* For the given rowGroupStatList, prepare the query to get the min and max values +* for each field. In the end, we will have a query like: +* SELECT 1, +* list_aggregate(CAST(min_max_array AS type[]), 'min') as field_1_min, +* list_aggregate(CAST(min_max_array AS type[]), 'max') as field_1_max, +* 2, +* list_aggregate(CAST(min_max_array AS type[]), 'min') as field_2_min, +* list_aggregate(CAST(min_max_array AS type[]), 'max') as field_2_max, +* ... +* We are essentially aggregating the min and max values for each field in the same query. This scales +* better than UNION ALL queries for each field. +*/ +static char * +PrepareRowGroupStatsMinMaxQuery(List *rowGroupStatList) +{ + StringInfo query = makeStringInfo(); + + ListCell *lc; + + appendStringInfo(query, "SELECT "); + + foreach(lc, rowGroupStatList) + { + RowGroupStats *rowGroupStats = lfirst(lc); + LeafField *leafField = rowGroupStats->leafField; + int fieldId = leafField->fieldId; + + if (rowGroupStats->minMaxArray != NULL) + { + char *reserializedArray = SerializeTextArrayTypeToPgDuck(rowGroupStats->minMaxArray); + + appendStringInfo(query, " %d, list_aggregate(CAST(%s AS %s[]), 'min') as field_%d_min, " + "list_aggregate(CAST(%s AS %s[]), 'max') as field_%d_min, ", + fieldId, + quote_literal_cstr(reserializedArray), leafField->duckTypeName, fieldId, + quote_literal_cstr(reserializedArray), leafField->duckTypeName, fieldId); + } + else + { + appendStringInfo(query, " %d, NULL as field_%d_min, NULL as field_%d_min, ", fieldId, fieldId, fieldId); + } + } + + return query->data; +} + + +/* +* The input array is in the format of {val1, val2, val3, ..., valN}, +* and element type is text. 
Serialize it to text in DuckDB format. +*/ +static char * +SerializeTextArrayTypeToPgDuck(ArrayType *array) +{ + Datum arrayDatum = PointerGetDatum(array); + + FmgrInfo outFunc; + Oid outFuncId = InvalidOid; + bool isvarlena = false; + + getTypeOutputInfo(TEXTARRAYOID, &outFuncId, &isvarlena); + fmgr_info(outFuncId, &outFunc); + + return PGDuckSerialize(&outFunc, TEXTARRAYOID, arrayDatum); +} + + +/* +* ReadArrayFromText reads the array from the given text. +*/ +static ArrayType * +ReadArrayFromText(char *arrayText) +{ + Oid funcOid = F_ARRAY_IN; + + FmgrInfo flinfo; + + fmgr_info(funcOid, &flinfo); + + /* array in has 3 arguments */ + LOCAL_FCINFO(fcinfo, 3); + + InitFunctionCallInfoData(*fcinfo, + &flinfo, + 3, + InvalidOid, + NULL, + NULL); + + fcinfo->args[0].value = CStringGetDatum(arrayText); + fcinfo->args[0].isnull = false; + + fcinfo->args[1].value = ObjectIdGetDatum(TEXTOID); + fcinfo->args[1].isnull = false; + + fcinfo->args[2].value = Int32GetDatum(-1); + fcinfo->args[2].isnull = false; + + Datum result = FunctionCallInvoke(fcinfo); + + if (fcinfo->isnull) + { + /* not expected given we only call this for non-null text */ + ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not reserialize text array"))); + } + + return DatumGetArrayTypeP(result); +} + +/* +* GetFieldMinMaxStats gets the min and max values for each field in the given rowGroupedStatList. +* In this function, we create a query where we first cast the minMaxArray to the appropriate type +* and then aggregate the min and max values for each field. +*/ +static List * +GetFieldMinMaxStats(PGDuckConnection * pgDuckConn, List *rowGroupStatList) +{ + char *query = PrepareRowGroupStatsMinMaxQuery(rowGroupStatList); + + PGresult *result = ExecuteQueryOnPGDuckConnection(pgDuckConn, query); + + /* throw error if anything failed */ + CheckPGDuckResult(pgDuckConn, result); + + List *columnStatsList = NIL; + +#ifdef USE_ASSERT_CHECKING + + /* + * We never omit any entries from the rowGroupStatList, and for each + * rowGroupStatList entry, we have 3 columns: fieldId, minValue and + * maxValue. 
+ */ + int rowGroupLength = list_length(rowGroupStatList); + + Assert(PQnfields(result) == rowGroupLength * 3); +#endif + + PG_TRY(); + { + for (int columnIndex = 0; columnIndex < PQnfields(result); columnIndex = columnIndex + 3) + { + DataFileColumnStats *columnStats = palloc0(sizeof(DataFileColumnStats)); + int rowGroupIndex = columnIndex / 3; + + RowGroupStats *rowGroupStats = list_nth(rowGroupStatList, rowGroupIndex); + LeafField *leafField = rowGroupStats->leafField; + +#ifdef USE_ASSERT_CHECKING + /* we use a sorted rowGroupStatList, so should be */ + int fieldId = atoi(PQgetvalue(result, 0, columnIndex)); + + Assert(leafField->fieldId == fieldId); +#endif + + columnStats->leafField = *leafField; + + int lowerBoundIndex = columnIndex + 1; + + if (!PQgetisnull(result, 0, lowerBoundIndex)) + { + /* the data file doesn't have field id */ + columnStats->lowerBoundText = pstrdup(PQgetvalue(result, 0, lowerBoundIndex)); + } + else + columnStats->lowerBoundText = NULL; + + int upperBoundIndex = columnIndex + 2; + + if (!PQgetisnull(result, 0, upperBoundIndex)) + { + /* the data file doesn't have field id */ + columnStats->upperBoundText = pstrdup(PQgetvalue(result, 0, upperBoundIndex)); + } + else + columnStats->upperBoundText = NULL; + + columnStatsList = lappend(columnStatsList, columnStats); + } + } + PG_CATCH(); + { + PQclear(result); + PG_RE_THROW(); + } + PG_END_TRY(); + + PQclear(result); + return columnStatsList; +} + + +/* + * CreateDataFileStatsForDataFile creates the data file stats for the given data file. + * It uses already calculated file level stats. And sends remote queries + * to the file to extract the column level stats if leafFields is not NIL. + */ +DataFileStats * +CreateDataFileStatsForDataFile(char *dataFilePath, int64 rowCount, int64 deletedRowCount, + List *leafFields) +{ + + List *columnStats; + + if (leafFields != NIL) + columnStats = GetRemoteParquetColumnStats(dataFilePath, leafFields); + else + columnStats = NIL; + + int64 fileSize = GetRemoteFileSize(dataFilePath); + + DataFileStats *dataFileStats = palloc0(sizeof(DataFileStats)); + + dataFileStats->dataFilePath = dataFilePath; + dataFileStats->fileSize = fileSize; + dataFileStats->rowCount = rowCount; + dataFileStats->deletedRowCount = deletedRowCount; + dataFileStats->columnStats = columnStats; + + return dataFileStats; +} + + +/* + * ApplyColumnStatsModeForAllFileStats applies the column stats mode to the given + * lower and upper bound text for all file stats. + * + * e.g. with "truncate(3)" + * "abcdef" -> lowerbound: "abc" upperbound: "abd" + * "\x010203040506" -> lowerbound: "\x010203" upperbound: "\x010204" + * + * e.g. with "full" + * "abcdef" -> lowerbound: "abcdef" upperbound: "abcdef" + * "\x010203040506" -> lowerbound: "\x010203040506" upperbound: "\x010203040506" + * + * e.g. 
with "none" + * "abcdef" -> lowerbound: NULL upperbound: NULL + * "\x010203040506" -> lowerbound: NULL upperbound: NULL + */ +void +ApplyColumnStatsModeForAllFileStats(Oid relationId, List *dataFileStats) +{ + ColumnStatsConfig columnStatsConfig = GetColumnStatsConfig(relationId); + + ListCell *dataFileStatsCell = NULL; + + foreach(dataFileStatsCell, dataFileStats) + { + DataFileStats *dataFileStats = lfirst(dataFileStatsCell); + + ListCell *columnStatsCell = NULL; + + foreach(columnStatsCell, dataFileStats->columnStats) + { + DataFileColumnStats *columnStats = lfirst(columnStatsCell); + char **lowerBoundText = &columnStats->lowerBoundText; + char **upperBoundText = &columnStats->upperBoundText; + + ApplyColumnStatsModeForType(columnStatsConfig, columnStats->leafField.pgType, lowerBoundText, upperBoundText); + } + } +} + + +/* + * GetColumnStatsConfig returns the column stats config for the given + * relation. + */ +static ColumnStatsConfig +GetColumnStatsConfig(Oid relationId) +{ + ForeignTable *foreignTable = GetForeignTable(relationId); + List *options = foreignTable->options; + DefElem *columnStatsModeOption = GetOption(options, "column_stats_mode"); + + ColumnStatsConfig config; + + /* default to truncate mode */ + if (columnStatsModeOption == NULL) + { + config.mode = COLUMN_STATS_MODE_TRUNCATE; + config.truncateLen = 16; + + return config; + } + + char *columnStatsMode = ToLowerCase(defGetString(columnStatsModeOption)); + + if (sscanf(columnStatsMode, "truncate(%zu)", &config.truncateLen) == 1) + { + config.mode = COLUMN_STATS_MODE_TRUNCATE; + if (config.truncateLen > 256) + ereport(ERROR, (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("truncate() cannot exceed 256"))); + } + else if (strcmp(columnStatsMode, "full") == 0) + { + config.mode = COLUMN_STATS_MODE_TRUNCATE; + config.truncateLen = 256; + } + else if (strcmp(columnStatsMode, "none") == 0) + { + config.mode = COLUMN_STATS_MODE_NONE; + } + else + { + /* iceberg fdw validator already validated */ + pg_unreachable(); + } + + return config; +} + + +/* + * ApplyColumnStatsModeForType applies the column stats mode to the given lower and upper + * bound text for the given pgType. + */ +static void +ApplyColumnStatsModeForType(ColumnStatsConfig columnStatsConfig, + PGType pgType, char **lowerBoundText, + char **upperBoundText) +{ + if (*lowerBoundText == NULL) + { + return; + } + + Assert(*upperBoundText != NULL); + + if (columnStatsConfig.mode == COLUMN_STATS_MODE_TRUNCATE) + { + size_t truncateLen = columnStatsConfig.truncateLen; + + /* only text and binary types can be truncated */ + if (pgType.postgresTypeOid == TEXTOID || + pgType.postgresTypeOid == VARCHAROID || + pgType.postgresTypeOid == BPCHAROID) + { + *lowerBoundText = TruncateStatsMinForText(*lowerBoundText, truncateLen); + Assert(*lowerBoundText != NULL); + + /* could be null if overflow occurred */ + *upperBoundText = TruncateStatsMaxForText(*upperBoundText, truncateLen); + } + else if (pgType.postgresTypeOid == BYTEAOID) + { + /* + * convert from text repr (e.g. 
'\x0102ef') to bytea to apply + * truncate + */ + Datum lowerBoundDatum = ColumnStatsTextToDatum(*lowerBoundText, pgType); + Datum upperBoundDatum = ColumnStatsTextToDatum(*upperBoundText, pgType); + + /* truncate bytea */ + bytea *truncatedLowerBoundBinary = TruncateStatsMinForBinary(DatumGetByteaP(lowerBoundDatum), + truncateLen); + bytea *truncatedUpperBoundBinary = TruncateStatsMaxForBinary(DatumGetByteaP(upperBoundDatum), + truncateLen); + + /* convert bytea back to text representation */ + Assert(truncatedLowerBoundBinary != NULL); + *lowerBoundText = DatumToColumnStatsText(PointerGetDatum(truncatedLowerBoundBinary), + pgType, false); + + /* could be null if overflow occurred */ + *upperBoundText = DatumToColumnStatsText(PointerGetDatum(truncatedUpperBoundBinary), + pgType, truncatedUpperBoundBinary == NULL); + } + } + else if (columnStatsConfig.mode == COLUMN_STATS_MODE_NONE) + { + *lowerBoundText = NULL; + *upperBoundText = NULL; + } + else + { + Assert(false); + } +} + + +/* + * TruncateStatsMinForText truncates the given lower bound text to the given length. + */ +static char * +TruncateStatsMinForText(char *lowerBound, size_t truncateLen) +{ + if (strlen(lowerBound) <= truncateLen) + { + return lowerBound; + } + + lowerBound[truncateLen] = '\0'; + + return lowerBound; +} + + +/* + * TruncateStatsMaxForText truncates the given upper bound text to the given length. + */ +static char * +TruncateStatsMaxForText(char *upperBound, size_t truncateLen) +{ + if (strlen(upperBound) <= truncateLen) + { + return upperBound; + } + + upperBound[truncateLen] = '\0'; + + /* + * increment the last byte of the upper bound, which does not overflow. If + * not found, return null. + */ + for (int i = truncateLen - 1; i >= 0; i--) + { + /* check if overflows max ascii char */ + /* todo: how to handle utf8 or different encoding? */ + if (upperBound[i] != INT8_MAX) + { + upperBound[i]++; + return upperBound; + } + } + + return NULL; +} + + +/* + * TruncateStatsMinForBinary truncates the given lower bound binary to the given length. + */ +static bytea * +TruncateStatsMinForBinary(bytea *lowerBound, size_t truncateLen) +{ + size_t lowerBoundLen = VARSIZE_ANY_EXHDR(lowerBound); + + if (lowerBoundLen <= truncateLen) + { + return lowerBound; + } + + bytea *truncatedLowerBound = palloc0(truncateLen + VARHDRSZ); + + SET_VARSIZE(truncatedLowerBound, truncateLen + VARHDRSZ); + memcpy(VARDATA_ANY(truncatedLowerBound), VARDATA_ANY(lowerBound), truncateLen); + + return truncatedLowerBound; +} + + +/* + * TruncateStatsMaxForBinary truncates the given upper bound binary to the given length. + */ +static bytea * +TruncateStatsMaxForBinary(bytea *upperBound, size_t truncateLen) +{ + size_t upperBoundLen = VARSIZE_ANY_EXHDR(upperBound); + + if (upperBoundLen <= truncateLen) + { + return upperBound; + } + + bytea *truncatedUpperBound = palloc0(truncateLen + VARHDRSZ); + + SET_VARSIZE(truncatedUpperBound, truncateLen + VARHDRSZ); + memcpy(VARDATA_ANY(truncatedUpperBound), VARDATA_ANY(upperBound), truncateLen); + + /* + * increment the last byte of the upper bound, which does not overflow. If + * not found, return null. + */ + for (int i = truncateLen - 1; i >= 0; i--) + { + /* check if overflows max byte */ + if ((unsigned char) VARDATA_ANY(truncatedUpperBound)[i] != UINT8_MAX) + { + VARDATA_ANY(truncatedUpperBound)[i]++; + return truncatedUpperBound; + } + } + + return NULL; +} + + +/* + * ColumnStatsTextToDatum converts the given text to Datum for the given pgType. 
+ */ +static Datum +ColumnStatsTextToDatum(char *text, PGType pgType) +{ + Oid typoinput; + Oid typioparam; + + getTypeInputInfo(pgType.postgresTypeOid, &typoinput, &typioparam); + + return OidInputFunctionCall(typoinput, text, typioparam, -1); +} + + +/* + * DatumToColumnStatsText converts the given datum to text for the given pgType. + */ +static char * +DatumToColumnStatsText(Datum datum, PGType pgType, bool isNull) +{ + if (isNull) + { + return NULL; + } + + Oid typoutput; + bool typIsVarlena; + + getTypeOutputInfo(pgType.postgresTypeOid, &typoutput, &typIsVarlena); + + return OidOutputFunctionCall(typoutput, datum); +} diff --git a/pg_lake_engine/src/pgduck/delete_data.c b/pg_lake_engine/src/pgduck/delete_data.c index 0e4071c9..ffea2ef0 100644 --- a/pg_lake_engine/src/pgduck/delete_data.c +++ b/pg_lake_engine/src/pgduck/delete_data.c @@ -97,11 +97,12 @@ PerformDeleteFromParquet(char *sourcePath, /* end WITH options */ appendStringInfoString(&command, ")"); - return ExecuteCopyCommandOnPGDuckConnection(command.data, - leafFields, - schema, - false, - DATA_FORMAT_PARQUET); + return ExecuteCopyToCommandOnPGDuckConnection(command.data, + leafFields, + schema, + false, + destinationPath, + DATA_FORMAT_PARQUET); } diff --git a/pg_lake_engine/src/pgduck/write_data.c b/pg_lake_engine/src/pgduck/write_data.c index 0bd01eae..96390430 100644 --- a/pg_lake_engine/src/pgduck/write_data.c +++ b/pg_lake_engine/src/pgduck/write_data.c @@ -394,11 +394,12 @@ WriteQueryResultTo(char *query, bool disablePreserveInsertionOrder = TargetRowGroupSizeMB > 0; - return ExecuteCopyCommandOnPGDuckConnection(command.data, - leafFields, - schema, - disablePreserveInsertionOrder, - destinationFormat); + return ExecuteCopyToCommandOnPGDuckConnection(command.data, + leafFields, + schema, + disablePreserveInsertionOrder, + destinationPath, + destinationFormat); } diff --git a/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h b/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h index e0599c9f..ce5c73d5 100644 --- a/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h +++ b/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h @@ -34,4 +34,3 @@ extern PGDLLEXPORT const char *IcebergTypeNameToDuckdbTypeName(const char *icebe extern PGDLLEXPORT DataFileSchema * CreatePositionDeleteDataFileSchema(void); extern PGDLLEXPORT const char *GetIcebergJsonSerializedDefaultExpr(TupleDesc tupdesc, AttrNumber attnum, FieldStructElement * structElementField); -extern PGDLLEXPORT List *GetRemoteParquetColumnStats(char *path, List *leafFields); diff --git a/pg_lake_iceberg/src/iceberg/iceberg_field.c b/pg_lake_iceberg/src/iceberg/iceberg_field.c index d792ceef..6431f273 100644 --- a/pg_lake_iceberg/src/iceberg/iceberg_field.c +++ b/pg_lake_iceberg/src/iceberg/iceberg_field.c @@ -100,23 +100,6 @@ typedef struct IcebergToDuckDBType } IcebergToDuckDBType; -/* -* The output is in the format of: -* field_id, ARRAY[val1, val2, val3.., valN] -* -* The array values are NOT yet sorted, they are the stats_min and stats_max values -* from the parquet metadata. We put min and max values in the same array to because -* we want the global ordering of the values, not per row group. -* -* Also note that the values are in string format, and need to be converted to the -* appropriate type before being sorted. 
-*/ -typedef struct RowGroupStats -{ - LeafField *leafField; - ArrayType *minMaxArray; -} RowGroupStats; - static IcebergToDuckDBType IcebergToDuckDBTypes[] = { { @@ -176,11 +159,6 @@ static DuckDBType GetDuckDBTypeFromIcebergType(IcebergType icebergType); static char *PostgresBaseTypeIdToIcebergTypeName(PGType pgType); static IcebergTypeInfo * GetIcebergTypeInfoFromTypeName(const char *typeName); static const char *GetIcebergJsonSerializedConstDefaultIfExists(const char *attrName, Field * field, Node *defaultExpr); -static List *FetchRowGroupStats(PGDuckConnection * pgDuckConn, List *fieldIdList, char *path); -static char *PrepareRowGroupStatsMinMaxQuery(List *rowGroupStatList); -static char *SerializeTextArrayTypeToPgDuck(ArrayType *array); -static ArrayType *ReadArrayFromText(char *arrayText); -static List *GetFieldMinMaxStats(PGDuckConnection * pgDuckConn, List *rowGroupStatsList); /* @@ -895,365 +873,3 @@ EnsureIcebergField(Field * field) #endif } - - -/* - * GetRemoteParquetColumnStats gets the stats for each leaf field - * in a remote Parquet file. - */ -List * -GetRemoteParquetColumnStats(char *path, List *leafFields) -{ - if (list_length(leafFields) == 0) - { - /* - * short circuit for empty list, otherwise need to adjust the below - * query - */ - return NIL; - } - - /* - * Sort the leaf fields by fieldId, and then use ORDER BY in the query to - * ensure that the results are in the same order as the input list. - */ - List *leafFieldsCopy = list_copy(leafFields); - - list_sort(leafFieldsCopy, LeafFieldCompare); - - PGDuckConnection *pgDuckConn = GetPGDuckConnection(); - - List *rowGroupStatsList = FetchRowGroupStats(pgDuckConn, leafFieldsCopy, path); - - if (list_length(rowGroupStatsList) == 0) - { - /* no stats available */ - ReleasePGDuckConnection(pgDuckConn); - return NIL; - } - - List *columnStatsList = GetFieldMinMaxStats(pgDuckConn, rowGroupStatsList); - - ReleasePGDuckConnection(pgDuckConn); - return columnStatsList; -} - - -/* -* FetchRowGroupStats fetches the statistics for the given leaf fields. -* The output is in the format of: -* field_id, ARRAY[val1, val2, val3.., valN] -* field_id, ARRAY[val1, val2, val3.., valN] -* ... -* The array values are NOT yet sorted, they are the stats_min and stats_max values -* from the parquet metadata. We put min and max values in the same array to because -* we want the global ordering of the values, not per row group. -* -* Also note that the values are in string format, and need to be converted to the -* appropriate type before being sorted. -* -* The output is sorted by the input fieldIdList. -*/ -static List * -FetchRowGroupStats(PGDuckConnection * pgDuckConn, List *fieldIdList, char *path) -{ - List *rowGroupStatsList = NIL; - - StringInfo query = makeStringInfo(); - - appendStringInfo(query, - - /* - * column_id_field_id_mapping: maps the column_id to the field_id for all - * the leaf fields. We come up with this mapping by checking the DuckDB - * source code, we should be careful if they ever break this assumption. - */ - "WITH column_id_field_id_mapping AS ( " - " SELECT row_number() OVER () - 1 AS column_id, field_id " - " FROM parquet_schema(%s) " - " WHERE num_children IS NULL and field_id <> " - PG_LAKE_TOSTRING(ICEBERG_ROWID_FIELD_ID) - "), " - - /* - * Fetch the parquet metadata per column_id. For each column_id, we may - * get multiple row groups, and we need to aggregate the stats_min and - * stats_max values for each column_id. 
- */ - "parquet_metadata AS ( " - " SELECT column_id, stats_min, stats_min_value, stats_max, stats_max_value " - " FROM parquet_metadata(%s)), " - - /* - * Now, we aggregate the stats_min and stats_max values for each - * column_id. Note that we use the coalesce function to handle the case - * where stats_min is NULL, and we use the stats_min_value instead. We - * currently don't have a good grasp on when DuckDB uses stats_min vs - * stats_min_value, so we use both. Typically both is set to the same - * value, but we want to be safe. We use the array_agg function to collect - * all the min/max values into an array, and values are not casted to the - * appropriate type yet, we create a text array. Finding min/max values - * for different data types in the same query is tricky as there is no - * support for casting to a type with a dynamic type name. So, doing it in - * two queries is easier to understand/maintain. - */ - "row_group_aggs AS ( " - "SELECT c.field_id, " - " array_agg(CAST(coalesce(m.stats_min, m.stats_min_value) AS TEXT)) " - " FILTER (WHERE m.stats_min IS NOT NULL OR m.stats_min_value IS NOT NULL) || " - " array_agg(CAST(coalesce(m.stats_max, m.stats_max_value) AS TEXT)) " - " FILTER (WHERE m.stats_max IS NOT NULL OR m.stats_max_value IS NOT NULL) AS values " - "FROM column_id_field_id_mapping c " - "JOIN parquet_metadata m USING (column_id) " - "GROUP BY c.field_id) " - "SELECT field_id, values FROM row_group_aggs ORDER BY field_id;", - quote_literal_cstr(path), quote_literal_cstr(path)); - - PGresult *result = ExecuteQueryOnPGDuckConnection(pgDuckConn, query->data); - - /* throw error if anything failed */ - CheckPGDuckResult(pgDuckConn, result); - - /* make sure we PQclear the result */ - PG_TRY(); - { - int rowCount = PQntuples(result); - - for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) - { - if (PQgetisnull(result, rowIndex, 0)) - { - /* the data file doesn't have field id */ - continue; - } - - int fieldId = atoi(PQgetvalue(result, rowIndex, 0)); - LeafField *leafField = FindLeafField(fieldIdList, fieldId); - - if (leafField == NULL) - /* dropped column for external iceberg tables */ - continue; - - if (ShouldSkipStatistics(leafField)) - continue; - - char *minMaxArrayText = NULL; - - if (!PQgetisnull(result, rowIndex, 1)) - { - minMaxArrayText = pstrdup(PQgetvalue(result, rowIndex, 1)); - } - - RowGroupStats *rowGroupStats = palloc0(sizeof(RowGroupStats)); - - rowGroupStats->leafField = leafField; - rowGroupStats->minMaxArray = minMaxArrayText ? ReadArrayFromText(minMaxArrayText) : NULL; - - rowGroupStatsList = lappend(rowGroupStatsList, rowGroupStats); - } - } - PG_CATCH(); - { - PQclear(result); - PG_RE_THROW(); - } - PG_END_TRY(); - - PQclear(result); - - return rowGroupStatsList; -} - - -/* -* For the given rowGroupStatList, prepare the query to get the min and max values -* for each field. In the end, we will have a query like: -* SELECT 1, -* list_aggregate(CAST(min_max_array AS type[]), 'min') as field_1_min, -* list_aggregate(CAST(min_max_array AS type[]), 'max') as field_1_max, -* 2, -* list_aggregate(CAST(min_max_array AS type[]), 'min') as field_2_min, -* list_aggregate(CAST(min_max_array AS type[]), 'max') as field_2_max, -* ... -* We are essentially aggregating the min and max values for each field in the same query. This scales -* better than UNION ALL queries for each field. 
-*/ -static char * -PrepareRowGroupStatsMinMaxQuery(List *rowGroupStatList) -{ - StringInfo query = makeStringInfo(); - - ListCell *lc; - - appendStringInfo(query, "SELECT "); - - foreach(lc, rowGroupStatList) - { - RowGroupStats *rowGroupStats = lfirst(lc); - LeafField *leafField = rowGroupStats->leafField; - int fieldId = leafField->fieldId; - - if (rowGroupStats->minMaxArray != NULL) - { - char *reserializedArray = SerializeTextArrayTypeToPgDuck(rowGroupStats->minMaxArray); - - appendStringInfo(query, " %d, list_aggregate(CAST(%s AS %s[]), 'min') as field_%d_min, " - "list_aggregate(CAST(%s AS %s[]), 'max') as field_%d_min, ", - fieldId, - quote_literal_cstr(reserializedArray), leafField->duckTypeName, fieldId, - quote_literal_cstr(reserializedArray), leafField->duckTypeName, fieldId); - } - else - { - appendStringInfo(query, " %d, NULL as field_%d_min, NULL as field_%d_min, ", fieldId, fieldId, fieldId); - } - } - - return query->data; -} - - -/* -* The input array is in the format of {val1, val2, val3, ..., valN}, -* and element type is text. Serialize it to text in DuckDB format. -*/ -static char * -SerializeTextArrayTypeToPgDuck(ArrayType *array) -{ - Datum arrayDatum = PointerGetDatum(array); - - FmgrInfo outFunc; - Oid outFuncId = InvalidOid; - bool isvarlena = false; - - getTypeOutputInfo(TEXTARRAYOID, &outFuncId, &isvarlena); - fmgr_info(outFuncId, &outFunc); - - return PGDuckSerialize(&outFunc, TEXTARRAYOID, arrayDatum); -} - - -/* -* ReadArrayFromText reads the array from the given text. -*/ -static ArrayType * -ReadArrayFromText(char *arrayText) -{ - Oid funcOid = F_ARRAY_IN; - - FmgrInfo flinfo; - - fmgr_info(funcOid, &flinfo); - - /* array in has 3 arguments */ - LOCAL_FCINFO(fcinfo, 3); - - InitFunctionCallInfoData(*fcinfo, - &flinfo, - 3, - InvalidOid, - NULL, - NULL); - - fcinfo->args[0].value = CStringGetDatum(arrayText); - fcinfo->args[0].isnull = false; - - fcinfo->args[1].value = ObjectIdGetDatum(TEXTOID); - fcinfo->args[1].isnull = false; - - fcinfo->args[2].value = Int32GetDatum(-1); - fcinfo->args[2].isnull = false; - - Datum result = FunctionCallInvoke(fcinfo); - - if (fcinfo->isnull) - { - /* not expected given we only call this for non-null text */ - ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("could not reserialize text array"))); - } - - return DatumGetArrayTypeP(result); -} - -/* -* GetFieldMinMaxStats gets the min and max values for each field in the given rowGroupedStatList. -* In this function, we create a query where we first cast the minMaxArray to the appropriate type -* and then aggregate the min and max values for each field. -*/ -static List * -GetFieldMinMaxStats(PGDuckConnection * pgDuckConn, List *rowGroupStatList) -{ - char *query = PrepareRowGroupStatsMinMaxQuery(rowGroupStatList); - - PGresult *result = ExecuteQueryOnPGDuckConnection(pgDuckConn, query); - - /* throw error if anything failed */ - CheckPGDuckResult(pgDuckConn, result); - - List *columnStatsList = NIL; - -#ifdef USE_ASSERT_CHECKING - - /* - * We never omit any entries from the rowGroupStatList, and for each - * rowGroupStatList entry, we have 3 columns: fieldId, minValue and - * maxValue. 
- */ - int rowGroupLength = list_length(rowGroupStatList); - - Assert(PQnfields(result) == rowGroupLength * 3); -#endif - - PG_TRY(); - { - for (int columnIndex = 0; columnIndex < PQnfields(result); columnIndex = columnIndex + 3) - { - DataFileColumnStats *columnStats = palloc0(sizeof(DataFileColumnStats)); - int rowGroupIndex = columnIndex / 3; - - RowGroupStats *rowGroupStats = list_nth(rowGroupStatList, rowGroupIndex); - LeafField *leafField = rowGroupStats->leafField; - -#ifdef USE_ASSERT_CHECKING - /* we use a sorted rowGroupStatList, so should be */ - int fieldId = atoi(PQgetvalue(result, 0, columnIndex)); - - Assert(leafField->fieldId == fieldId); -#endif - - columnStats->leafField = *leafField; - - int lowerBoundIndex = columnIndex + 1; - - if (!PQgetisnull(result, 0, lowerBoundIndex)) - { - /* the data file doesn't have field id */ - columnStats->lowerBoundText = pstrdup(PQgetvalue(result, 0, lowerBoundIndex)); - } - else - columnStats->lowerBoundText = NULL; - - int upperBoundIndex = columnIndex + 2; - - if (!PQgetisnull(result, 0, upperBoundIndex)) - { - /* the data file doesn't have field id */ - columnStats->upperBoundText = pstrdup(PQgetvalue(result, 0, upperBoundIndex)); - } - else - columnStats->upperBoundText = NULL; - - columnStatsList = lappend(columnStatsList, columnStats); - } - } - PG_CATCH(); - { - PQclear(result); - PG_RE_THROW(); - } - PG_END_TRY(); - - PQclear(result); - return columnStatsList; -} diff --git a/pg_lake_table/include/pg_lake/fdw/data_file_stats.h b/pg_lake_table/include/pg_lake/fdw/data_file_stats.h deleted file mode 100644 index 442c6835..00000000 --- a/pg_lake_table/include/pg_lake/fdw/data_file_stats.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright 2025 Snowflake Inc. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "postgres.h" - -#include "pg_lake/data_file/data_file_stats.h" -#include "pg_lake/data_file/data_files.h" - -/* - * ColumnStatsMode describes the mode of column stats. - * - When truncate mode (default) is used, the column stats are truncated - * to the given length. - * - When none mode is used, the column stats are not collected. - */ -typedef enum ColumnStatsMode -{ - COLUMN_STATS_MODE_TRUNCATE = 0, - COLUMN_STATS_MODE_NONE = 1, -} ColumnStatsMode; - -/* - * ColumnStatsConfig describes the configuration for column stats. - * - mode: the mode of column stats. - * - truncateLen: the length to truncate the column stats in truncate mode. 
- */ -typedef struct ColumnStatsConfig -{ - ColumnStatsMode mode; - - /* used for truncate mode */ - size_t truncateLen; -} ColumnStatsConfig; - -extern PGDLLEXPORT DataFileStats * CreateDataFileStatsForTable(Oid relationId, char *dataFilePath, - int64 rowCount, int64 deletedRowCount, - DataFileContent content); -extern PGDLLEXPORT DataFileColumnStats * CreateDataFileColumnStats(int fieldId, PGType pgType, - char *lowerBoundText, - char *upperBoundText); -extern PGDLLEXPORT void ApplyColumnStatsMode(Oid relationId, List *columnStats); -extern PGDLLEXPORT void ApplyColumnStatsModeForAllFileStats(Oid relationId, List *dataFileStats); diff --git a/pg_lake_table/src/fdw/data_file_stats.c b/pg_lake_table/src/fdw/data_file_stats.c deleted file mode 100644 index 174074b2..00000000 --- a/pg_lake_table/src/fdw/data_file_stats.c +++ /dev/null @@ -1,429 +0,0 @@ -/* - * Copyright 2025 Snowflake Inc. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "postgres.h" - -#include "pg_lake/fdw/data_file_stats.h" -#include "pg_lake/fdw/schema_operations/register_field_ids.h" -#include "pg_lake/parsetree/options.h" -#include "pg_lake/pgduck/remote_storage.h" -#include "pg_lake/util/rel_utils.h" - -#include "commands/defrem.h" -#include "foreign/foreign.h" -#include "utils/lsyscache.h" - - -static ColumnStatsConfig GetColumnStatsConfig(Oid relationId); -static void ApplyColumnStatsModeForType(ColumnStatsConfig columnStatsConfig, - PGType pgType, char **lowerBoundText, - char **upperBoundText); -static char *TruncateStatsMinForText(char *lowerBound, size_t truncateLen); -static char *TruncateStatsMaxForText(char *upperBound, size_t truncateLen); -static bytea *TruncateStatsMinForBinary(bytea *lowerBound, size_t truncateLen); -static bytea *TruncateStatsMaxForBinary(bytea *upperBound, size_t truncateLen); -static Datum ColumnStatsTextToDatum(char *text, PGType pgType); -static char *DatumToColumnStatsText(Datum datum, PGType pgType, bool isNull); - -/* - * CreateDataFileStatsForTable creates the data file stats for the given table's - * data file. It uses already calculated file level stats. And sends remote queries - * to the file to extract the column level stats. 
- */ -DataFileStats * -CreateDataFileStatsForTable(Oid relationId, char *dataFilePath, int64 rowCount, - int64 deletedRowCount, DataFileContent content) -{ - PgLakeTableProperties properties = GetPgLakeTableProperties(relationId); - - List *columnStats; - - if (properties.tableType == PG_LAKE_ICEBERG_TABLE_TYPE && content == CONTENT_DATA) - { - List *leafFields = GetLeafFieldsForTable(relationId); - - columnStats = GetRemoteParquetColumnStats(dataFilePath, leafFields); - - ApplyColumnStatsMode(relationId, columnStats); - } - else - { - columnStats = NIL; - } - - int64 fileSize = GetRemoteFileSize(dataFilePath); - - DataFileStats *dataFileStats = palloc0(sizeof(DataFileStats)); - - dataFileStats->fileSize = fileSize; - dataFileStats->rowCount = rowCount; - dataFileStats->deletedRowCount = deletedRowCount; - dataFileStats->columnStats = columnStats; - - return dataFileStats; -} - - -/* - * CreateDataFileColumnStats creates a new DataFileColumnStats from the given - * parameters. - */ -DataFileColumnStats * -CreateDataFileColumnStats(int fieldId, PGType pgType, char *lowerBoundText, char *upperBoundText) -{ - DataFileColumnStats *columnStats = palloc0(sizeof(DataFileColumnStats)); - - columnStats->leafField.fieldId = fieldId; - columnStats->lowerBoundText = lowerBoundText; - columnStats->upperBoundText = upperBoundText; - columnStats->leafField.pgType = pgType; - - bool forAddColumn = false; - int subFieldIndex = fieldId; - - Field *field = PostgresTypeToIcebergField(pgType, forAddColumn, &subFieldIndex); - - Assert(field->type == FIELD_TYPE_SCALAR); - - columnStats->leafField.field = field; - - const char *duckTypeName = IcebergTypeNameToDuckdbTypeName(field->field.scalar.typeName); - - columnStats->leafField.duckTypeName = duckTypeName; - - return columnStats; -} - - -/* - * ApplyColumnStatsMode applies the column stats mode to the given lower and upper - * bound text. - * - * e.g. with "truncate(3)" - * "abcdef" -> lowerbound: "abc" upperbound: "abd" - * "\x010203040506" -> lowerbound: "\x010203" upperbound: "\x010204" - * - * e.g. with "full" - * "abcdef" -> lowerbound: "abcdef" upperbound: "abcdef" - * "\x010203040506" -> lowerbound: "\x010203040506" upperbound: "\x010203040506" - * - * e.g. 
with "none" - * "abcdef" -> lowerbound: NULL upperbound: NULL - * "\x010203040506" -> lowerbound: NULL upperbound: NULL - */ -void -ApplyColumnStatsMode(Oid relationId, List *columnStats) -{ - ColumnStatsConfig columnStatsConfig = GetColumnStatsConfig(relationId); - - ListCell *columnStatsCell = NULL; - - foreach(columnStatsCell, columnStats) - { - DataFileColumnStats *columnStats = lfirst(columnStatsCell); - - char **lowerBoundText = &columnStats->lowerBoundText; - char **upperBoundText = &columnStats->upperBoundText; - - ApplyColumnStatsModeForType(columnStatsConfig, columnStats->leafField.pgType, lowerBoundText, upperBoundText); - } -} - - -void -ApplyColumnStatsModeForAllFileStats(Oid relationId, List *dataFileStats) -{ - ColumnStatsConfig columnStatsConfig = GetColumnStatsConfig(relationId); - - ListCell *dataFileStatsCell = NULL; - - foreach(dataFileStatsCell, dataFileStats) - { - DataFileStats *dataFileStats = lfirst(dataFileStatsCell); - - ListCell *columnStatsCell = NULL; - - foreach(columnStatsCell, dataFileStats->columnStats) - { - DataFileColumnStats *columnStats = lfirst(columnStatsCell); - char **lowerBoundText = &columnStats->lowerBoundText; - char **upperBoundText = &columnStats->upperBoundText; - - ApplyColumnStatsModeForType(columnStatsConfig, columnStats->leafField.pgType, lowerBoundText, upperBoundText); - } - } -} - - -/* - * GetColumnStatsConfig returns the column stats config for the given - * relation. - */ -static ColumnStatsConfig -GetColumnStatsConfig(Oid relationId) -{ - ForeignTable *foreignTable = GetForeignTable(relationId); - List *options = foreignTable->options; - DefElem *columnStatsModeOption = GetOption(options, "column_stats_mode"); - - ColumnStatsConfig config; - - /* default to truncate mode */ - if (columnStatsModeOption == NULL) - { - config.mode = COLUMN_STATS_MODE_TRUNCATE; - config.truncateLen = 16; - - return config; - } - - char *columnStatsMode = ToLowerCase(defGetString(columnStatsModeOption)); - - if (sscanf(columnStatsMode, "truncate(%zu)", &config.truncateLen) == 1) - { - config.mode = COLUMN_STATS_MODE_TRUNCATE; - if (config.truncateLen > 256) - ereport(ERROR, (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("truncate() cannot exceed 256"))); - } - else if (strcmp(columnStatsMode, "full") == 0) - { - config.mode = COLUMN_STATS_MODE_TRUNCATE; - config.truncateLen = 256; - } - else if (strcmp(columnStatsMode, "none") == 0) - { - config.mode = COLUMN_STATS_MODE_NONE; - } - else - { - /* iceberg fdw validator already validated */ - pg_unreachable(); - } - - return config; -} - - -/* - * ApplyColumnStatsModeForType applies the column stats mode to the given lower and upper - * bound text for the given pgType. 
- */ -static void -ApplyColumnStatsModeForType(ColumnStatsConfig columnStatsConfig, - PGType pgType, char **lowerBoundText, - char **upperBoundText) -{ - if (*lowerBoundText == NULL) - { - return; - } - - Assert(*upperBoundText != NULL); - - if (columnStatsConfig.mode == COLUMN_STATS_MODE_TRUNCATE) - { - size_t truncateLen = columnStatsConfig.truncateLen; - - /* only text and binary types can be truncated */ - if (pgType.postgresTypeOid == TEXTOID || - pgType.postgresTypeOid == VARCHAROID || - pgType.postgresTypeOid == BPCHAROID) - { - *lowerBoundText = TruncateStatsMinForText(*lowerBoundText, truncateLen); - Assert(*lowerBoundText != NULL); - - /* could be null if overflow occurred */ - *upperBoundText = TruncateStatsMaxForText(*upperBoundText, truncateLen); - } - else if (pgType.postgresTypeOid == BYTEAOID) - { - /* - * convert from text repr (e.g. '\x0102ef') to bytea to apply - * truncate - */ - Datum lowerBoundDatum = ColumnStatsTextToDatum(*lowerBoundText, pgType); - Datum upperBoundDatum = ColumnStatsTextToDatum(*upperBoundText, pgType); - - /* truncate bytea */ - bytea *truncatedLowerBoundBinary = TruncateStatsMinForBinary(DatumGetByteaP(lowerBoundDatum), - truncateLen); - bytea *truncatedUpperBoundBinary = TruncateStatsMaxForBinary(DatumGetByteaP(upperBoundDatum), - truncateLen); - - /* convert bytea back to text representation */ - Assert(truncatedLowerBoundBinary != NULL); - *lowerBoundText = DatumToColumnStatsText(PointerGetDatum(truncatedLowerBoundBinary), - pgType, false); - - /* could be null if overflow occurred */ - *upperBoundText = DatumToColumnStatsText(PointerGetDatum(truncatedUpperBoundBinary), - pgType, truncatedUpperBoundBinary == NULL); - } - } - else if (columnStatsConfig.mode == COLUMN_STATS_MODE_NONE) - { - *lowerBoundText = NULL; - *upperBoundText = NULL; - } - else - { - Assert(false); - } -} - - -/* - * TruncateStatsMinForText truncates the given lower bound text to the given length. - */ -static char * -TruncateStatsMinForText(char *lowerBound, size_t truncateLen) -{ - if (strlen(lowerBound) <= truncateLen) - { - return lowerBound; - } - - lowerBound[truncateLen] = '\0'; - - return lowerBound; -} - - -/* - * TruncateStatsMaxForText truncates the given upper bound text to the given length. - */ -static char * -TruncateStatsMaxForText(char *upperBound, size_t truncateLen) -{ - if (strlen(upperBound) <= truncateLen) - { - return upperBound; - } - - upperBound[truncateLen] = '\0'; - - /* - * increment the last byte of the upper bound, which does not overflow. If - * not found, return null. - */ - for (int i = truncateLen - 1; i >= 0; i--) - { - /* check if overflows max ascii char */ - /* todo: how to handle utf8 or different encoding? */ - if (upperBound[i] != INT8_MAX) - { - upperBound[i]++; - return upperBound; - } - } - - return NULL; -} - - -/* - * TruncateStatsMinForBinary truncates the given lower bound binary to the given length. - */ -static bytea * -TruncateStatsMinForBinary(bytea *lowerBound, size_t truncateLen) -{ - size_t lowerBoundLen = VARSIZE_ANY_EXHDR(lowerBound); - - if (lowerBoundLen <= truncateLen) - { - return lowerBound; - } - - bytea *truncatedLowerBound = palloc0(truncateLen + VARHDRSZ); - - SET_VARSIZE(truncatedLowerBound, truncateLen + VARHDRSZ); - memcpy(VARDATA_ANY(truncatedLowerBound), VARDATA_ANY(lowerBound), truncateLen); - - return truncatedLowerBound; -} - - -/* - * TruncateStatsMaxForBinary truncates the given upper bound binary to the given length. 
- */ -static bytea * -TruncateStatsMaxForBinary(bytea *upperBound, size_t truncateLen) -{ - size_t upperBoundLen = VARSIZE_ANY_EXHDR(upperBound); - - if (upperBoundLen <= truncateLen) - { - return upperBound; - } - - bytea *truncatedUpperBound = palloc0(truncateLen + VARHDRSZ); - - SET_VARSIZE(truncatedUpperBound, truncateLen + VARHDRSZ); - memcpy(VARDATA_ANY(truncatedUpperBound), VARDATA_ANY(upperBound), truncateLen); - - /* - * increment the last byte of the upper bound, which does not overflow. If - * not found, return null. - */ - for (int i = truncateLen - 1; i >= 0; i--) - { - /* check if overflows max byte */ - if ((unsigned char) VARDATA_ANY(truncatedUpperBound)[i] != UINT8_MAX) - { - VARDATA_ANY(truncatedUpperBound)[i]++; - return truncatedUpperBound; - } - } - - return NULL; -} - - -/* - * ColumnStatsTextToDatum converts the given text to Datum for the given pgType. - */ -static Datum -ColumnStatsTextToDatum(char *text, PGType pgType) -{ - Oid typoinput; - Oid typioparam; - - getTypeInputInfo(pgType.postgresTypeOid, &typoinput, &typioparam); - - return OidInputFunctionCall(typoinput, text, typioparam, -1); -} - - -/* - * DatumToColumnStatsText converts the given datum to text for the given pgType. - */ -static char * -DatumToColumnStatsText(Datum datum, PGType pgType, bool isNull) -{ - if (isNull) - { - return NULL; - } - - Oid typoutput; - bool typIsVarlena; - - getTypeOutputInfo(pgType.postgresTypeOid, &typoutput, &typIsVarlena); - - return OidOutputFunctionCall(typoutput, datum); -} diff --git a/pg_lake_table/src/fdw/data_file_stats_catalog.c b/pg_lake_table/src/fdw/data_file_stats_catalog.c index d2de5ddf..2cf643fb 100644 --- a/pg_lake_table/src/fdw/data_file_stats_catalog.c +++ b/pg_lake_table/src/fdw/data_file_stats_catalog.c @@ -21,10 +21,10 @@ #include "miscadmin.h" #include "pg_lake/data_file/data_files.h" +#include "pg_lake/data_file/data_file_stats.h" #include "pg_lake/extensions/pg_lake_table.h" #include "pg_lake/fdw/data_files_catalog.h" #include "pg_lake/fdw/data_file_stats_catalog.h" -#include "pg_lake/fdw/data_file_stats.h" #include "pg_lake/util/spi_helpers.h" #include "catalog/namespace.h" diff --git a/pg_lake_table/src/fdw/data_files_catalog.c b/pg_lake_table/src/fdw/data_files_catalog.c index a6103d12..f1533211 100644 --- a/pg_lake_table/src/fdw/data_files_catalog.c +++ b/pg_lake_table/src/fdw/data_files_catalog.c @@ -32,9 +32,7 @@ #include "pg_lake/extensions/extension_ids.h" #include "pg_lake/extensions/pg_lake_engine.h" #include "pg_lake/fdw/catalog/row_id_mappings.h" -#include "pg_lake/fdw/data_file_stats.h" #include "pg_lake/fdw/data_files_catalog.h" -#include "pg_lake/fdw/data_file_stats.h" #include "pg_lake/fdw/data_file_stats_catalog.h" #include "pg_lake/fdw/schema_operations/field_id_mapping_catalog.h" #include "pg_lake/fdw/writable_table.h" @@ -96,7 +94,9 @@ static bool ColumnStatAlreadyAdded(List *columnStats, int64 fieldId); static bool PartitionFieldAlreadyAdded(Partition * partition, int64 fieldId); static void CreateTxDataFileIdsTempTableIfNotExists(void); static void InsertDataFileIdIntoTransactionTable(int64 fileId); - +static DataFileColumnStats * CreateDataFileColumnStats(int fieldId, PGType pgType, + char *lowerBoundText, + char *upperBoundText); /* * GetTableDataFilesFromCatalog returns a list of TableDataFile for each data and deletion file @@ -1365,3 +1365,34 @@ AddDataFilePartitionValueToCatalog(Oid relationId, int32 partitionSpecId, int64 SetUserIdAndSecContext(savedUserId, savedSecurityContext); } + + +/* + * CreateDataFileColumnStats 
creates a new DataFileColumnStats from the given + * parameters. + */ +static DataFileColumnStats * +CreateDataFileColumnStats(int fieldId, PGType pgType, char *lowerBoundText, char *upperBoundText) +{ + DataFileColumnStats *columnStats = palloc0(sizeof(DataFileColumnStats)); + + columnStats->leafField.fieldId = fieldId; + columnStats->lowerBoundText = lowerBoundText; + columnStats->upperBoundText = upperBoundText; + columnStats->leafField.pgType = pgType; + + bool forAddColumn = false; + int subFieldIndex = fieldId; + + Field *field = PostgresTypeToIcebergField(pgType, forAddColumn, &subFieldIndex); + + Assert(field->type == FIELD_TYPE_SCALAR); + + columnStats->leafField.field = field; + + const char *duckTypeName = IcebergTypeNameToDuckdbTypeName(field->field.scalar.typeName); + + columnStats->leafField.duckTypeName = duckTypeName; + + return columnStats; +} diff --git a/pg_lake_table/src/fdw/multi_data_file_dest.c b/pg_lake_table/src/fdw/multi_data_file_dest.c index 16e88880..8765c6fb 100644 --- a/pg_lake_table/src/fdw/multi_data_file_dest.c +++ b/pg_lake_table/src/fdw/multi_data_file_dest.c @@ -24,7 +24,7 @@ #include "pg_lake/copy/copy_format.h" #include "pg_lake/csv/csv_options.h" #include "pg_lake/csv/csv_writer.h" -#include "pg_lake/fdw/data_file_stats.h" +#include "pg_lake/data_file/data_file_stats.h" #include "pg_lake/fdw/data_files_catalog.h" #include "pg_lake/fdw/multi_data_file_dest.h" #include "pg_lake/fdw/writable_table.h" @@ -235,15 +235,7 @@ FlushChildDestReceiver(MultiDataFileUploadDestReceiver * self) copyModification->partitionSpecId = self->currentPartitionSpecId; copyModification->partition = modification->partition; - if (modification->fileStats != NULL) - { - copyModification->fileStats = DeepCopyDataFileStats(modification->fileStats); - } - else - { - copyModification->fileStats = - CreateDataFileStatsForTable(self->relationId, copyModification->insertFile, copyModification->insertedRowCount, 0, CONTENT_DATA); - } + copyModification->fileStats = DeepCopyDataFileStats(modification->fileStats); /* * If caller of dest receiver is assigning rowids itself, diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c index b314e248..9138d1bd 100644 --- a/pg_lake_table/src/fdw/writable_table.c +++ b/pg_lake_table/src/fdw/writable_table.c @@ -28,12 +28,12 @@ #include "common/hashfn.h" #include "pg_lake/cleanup/in_progress_files.h" #include "pg_lake/data_file/data_files.h" +#include "pg_lake/data_file/data_file_stats.h" #include "pg_lake/cleanup/deletion_queue.h" #include "pg_lake/extensions/pg_lake_table.h" #include "pg_lake/fdw/catalog/row_id_mappings.h" #include "pg_lake/fdw/pg_lake_table.h" #include "pg_lake/fdw/data_files_catalog.h" -#include "pg_lake/fdw/data_file_stats.h" #include "pg_lake/fdw/row_ids.h" #include "pg_lake/fdw/writable_table.h" #include "pg_lake/fdw/partition_transform.h" diff --git a/pg_lake_table/src/test/add_files_to_table.c b/pg_lake_table/src/test/add_files_to_table.c index 8b1e09c2..f82f1270 100644 --- a/pg_lake_table/src/test/add_files_to_table.c +++ b/pg_lake_table/src/test/add_files_to_table.c @@ -22,7 +22,7 @@ #include "pg_lake/copy/copy_format.h" #include "pg_lake/data_file/data_files.h" -#include "pg_lake/fdw/data_file_stats.h" +#include "pg_lake/data_file/data_file_stats.h" #include "pg_lake/fdw/data_files_catalog.h" #include "pg_lake/fdw/partition_transform.h" #include "pg_lake/iceberg/catalog.h" @@ -163,8 +163,7 @@ GenerateMetadataOperationList(Oid relationId, List *fileList, char *fileType) { int64 
diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c
index b314e248..9138d1bd 100644
--- a/pg_lake_table/src/fdw/writable_table.c
+++ b/pg_lake_table/src/fdw/writable_table.c
@@ -28,12 +28,12 @@
 #include "common/hashfn.h"
 #include "pg_lake/cleanup/in_progress_files.h"
 #include "pg_lake/data_file/data_files.h"
+#include "pg_lake/data_file/data_file_stats.h"
 #include "pg_lake/cleanup/deletion_queue.h"
 #include "pg_lake/extensions/pg_lake_table.h"
 #include "pg_lake/fdw/catalog/row_id_mappings.h"
 #include "pg_lake/fdw/pg_lake_table.h"
 #include "pg_lake/fdw/data_files_catalog.h"
-#include "pg_lake/fdw/data_file_stats.h"
 #include "pg_lake/fdw/row_ids.h"
 #include "pg_lake/fdw/writable_table.h"
 #include "pg_lake/fdw/partition_transform.h"
diff --git a/pg_lake_table/src/test/add_files_to_table.c b/pg_lake_table/src/test/add_files_to_table.c
index 8b1e09c2..f82f1270 100644
--- a/pg_lake_table/src/test/add_files_to_table.c
+++ b/pg_lake_table/src/test/add_files_to_table.c
@@ -22,7 +22,7 @@
 
 #include "pg_lake/copy/copy_format.h"
 #include "pg_lake/data_file/data_files.h"
-#include "pg_lake/fdw/data_file_stats.h"
+#include "pg_lake/data_file/data_file_stats.h"
 #include "pg_lake/fdw/data_files_catalog.h"
 #include "pg_lake/fdw/partition_transform.h"
 #include "pg_lake/iceberg/catalog.h"
@@ -163,8 +163,7 @@ GenerateMetadataOperationList(Oid relationId, List *fileList, char *fileType)
     {
         int64 rowCount = GetRemoteParquetFileRowCount(filePath);
-
-        DataFileStats *dataFileStats = CreateDataFileStatsForTable(relationId, filePath, rowCount, 0, CONTENT_DATA);
+        DataFileStats *dataFileStats = CreateDataFileStatsForDataFile(filePath, rowCount, 0, NIL);
 
         /* we don't support partitioned writes, and default spec id is 0 */
         int32 partitionSpecId = 0;

From 5037045b4b8c02097c8d56cc78ed2b1078cbb572 Mon Sep 17 00:00:00 2001
From: Aykut Bozkurt
Date: Fri, 9 Jan 2026 15:58:11 +0300
Subject: [PATCH 45/46] add assertion

Signed-off-by: Aykut Bozkurt
---
 pg_lake_engine/src/data_file/data_file_stats.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/pg_lake_engine/src/data_file/data_file_stats.c b/pg_lake_engine/src/data_file/data_file_stats.c
index e7a25b1e..77976f0c 100644
--- a/pg_lake_engine/src/data_file/data_file_stats.c
+++ b/pg_lake_engine/src/data_file/data_file_stats.c
@@ -20,6 +20,7 @@
 #include "executor/executor.h"
 #include "pg_lake/data_file/data_files.h"
 #include "pg_lake/data_file/data_file_stats.h"
+#include "pg_lake/extensions/pg_lake_engine.h"
 #include "pg_lake/extensions/postgis.h"
 #include "pg_lake/parsetree/options.h"
 #include "pg_lake/pgduck/client.h"
@@ -112,6 +113,19 @@ ExecuteCopyToCommandOnPGDuckConnection(char *copyCommand,
     char *commandTuples = PQcmdTuples(result);
     int64 totalRowCount = atoll(commandTuples);
 
+#ifdef USE_ASSERT_CHECKING
+    if (EnableHeavyAsserts)
+    {
+        List *remoteFiles = ListRemoteFileNames(destinationPath);
+
+        if (list_length(remoteFiles) != 1)
+        {
+            ereport(ERROR, (errmsg("expected exactly one file at %s, found %d files",
+                                   destinationPath, list_length(remoteFiles))));
+        }
+    }
+#endif
+
     DataFileStats *fileStats = CreateDataFileStatsForDataFile(destinationPath,
                                                               totalRowCount,
                                                               0,
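The new check only runs in assert-enabled builds with heavy asserts switched on, since listing the destination is a remote call. It pins down the invariant the stats code relies on: each COPY issued here writes exactly one output file, so the row count reported by PQcmdTuples can be attributed to destinationPath as a whole. A sketch of the relationship being protected; the trailing argument of CreateDataFileStatsForDataFile is assumed, from the other call sites in this series, to be the column-stats list:

    /* sketch: one COPY -> one file -> one DataFileStats entry */
    int64 totalRowCount = atoll(PQcmdTuples(result));
    DataFileStats *fileStats =
        CreateDataFileStatsForDataFile(destinationPath, totalRowCount, 0, columnStatsList);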
From f1cf990af9f331b13e24365120b9a5f1898095e4 Mon Sep 17 00:00:00 2001
From: Ahmet Gedemenli
Date: Fri, 9 Jan 2026 16:29:04 +0300
Subject: [PATCH 46/46] minor improvements

Signed-off-by: Ahmet Gedemenli
---
 pg_lake_engine/pg_lake_engine--3.0--3.1.sql     |  6 ++++--
 pg_lake_engine/src/data_file/data_file_stats.c  | 14 +++++++-------
 .../include/pg_lake/iceberg/iceberg_field.h     |  2 --
 pg_lake_iceberg/src/iceberg/iceberg_field.c     |  2 --
 pg_lake_iceberg/src/init.c                      |  2 ++
 pg_lake_table/src/fdw/writable_table.c          |  2 +-
 6 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/pg_lake_engine/pg_lake_engine--3.0--3.1.sql b/pg_lake_engine/pg_lake_engine--3.0--3.1.sql
index 1c178da7..5254d75e 100644
--- a/pg_lake_engine/pg_lake_engine--3.0--3.1.sql
+++ b/pg_lake_engine/pg_lake_engine--3.0--3.1.sql
@@ -37,5 +37,7 @@ AS 'MODULE_PATHNAME', $function$pg_lake_internal_dummy_function$function$;
 -- Register map types, will be used for parsing DuckDB maps for COPY .. (return_stats)
 -- we prefer to create in the extension script to avoid concurrent attempts to create
 -- the same map, which may throw errors
-SELECT map_type.create('TEXT','TEXT');
-SELECT map_type.create('TEXT','map_type.key_text_val_text');
+WITH text_text_map_name AS
+  (SELECT map_type.create('TEXT','TEXT') AS name)
+SELECT map_type.create('TEXT', name) AS text_map_of_text
+  FROM text_text_map_name;
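The nested type registered above is the one ParseDuckdbColumnMinMaxFromText resolves and feeds the return_stats output into: an outer map keyed by column name, whose values are themselves map(text,text) entries carrying that column's statistics (min and max are the keys this code consumes). Roughly, the parsed value nests like the sketch below; the exact key set and textual quoting come from DuckDB and the map type's input function, so the literal format and values shown are only illustrative:

    {col_a={min=1, max=42, ...}, col_b={min=aaa, max=zzz, ...}}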
diff --git a/pg_lake_engine/src/data_file/data_file_stats.c b/pg_lake_engine/src/data_file/data_file_stats.c
index 77976f0c..a23c75e8 100644
--- a/pg_lake_engine/src/data_file/data_file_stats.c
+++ b/pg_lake_engine/src/data_file/data_file_stats.c
@@ -35,7 +35,7 @@
 #include "utils/lsyscache.h"
 
 static void ParseDuckdbColumnMinMaxFromText(char *input, List **names, List **mins, List **maxs);
-static void ExtractMinMaxForAllColumns(Datum map, List **names, List **mins, List **maxs);
+static void ExtractMinMaxForAllColumns(Datum returnStatsMap, List **names, List **mins, List **maxs);
 static void ExtractMinMaxForColumn(Datum map, const char *colName, List **names, List **mins, List **maxs);
 static const char *UnescapeDoubleQuotes(const char *s);
 static List *GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields, DataFileSchema * schema);
@@ -271,7 +271,7 @@ ExtractMinMaxForColumn(Datum map, const char *colName, List **names, List **mins
 
     if (minText != NULL && maxText != NULL)
     {
-        *names = lappend(*names, pstrdup(colName));
+        *names = lappend(*names, colName);
         *mins = lappend(*mins, minText);
         *maxs = lappend(*maxs, maxText);
     }
@@ -327,9 +327,9 @@ UnescapeDoubleQuotes(const char *s)
  * of type map(text,text).
  */
 static void
-ExtractMinMaxForAllColumns(Datum map, List **names, List **mins, List **maxs)
+ExtractMinMaxForAllColumns(Datum returnStatsMap, List **names, List **mins, List **maxs)
 {
-    ArrayType *elementsArray = DatumGetArrayTypeP(map);
+    ArrayType *elementsArray = DatumGetArrayTypeP(returnStatsMap);
 
     if (elementsArray == NULL)
         return;
@@ -393,7 +393,7 @@ ParseDuckdbColumnMinMaxFromText(char *input, List **names, List **mins, List **m
 
     if (returnStatsMapId == InvalidOid)
         ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
-                        errmsg("unexpected return_stats result %s", input)));
+                        errmsg("cannot find required map type for parsing return stats")));
 
     /* parse result into map above */
     Oid typinput;
@@ -440,12 +440,12 @@ GetDataFileColumnStatsList(List *names, List *mins, List *maxs, List *leafFields
 
         if (leafField == NULL)
         {
-            ereport(DEBUG3, (errmsg("leaf field with id %d not found in leaf fields, skipping", fieldId)));
+            ereport(DEBUG3, (errmsg("leaf field with name %s not found in leaf fields, skipping", fieldName)));
             continue;
         }
         else if (ShouldSkipStatistics(leafField))
        {
-            ereport(DEBUG3, (errmsg("skipping statistics for field with id %d", fieldId)));
+            ereport(DEBUG3, (errmsg("skipping statistics for field with name %s", fieldName)));
             continue;
         }
 
diff --git a/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h b/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h
index ce5c73d5..1e499a88 100644
--- a/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h
+++ b/pg_lake_iceberg/include/pg_lake/iceberg/iceberg_field.h
@@ -23,8 +23,6 @@
 #include "pg_lake/pgduck/type.h"
 #include "pg_lake/parquet/leaf_field.h"
 
-extern bool DeprecatedEnableStatsCollectionForNestedTypes;
-
 extern PGDLLEXPORT PGType IcebergFieldToPostgresType(Field * field);
 extern PGDLLEXPORT Field * PostgresTypeToIcebergField(PGType pgType,
                                                       bool forAddColumn,
diff --git a/pg_lake_iceberg/src/iceberg/iceberg_field.c b/pg_lake_iceberg/src/iceberg/iceberg_field.c
index 6431f273..e4bc51dc 100644
--- a/pg_lake_iceberg/src/iceberg/iceberg_field.c
+++ b/pg_lake_iceberg/src/iceberg/iceberg_field.c
@@ -59,8 +59,6 @@
 #include "utils/rel.h"
 #include "utils/typcache.h"
 
-bool DeprecatedEnableStatsCollectionForNestedTypes = false;
-
 typedef enum IcebergType
 {
     ICEBERG_TYPE_INVALID,
diff --git a/pg_lake_iceberg/src/init.c b/pg_lake_iceberg/src/init.c
index c2c81be6..235a1233 100644
--- a/pg_lake_iceberg/src/init.c
+++ b/pg_lake_iceberg/src/init.c
@@ -48,6 +48,8 @@ int IcebergAutovacuumNaptime = 10 * 60;
 
 /* managed via pg_lake_iceberg.log_autovacuum_min_duration, 10 minutes */
 int IcebergAutovacuumLogMinDuration = 600000;
+static bool DeprecatedEnableStatsCollectionForNestedTypes;
+
 static bool IcebergDefaultLocationCheckHook(char **newvalue, void **extra, GucSource source);
 
 /* function declarations */
diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c
index 9138d1bd..ee8fb657 100644
--- a/pg_lake_table/src/fdw/writable_table.c
+++ b/pg_lake_table/src/fdw/writable_table.c
@@ -543,7 +543,7 @@ ApplyDeleteFile(Relation rel, char *sourcePath, int64 sourceRowCount, int64 live
     Partition *partition = GetDataFilePartition(relationId, transforms, sourcePath,
                                                 &partitionSpecId);
 
-    Assert(statsCollector->dataFileStats != NIL);
+    Assert(list_length(statsCollector->dataFileStats) == 1);
 
     /*
      * while deleting from parquet, we do not add file_size_bytes