Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
bbc924d
WIP: Use return_stats option to collect column statistics
sfc-gh-agedemenli Dec 11, 2025
92593f3
Duckdb patch return_stats
sfc-gh-abozkurt Dec 16, 2025
068455c
Fix null schema
sfc-gh-agedemenli Dec 18, 2025
a572d54
Null check for min/max values
sfc-gh-agedemenli Dec 18, 2025
0f16944
Skip statistics for some types
sfc-gh-agedemenli Dec 18, 2025
9f206da
Add schema==NULL check for column stats
sfc-gh-agedemenli Dec 22, 2025
85bdfb8
Fallback to previous mechanism it stats are null
sfc-gh-agedemenli Dec 22, 2025
8b12b80
Fix: Make list from file stats
sfc-gh-agedemenli Dec 22, 2025
19185f3
Skip tests for nested fields
sfc-gh-agedemenli Dec 23, 2025
36c2048
Do not use enable stats guc
sfc-gh-agedemenli Dec 23, 2025
393f6ea
fixup
sfc-gh-agedemenli Dec 23, 2025
f95d58e
parse return_stats output to map type
sfc-gh-abozkurt Dec 23, 2025
135672f
Add map type to parse duckdb result
sfc-gh-agedemenli Dec 23, 2025
adc97a6
Remove unnecessary deepcopy for stats
sfc-gh-agedemenli Dec 30, 2025
94cb814
Add comments
sfc-gh-agedemenli Dec 30, 2025
d1f5a89
Minor
sfc-gh-agedemenli Dec 30, 2025
a93d356
Use names from file stats instead of ListRemoteFileNames
sfc-gh-agedemenli Jan 5, 2026
dec8b2d
Minor improvements
sfc-gh-agedemenli Jan 5, 2026
1d8ad4c
Rename FindGeneratedDataFiles to GetNewFileOpsFromFileStats
sfc-gh-agedemenli Jan 5, 2026
4436d09
Add struct ColumnStatsCollector
sfc-gh-agedemenli Jan 6, 2026
96e2162
Rewrite GetDataFileColumnStatsList, add helpers and logs
sfc-gh-agedemenli Jan 6, 2026
6fde32d
Use ColumnStatsCollector in PerformDeleteFromParquet
sfc-gh-agedemenli Jan 6, 2026
1b1bc05
Minor rename variable
sfc-gh-agedemenli Jan 6, 2026
8c0e688
Move FindLeafField to engine
sfc-gh-agedemenli Jan 7, 2026
994cc33
Move ShouldSkipStatistics to engine
sfc-gh-agedemenli Jan 7, 2026
78c536a
Add leaf_field.c
sfc-gh-agedemenli Jan 7, 2026
40ec785
Comment
sfc-gh-agedemenli Jan 7, 2026
1810013
fix reference stats list
sfc-gh-agedemenli Jan 7, 2026
2721262
fixup
sfc-gh-agedemenli Jan 7, 2026
076e761
Comment
sfc-gh-agedemenli Jan 7, 2026
0db1368
Get rid of redundant string duplication
sfc-gh-agedemenli Jan 7, 2026
5276ba3
Use the collector as return type
sfc-gh-agedemenli Jan 7, 2026
4c8a83e
fixup
sfc-gh-agedemenli Jan 7, 2026
3b9a2df
fixup
sfc-gh-agedemenli Jan 8, 2026
fad3ea5
Move CreateDataFileStatsForTable
sfc-gh-agedemenli Jan 8, 2026
f7b993a
Add ExecuteCopyCommandOnPGDuckConnection
sfc-gh-agedemenli Jan 8, 2026
cfb770c
Handle modification in case stats is empty
sfc-gh-agedemenli Jan 8, 2026
5b297b1
Move stats related logic to new file: data_file_stats.c
sfc-gh-agedemenli Jan 8, 2026
887ac71
Move field&leaf field functions
sfc-gh-agedemenli Jan 8, 2026
2db85e2
Remove unnecessary includes and whitespaces
sfc-gh-agedemenli Jan 8, 2026
5647b47
Use returned stats for deleted files
sfc-gh-agedemenli Jan 9, 2026
e3b51e7
Rename ColumnStatsCollector to StatsCollector
sfc-gh-agedemenli Jan 9, 2026
fa8ae41
Reindent
sfc-gh-agedemenli Jan 9, 2026
699eb3f
generate stats for all files at WriteQueryResultTo
sfc-gh-abozkurt Jan 9, 2026
5037045
add assertion
sfc-gh-abozkurt Jan 9, 2026
f1cf990
minor improvements
sfc-gh-agedemenli Jan 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pg_lake_copy/src/copy/copy.c
Original file line number Diff line number Diff line change
Expand Up @@ -916,7 +916,7 @@ ProcessPgLakeCopyTo(CopyStmt *copyStmt, ParseState *pstate, Relation relation,
*/
ConvertCSVFileTo(tempCSVPath, tupleDesc, maximumLineLength,
destinationPath, destinationFormat, destinationCompression,
copyStmt->options, schema);
copyStmt->options, schema, NIL);

if (IsCopyToStdout(copyStmt))
{
Expand Down
55 changes: 55 additions & 0 deletions pg_lake_engine/include/pg_lake/data_file/data_file_stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,36 @@
#include "datatype/timestamp.h"

#include "pg_lake/parquet/leaf_field.h"
#include "pg_lake/pgduck/client.h"


/*
* ColumnStatsMode describes the mode of column stats.
* - When truncate mode (default) is used, the column stats are truncated
* to the given length.
* - When none mode is used, the column stats are not collected.
*/
typedef enum ColumnStatsMode
{
COLUMN_STATS_MODE_TRUNCATE = 0,
COLUMN_STATS_MODE_NONE = 1,
} ColumnStatsMode;

/*
* ColumnStatsConfig describes the configuration for column stats.
* - mode: the mode of column stats.
* - truncateLen: the length to truncate the column stats in truncate mode.
*/
typedef struct ColumnStatsConfig
{
ColumnStatsMode mode;

/* used for truncate mode */
size_t truncateLen;
} ColumnStatsConfig;




/*
* DataFileColumnStats stores column statistics for a data file.
Expand All @@ -43,6 +73,8 @@ typedef struct DataFileColumnStats
*/
typedef struct DataFileStats
{
char *dataFilePath;

/* number of bytes in the file */
int64 fileSize;

Expand All @@ -61,3 +93,26 @@ typedef struct DataFileStats
/* for a new data file with row IDs, the start of the range */
int64 rowIdStart;
} DataFileStats;

typedef struct StatsCollector
{
int64 totalRowCount;
List *dataFileStats;
} StatsCollector;

extern PGDLLEXPORT DataFileStats * DeepCopyDataFileStats(const DataFileStats * stats);
extern PGDLLEXPORT StatsCollector * GetDataFileStatsListFromPGResult(PGresult *result,
List *leafFields,
DataFileSchema * schema);
extern PGDLLEXPORT StatsCollector * ExecuteCopyToCommandOnPGDuckConnection(char *copyCommand,
List *leafFields,
DataFileSchema * schema,
bool disablePreserveInsertionOrder,
char *destinationPath,
CopyDataFormat destinationFormat);
extern PGDLLEXPORT bool ShouldSkipStatistics(LeafField * leafField);
extern PGDLLEXPORT DataFileStats * CreateDataFileStatsForDataFile(char *dataFilePath,
int64 rowCount, int64 deletedRowCount,
List *leafFields);
extern PGDLLEXPORT void ApplyColumnStatsModeForAllFileStats(Oid relationId, List *dataFileStats);
extern PGDLLEXPORT List *GetRemoteParquetColumnStats(char *path, List *leafFields);
3 changes: 3 additions & 0 deletions pg_lake_engine/include/pg_lake/parquet/field.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#pragma once

#include "nodes/pg_list.h"
#include "pg_lake/pgduck/type.h"

/*
* Reserved _row_id field ID used for Iceberg
Expand Down Expand Up @@ -154,3 +155,5 @@ typedef FieldStruct DataFileSchema;
typedef FieldStructElement DataFileSchemaField;

extern PGDLLEXPORT DataFileSchema * DeepCopyDataFileSchema(const DataFileSchema * schema);
extern PGDLLEXPORT Field * DeepCopyField(const Field * field);
extern PGDLLEXPORT bool PGTypeRequiresConversionToIcebergString(Field * field, PGType pgType);
2 changes: 2 additions & 0 deletions pg_lake_engine/include/pg_lake/parquet/leaf_field.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ typedef struct LeafField

extern PGDLLEXPORT int LeafFieldCompare(const ListCell *a, const ListCell *b);
extern PGDLLEXPORT bool SchemaFieldsEquivalent(DataFileSchemaField * fieldA, DataFileSchemaField * fieldB);
extern PGDLLEXPORT LeafField DeepCopyLeafField(const LeafField * leafField);
extern PGDLLEXPORT LeafField * FindLeafField(List *leafFieldList, int fieldId);
#if PG_VERSION_NUM < 170000
extern PGDLLEXPORT int pg_cmp_s32(int32 a, int32 b);
#endif
16 changes: 9 additions & 7 deletions pg_lake_engine/include/pg_lake/pgduck/delete_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,13 @@
#include "pg_lake/copy/copy_format.h"
#include "pg_lake/parquet/field.h"
#include "pg_lake/pgduck/read_data.h"
#include "pg_lake/data_file/data_file_stats.h"

extern PGDLLEXPORT void PerformDeleteFromParquet(char *sourceDataFilePath,
List *positionDeleteFiles,
char *deletionFilePath,
char *destinationPath,
CopyDataCompression destinationCompression,
DataFileSchema * schema,
ReadDataStats * stats);
extern PGDLLEXPORT StatsCollector * PerformDeleteFromParquet(char *sourceDataFilePath,
List *positionDeleteFiles,
char *deletionFilePath,
char *destinationPath,
CopyDataCompression destinationCompression,
DataFileSchema * schema,
ReadDataStats * stats,
List *leafFields);
35 changes: 19 additions & 16 deletions pg_lake_engine/include/pg_lake/pgduck/write_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include "access/tupdesc.h"
#include "pg_lake/copy/copy_format.h"
#include "pg_lake/data_file/data_file_stats.h"
#include "pg_lake/parquet/field.h"
#include "nodes/pg_list.h"

Expand All @@ -35,20 +36,22 @@ typedef enum ParquetVersion
/* pg_lake_table.default_parquet_version */
extern PGDLLEXPORT int DefaultParquetVersion;

extern PGDLLEXPORT void ConvertCSVFileTo(char *csvFilePath,
TupleDesc tupleDesc,
int maxLineSize,
char *destinationPath,
CopyDataFormat destinationFormat,
CopyDataCompression destinationCompression,
List *formatOptions,
DataFileSchema * schema);
extern PGDLLEXPORT int64 WriteQueryResultTo(char *query,
char *destinationPath,
CopyDataFormat destinationFormat,
CopyDataCompression destinationCompression,
List *formatOptions,
bool queryHasRowId,
DataFileSchema * schema,
TupleDesc queryTupleDesc);
extern PGDLLEXPORT StatsCollector * ConvertCSVFileTo(char *csvFilePath,
TupleDesc tupleDesc,
int maxLineSize,
char *destinationPath,
CopyDataFormat destinationFormat,
CopyDataCompression destinationCompression,
List *formatOptions,
DataFileSchema * schema,
List *leafFields);
extern PGDLLEXPORT StatsCollector * WriteQueryResultTo(char *query,
char *destinationPath,
CopyDataFormat destinationFormat,
CopyDataCompression destinationCompression,
List *formatOptions,
bool queryHasRowId,
DataFileSchema * schema,
TupleDesc queryTupleDesc,
List *leafFields);
extern PGDLLEXPORT void AppendFields(StringInfo map, DataFileSchema * schema);
8 changes: 8 additions & 0 deletions pg_lake_engine/pg_lake_engine--3.0--3.1.sql
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,11 @@ CREATE FUNCTION __lake__internal__nsp__.from_hex(text)
LANGUAGE C
IMMUTABLE PARALLEL SAFE STRICT
AS 'MODULE_PATHNAME', $function$pg_lake_internal_dummy_function$function$;

-- Register map types, will be used for parsing DuckDB maps for COPY .. (return_stats)
-- we prefer to create in the extension script to avoid concurrent attempts to create
-- the same map, which may throw errors
WITH text_text_map_name AS
(SELECT map_type.create('TEXT','TEXT') AS name)
SELECT map_type.create('TEXT', name) AS text_map_of_text
FROM text_text_map_name;
Loading