Skip to content

Commit f16bce0

Browse files
fix(glue): add glue.schema-columns property to skip StorageDescriptor columns
Introduces a new catalog property `glue.schema-columns` (default: `true`) that controls whether Iceberg schema columns are written to the Glue StorageDescriptor on table create and update operations. When set to `false`, the StorageDescriptor is populated with location only, avoiding the Glue API payload size limit that is hit for tables with very large schemas (e.g. ~3600 fields). Fixes #701
1 parent 52dbce5 commit f16bce0

File tree

2 files changed

+58
-12
lines changed

2 files changed

+58
-12
lines changed

catalog/glue/glue.go

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,13 @@ const (
6565
SkipArchive = "glue.skip-archive"
6666
SkipArchiveDefault = true
6767

68+
// GlueSchemaColumns controls whether schema columns are written to the Glue
69+
// StorageDescriptor on table create/update. Set to "false" to omit columns,
70+
// which avoids exceeding Glue's API payload limit for tables with very large
71+
// schemas. Note: disabling columns breaks Athena column discovery via Glue.
72+
GlueSchemaColumns = "glue.schema-columns"
73+
GlueSchemaColumnsDefault = true
74+
6875
AccessKeyID = "glue.access-key-id"
6976
SecretAccessKey = "glue.secret-access-key"
7077
SessionToken = "glue.session-token"
@@ -258,7 +265,7 @@ func (c *Catalog) CreateTable(ctx context.Context, identifier table.Identifier,
258265
_, err = c.glueSvc.CreateTable(ctx, &glue.CreateTableInput{
259266
CatalogId: c.catalogId,
260267
DatabaseName: aws.String(database),
261-
TableInput: constructTableInput(tableName, staged.Table, nil),
268+
TableInput: constructTableInput(tableName, staged.Table, nil, c.props.GetBool(GlueSchemaColumns, GlueSchemaColumnsDefault)),
262269
})
263270
if err != nil {
264271
return nil, fmt.Errorf("failed to create table %s.%s: %w", database, tableName, err)
@@ -290,7 +297,7 @@ func (c *Catalog) RegisterTable(ctx context.Context, identifier table.Identifier
290297
_, err = c.glueSvc.CreateTable(ctx, &glue.CreateTableInput{
291298
CatalogId: c.catalogId,
292299
DatabaseName: aws.String(database),
293-
TableInput: constructTableInput(tableName, tbl, nil),
300+
TableInput: constructTableInput(tableName, tbl, nil, c.props.GetBool(GlueSchemaColumns, GlueSchemaColumnsDefault)),
294301
})
295302
if err != nil {
296303
return nil, fmt.Errorf("failed to register table %s.%s: %w", database, tableName, err)
@@ -338,7 +345,7 @@ func (c *Catalog) CommitTable(ctx context.Context, identifier table.Identifier,
338345
_, err = c.glueSvc.UpdateTable(ctx, &glue.UpdateTableInput{
339346
CatalogId: c.catalogId,
340347
DatabaseName: aws.String(database),
341-
TableInput: constructTableInput(tableName, staged.Table, currentGlueTable),
348+
TableInput: constructTableInput(tableName, staged.Table, currentGlueTable, c.props.GetBool(GlueSchemaColumns, GlueSchemaColumnsDefault)),
342349
// use `VersionId` to implement optimistic locking
343350
VersionId: currentGlueTable.VersionId,
344351
SkipArchive: aws.Bool(c.props.GetBool(SkipArchive, SkipArchiveDefault)),
@@ -350,7 +357,7 @@ func (c *Catalog) CommitTable(ctx context.Context, identifier table.Identifier,
350357
_, err = c.glueSvc.CreateTable(ctx, &glue.CreateTableInput{
351358
CatalogId: c.catalogId,
352359
DatabaseName: aws.String(database),
353-
TableInput: constructTableInput(tableName, staged.Table, nil),
360+
TableInput: constructTableInput(tableName, staged.Table, nil, c.props.GetBool(GlueSchemaColumns, GlueSchemaColumnsDefault)),
354361
})
355362
if err != nil {
356363
return nil, "", err
@@ -761,15 +768,19 @@ func constructParameters(staged *table.Table, previousGlueTable *types.Table) ma
761768
return parameters
762769
}
763770

764-
func constructTableInput(tableName string, staged *table.Table, previousGlueTable *types.Table) *types.TableInput {
771+
func constructTableInput(tableName string, staged *table.Table, previousGlueTable *types.Table, includeColumns bool) *types.TableInput {
772+
sd := &types.StorageDescriptor{
773+
Location: aws.String(staged.Location()),
774+
}
775+
if includeColumns {
776+
sd.Columns = schemasToGlueColumns(staged.Metadata())
777+
}
778+
765779
tableInput := &types.TableInput{
766-
Name: aws.String(tableName),
767-
TableType: aws.String(glueTableType),
768-
Parameters: constructParameters(staged, previousGlueTable),
769-
StorageDescriptor: &types.StorageDescriptor{
770-
Location: aws.String(staged.Location()),
771-
Columns: schemasToGlueColumns(staged.Metadata()),
772-
},
780+
Name: aws.String(tableName),
781+
TableType: aws.String(glueTableType),
782+
Parameters: constructParameters(staged, previousGlueTable),
783+
StorageDescriptor: sd,
773784
}
774785

775786
if comment, ok := staged.Properties()[PropsKeyDescription]; ok {

catalog/glue/glue_test.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1296,6 +1296,41 @@ func TestGlueCheckTableNotExists(t *testing.T) {
12961296
assert.False(exists)
12971297
}
12981298

1299+
func TestConstructTableInputSchemaColumns(t *testing.T) {
1300+
schema := iceberg.NewSchemaWithIdentifiers(1, []int{1},
1301+
iceberg.NestedField{ID: 1, Name: "id", Type: iceberg.Int64Type{}, Required: true},
1302+
iceberg.NestedField{ID: 2, Name: "data", Type: iceberg.StringType{}},
1303+
)
1304+
1305+
builder, err := table.NewMetadataBuilder(2)
1306+
require.NoError(t, err)
1307+
require.NoError(t, builder.SetLoc("s3://bucket/db/tbl"))
1308+
require.NoError(t, builder.AddSchema(schema))
1309+
require.NoError(t, builder.SetCurrentSchemaID(1))
1310+
require.NoError(t, builder.AddPartitionSpec(iceberg.UnpartitionedSpec, true))
1311+
require.NoError(t, builder.SetDefaultSpecID(iceberg.UnpartitionedSpec.ID()))
1312+
require.NoError(t, builder.AddSortOrder(&table.UnsortedSortOrder))
1313+
require.NoError(t, builder.SetDefaultSortOrderID(table.UnsortedSortOrderID))
1314+
meta, err := builder.Build()
1315+
require.NoError(t, err)
1316+
1317+
tbl := table.New([]string{"db", "tbl"}, meta, "s3://bucket/db/tbl/metadata/v1.json", nil, nil)
1318+
1319+
t.Run("columns included by default", func(t *testing.T) {
1320+
input := constructTableInput("tbl", tbl, nil, true)
1321+
require.NotNil(t, input.StorageDescriptor)
1322+
assert := require.New(t)
1323+
assert.Len(input.StorageDescriptor.Columns, 2)
1324+
})
1325+
1326+
t.Run("columns omitted when disabled", func(t *testing.T) {
1327+
input := constructTableInput("tbl", tbl, nil, false)
1328+
require.NotNil(t, input.StorageDescriptor)
1329+
assert := require.New(t)
1330+
assert.Empty(input.StorageDescriptor.Columns)
1331+
})
1332+
}
1333+
12991334
func cleanupTable(t *testing.T, ctlg catalog.Catalog, tbIdent table.Identifier, awsCfg aws.Config) {
13001335
t.Helper()
13011336

0 commit comments

Comments
 (0)