Skip to content

Commit 0d469a3

Browse files
committed
remove explicit reference to the 32767 limit value and fix unit test
Signed-off-by: yeya24 <benye@amazon.com>
1 parent fd43830 commit 0d469a3

File tree

2 files changed

+30
-19
lines changed

2 files changed

+30
-19
lines changed

convert/convert.go

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ var DefaultConvertOpts = convertOpts{
5555
readConcurrency: runtime.GOMAXPROCS(0),
5656
writeConcurrency: 1,
5757
maxSamplesPerChunk: tsdb.DefaultSamplesPerChunk,
58-
maxNumColumns: parquet.MaxColumnIndex, // 32767 - max column index supported by parquet-go
58+
maxNumColumns: parquet.MaxColumnIndex, // max column index supported by parquet-go
5959
}
6060

6161
type Convertible interface {
@@ -283,7 +283,7 @@ func WithMaxSamplesPerChunk(samplesPerChunk int) ConvertOption {
283283
}
284284

285285
// WithMaxNumColumns sets the maximum number of columns allowed in a Parquet file.
286-
// Parquet has a limit of approximately 32767 columns (MaxInt16). When this limit is exceeded,
286+
// The parquet-go library has a limit on the maximum column index it supports. When this limit is exceeded,
287287
// the conversion will automatically shard the data into multiple files. This option allows
288288
// users to control the number of columns in the converted parquet file.
289289
//
@@ -292,7 +292,7 @@ func WithMaxSamplesPerChunk(samplesPerChunk int) ConvertOption {
292292
// 998 unique label names can be included in a single shard.
293293
//
294294
// Parameters:
295-
// - maxColumns: Maximum number of columns per Parquet file, including system columns (default: 32767)
295+
// - maxColumns: Maximum number of columns per Parquet file, including system columns
296296
//
297297
// Example:
298298
//
@@ -492,7 +492,8 @@ func singleTSDBRowReader(
492492
}
493493

494494
// If total unique label names exceed the limit, we need to shard based only on column limits.
495-
if len(allLabelNames)+systemColumns >= opts.maxNumColumns {
495+
// Equality is allowed (exactly maxNumColumns columns is fine).
496+
if len(allLabelNames)+systemColumns > opts.maxNumColumns {
496497
indexReaders := make([]blockIndexReader, len(blocks))
497498
defer func() {
498499
for _, indexReader := range indexReaders {
@@ -808,12 +809,12 @@ func shardSeries(
808809

809810
// Create a new shard if:
810811
// 1. Row-based sharding is enabled AND the row limit is reached, OR
811-
// 2. Adding this series would exceed the column limit
812+
// 2. Adding this series would exceed the column limit (equality is allowed)
812813
shouldCreateNewShard := false
813814
if opts.numRowGroups != math.MaxInt32 && uniqueCount >= rowsPerShard {
814815
shouldCreateNewShard = true
815816
}
816-
if len(labelColumns)+newLabelCount+systemColumns >= opts.maxNumColumns {
817+
if len(labelColumns)+newLabelCount+systemColumns > opts.maxNumColumns {
817818
shouldCreateNewShard = true
818819
}
819820

convert/convert_test.go

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -904,36 +904,36 @@ func Test_TooManyColumns(t *testing.T) {
904904
name: "with_numRowGroups_2_shards",
905905
withNumRowGroups: true,
906906
description: "Uses shardedTSDBRowReaders path when numRowGroups is set, creates 2 shards",
907-
maxNumColumns: 1000, // 998 label columns + 2 system columns
908-
uniqueLabelNames: 1200, // Exceeds maxNumColumns to trigger sharding
909-
labelsPerSeries: 200, // Each series will have 200 unique labels (plus __name__)
907+
maxNumColumns: 100, // 98 label columns + 2 system columns
908+
uniqueLabelNames: 150, // Exceeds maxNumColumns to trigger sharding
909+
labelsPerSeries: 50, // Each series will have 50 unique labels (plus __name__)
910910
minShards: 2,
911911
},
912912
{
913913
name: "without_numRowGroups_2_shards",
914914
withNumRowGroups: false,
915915
description: "Uses singleTSDBRowReader path when numRowGroups is not set, creates 2 shards",
916-
maxNumColumns: 1000, // 998 label columns + 2 system columns
917-
uniqueLabelNames: 1200, // Exceeds maxNumColumns to trigger sharding
918-
labelsPerSeries: 200, // Each series will have 200 unique labels (plus __name__)
916+
maxNumColumns: 100, // 98 label columns + 2 system columns
917+
uniqueLabelNames: 150, // Exceeds maxNumColumns to trigger sharding
918+
labelsPerSeries: 50, // Each series will have 50 unique labels (plus __name__)
919919
minShards: 2,
920920
},
921921
{
922922
name: "with_numRowGroups_3_shards",
923923
withNumRowGroups: true,
924924
description: "Uses shardedTSDBRowReaders path when numRowGroups is set, creates 3+ shards",
925-
maxNumColumns: 1000, // 998 label columns + 2 system columns
926-
uniqueLabelNames: 2500, // Will require at least 3 shards (2500 / 998 ≈ 2.5)
927-
labelsPerSeries: 300, // Each series will have 300 unique labels (plus __name__)
925+
maxNumColumns: 100, // 98 label columns + 2 system columns
926+
uniqueLabelNames: 250, // Will require at least 3 shards (250 / 98 ≈ 2.55)
927+
labelsPerSeries: 80, // Each series will have 80 unique labels (plus __name__)
928928
minShards: 3,
929929
},
930930
{
931931
name: "without_numRowGroups_3_shards",
932932
withNumRowGroups: false,
933933
description: "Uses singleTSDBRowReader path when numRowGroups is not set, creates 3+ shards",
934-
maxNumColumns: 1000, // 998 label columns + 2 system columns
935-
uniqueLabelNames: 2500, // Will require at least 3 shards (2500 / 998 ≈ 2.5)
936-
labelsPerSeries: 300, // Each series will have 300 unique labels (plus __name__)
934+
maxNumColumns: 100, // 98 label columns + 2 system columns
935+
uniqueLabelNames: 250, // Will require at least 3 shards (250 / 98 ≈ 2.55)
936+
labelsPerSeries: 80, // Each series will have 80 unique labels (plus __name__)
937937
minShards: 3,
938938
},
939939
}
@@ -1038,8 +1038,18 @@ func rowToSeries(t *testing.T, s *parquet.Schema, dec *schema.PrometheusParquetC
10381038
col := cols[colIdx][0]
10391039
label, ok := schema.ExtractLabelFromColumn(col)
10401040
if ok {
1041+
// Only include label columns that have actual values (not null/empty)
1042+
// This matches what's stored in s_col_indexes - only labels present in the series
1043+
if colVal.IsNull() {
1044+
continue
1045+
}
10411046
b.Add(label, colVal.String())
1042-
foundLblsIdxs = append(foundLblsIdxs, colIdx)
1047+
// Look up the ColumnIndex from the schema (same as when writing)
1048+
lc, ok := s.Lookup(col)
1049+
if !ok {
1050+
return nil, nil, fmt.Errorf("column %s not found in schema", col)
1051+
}
1052+
foundLblsIdxs = append(foundLblsIdxs, lc.ColumnIndex)
10431053
}
10441054

10451055
if schema.IsDataColumn(col) && dec != nil {

0 commit comments

Comments (0)