patch: applied the skipExistingSegments patch for text index (#429)

kotharironak · web-flow · commit 25741ccc967e · 2025-06-12T13:15:46.000+05:30
* inprogress: patch for skipping text index building

* patch: applied the skipExistingSegments patch for text index

* nit: fixed suggested nits

* fix: spotless suggestions

* fixed checkstyle issues
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/column/PhysicalColumnIndexContainer.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/column/PhysicalColumnIndexContainer.java
@@ -30,6 +30,7 @@
 import org.apache.pinot.segment.spi.index.IndexReaderFactory;
 import org.apache.pinot.segment.spi.index.IndexService;
 import org.apache.pinot.segment.spi.index.IndexType;
+import org.apache.pinot.segment.spi.index.StandardIndexes;
 import org.apache.pinot.segment.spi.index.column.ColumnIndexContainer;
 import org.apache.pinot.segment.spi.store.SegmentDirectory;
 import org.slf4j.Logger;
@@ -52,17 +53,36 @@ public PhysicalColumnIndexContainer(SegmentDirectory.Reader segmentReader, Colum
     }
 
     _readersByIndex = new HashMap<>();
+    Map<String, Map<String, String>> columnProperties = indexLoadingConfig.getColumnProperties();
     for (IndexType<?, ?, ?> indexType : IndexService.getInstance().getAllIndexes()) {
-      if (segmentReader.hasIndexFor(columnName, indexType)) {
-        IndexReaderFactory<?> readerProvider = indexType.getReaderFactory();
-        try {
-          IndexReader reader = readerProvider.createIndexReader(segmentReader, fieldIndexConfigs, metadata);
-          if (reader != null) {
-            _readersByIndex.put(indexType, reader);
-          }
-        } catch (IndexReaderConstraintException ex) {
-          LOGGER.warn("Constraint violation when indexing {} with {} index", columnName, indexType, ex);
+      boolean hasIndexFor = segmentReader.hasIndexFor(columnName, indexType);
+      if (!indexType.getId().equals(StandardIndexes.TEXT_ID)) {
+        // process all index types other than Text Index as-it-is
+        prepareIndexReader(segmentReader, indexType, fieldIndexConfigs, metadata);
+      } else if (IndexLoadingConfig.processExistingSegments(columnName, columnProperties) || hasIndexFor) {
+        // In case of Text Index, process segments only if property allows it OR text index exists on disk
+        prepareIndexReader(segmentReader, indexType, fieldIndexConfigs, metadata);
+      } else {
+        LOGGER.info("skipping index reader for segmentDir: {} for column: {} with skipExistingSegments.",
+                segmentReader.toSegmentDirectory().getIndexDir().toString(), columnName);
+      }
+    }
+  }
+
+  /**
+   * Creates the reader for {@code indexType} on the column named by {@code metadata} and registers it in
+   * {@code _readersByIndex}.
+   *
+   * <p>Nothing is registered when the segment has no such index for the column, when the reader factory
+   * returns {@code null}, or when reader creation throws {@link IndexReaderConstraintException} or
+   * {@link IOException}. In the exception case the failure is only logged at WARN level and not rethrown.
+   * NOTE(review): the WARN message says "Constraint violation" for both exception types, including I/O failures.
+   *
+   * @param segmentReader reader used to check for the index and passed to the reader factory
+   * @param indexType index type whose reader should be created
+   * @param fieldIndexConfigs index configs passed through to the reader factory
+   * @param metadata column metadata; also supplies the column name
+   */
+  private void prepareIndexReader(SegmentDirectory.Reader segmentReader,
+                                  IndexType<?, ?, ?> indexType,
+                                  FieldIndexConfigs fieldIndexConfigs,
+                                  ColumnMetadata metadata) {
+    String columnName = metadata.getColumnName();
+    if (segmentReader.hasIndexFor(columnName, indexType)) {
+      IndexReaderFactory<?> readerProvider = indexType.getReaderFactory();
+      try {
+        IndexReader reader = readerProvider.createIndexReader(segmentReader, fieldIndexConfigs, metadata);
+        if (reader != null) {
+          _readersByIndex.put(indexType, reader);
         }
+      } catch (IndexReaderConstraintException | IOException ex) {
+        LOGGER.warn("Constraint violation when indexing {} with {} index", columnName, indexType, ex);
       }
     }
   }
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/IndexLoadingConfig.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/IndexLoadingConfig.java
@@ -398,4 +398,39 @@ public void addKnownColumns(Set<String> columns) {
     }
     _dirty = true;
   }
+
+  /**
+   * Returns the per-column properties declared in the table config's field config list, keyed by column name.
+   *
+   * <p>The result is empty when there is no table config or no field config list. Each value is stored exactly
+   * as returned by {@code FieldConfig#getProperties()}, without a null check, so callers should tolerate a
+   * {@code null} value for a column.
+   *
+   * @return a new mutable map from column name to that column's field-config properties
+   */
+  public Map<String, Map<String, String>> getColumnProperties() {
+    Map<String, Map<String, String>> columnProperties = new HashMap<>();
+    // Same null handling as TextIndexHandler, which treats the table config as optional.
+    if (_tableConfig == null) {
+      return columnProperties;
+    }
+    List<FieldConfig> fieldConfigs = _tableConfig.getFieldConfigList();
+    if (fieldConfigs != null) {
+      for (FieldConfig fieldConfig : fieldConfigs) {
+        columnProperties.put(fieldConfig.getName(), fieldConfig.getProperties());
+      }
+    }
+    // Return the populated map: returning Map.of() here dropped every configured property, so callers could
+    // never observe skipExistingSegments.
+    return columnProperties;
+  }
+
+  /**
+   * Helper methods to skip processing segments if the property SKIP_EXISTING_SEGMENTS is
+   * set to true in fieldConfigList.
+   *
+   * e.g
+   * "fieldConfigList":[
+   *   {
+   *      "name":"text_col_1",
+   *      "encodingType":"RAW",
+   *      "indexTypes": ["TEXT"],
+   *      "properties":{"fstType":"lucene", "skipExistingSegments":"true"}
+   *   }
+   *  ]
+   * */
+  public static boolean processExistingSegments(String columnName, Map<String, Map<String, String>> columnProperties) {
+    final String skipExistingSegments = "skipExistingSegments";
+    if (!columnProperties.containsKey(columnName)
+            || columnProperties.get(columnName) == null
+            || !columnProperties.get(columnName).containsKey(skipExistingSegments)) {
+      return true;
+    }
+    return !Boolean.parseBoolean(columnProperties.get(columnName).get(skipExistingSegments));
+  }
 }
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/invertedindex/TextIndexHandler.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/invertedindex/TextIndexHandler.java
@@ -38,13 +38,16 @@
 
 import java.io.File;
 import java.io.IOException;
+import java.util.HashMap;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import javax.annotation.Nullable;
 import org.apache.pinot.segment.local.segment.index.dictionary.DictionaryIndexType;
 import org.apache.pinot.segment.local.segment.index.forward.ForwardIndexType;
 import org.apache.pinot.segment.local.segment.index.loader.BaseIndexHandler;
+import org.apache.pinot.segment.local.segment.index.loader.IndexLoadingConfig;
 import org.apache.pinot.segment.local.segment.index.loader.SegmentPreProcessor;
 import org.apache.pinot.segment.spi.ColumnMetadata;
 import org.apache.pinot.segment.spi.creator.IndexCreationContext;
@@ -58,6 +61,7 @@
 import org.apache.pinot.segment.spi.index.reader.ForwardIndexReaderContext;
 import org.apache.pinot.segment.spi.store.SegmentDirectory;
 import org.apache.pinot.segment.spi.store.SegmentDirectoryPaths;
+import org.apache.pinot.spi.config.table.FieldConfig;
 import org.apache.pinot.spi.config.table.TableConfig;
 import org.apache.pinot.spi.data.FieldSpec.DataType;
 import org.slf4j.Logger;
@@ -86,12 +90,27 @@
 public class TextIndexHandler extends BaseIndexHandler {
   private static final Logger LOGGER = LoggerFactory.getLogger(TextIndexHandler.class);
 
+  private static final String SKIP_EXISTING_SEGMENTS = "skipExistingSegments";
+
   private final Set<String> _columnsToAddIdx;
+  private Map<String, Map<String, String>> _columnProperties = new HashMap<>();
 
+  /**
+   * Creates the handler, resolves the columns that have the text index enabled in the field index configs, and
+   * captures the per-column field-config properties from {@code tableConfig} when one is provided.
+   *
+   * @param segmentDirectory segment directory passed to the base handler
+   * @param fieldIndexConfigs index configs per column, passed to the base handler
+   * @param tableConfig table config whose field config list supplies the column properties; may be null
+   */
   public TextIndexHandler(SegmentDirectory segmentDirectory, Map<String, FieldIndexConfigs> fieldIndexConfigs,
       @Nullable TableConfig tableConfig) {
     super(segmentDirectory, fieldIndexConfigs, tableConfig);
     _columnsToAddIdx = FieldIndexConfigsUtil.columnsWithIndexEnabled(StandardIndexes.text(), _fieldIndexConfigs);
+    prepareColumnProperties(tableConfig);
+  }
+
+  /**
+   * Stores the properties of every field config in {@code tableConfig}, keyed by column name, in
+   * {@code _columnProperties}. Does nothing when the table config or its field config list is null.
+   */
+  private void prepareColumnProperties(@Nullable TableConfig tableConfig) {
+    if (tableConfig == null) {
+      return;
+    }
+    List<FieldConfig> configs = tableConfig.getFieldConfigList();
+    if (configs == null) {
+      return;
+    }
+    for (FieldConfig config : configs) {
+      _columnProperties.put(config.getName(), config.getProperties());
+    }
   }
 
   @Override
@@ -134,7 +153,10 @@ public void updateIndices(SegmentDirectory.Writer segmentWriter)
     for (String column : columnsToAddIdx) {
       ColumnMetadata columnMetadata = _segmentDirectory.getSegmentMetadata().getColumnMetadataFor(column);
       if (shouldCreateTextIndex(columnMetadata)) {
+        LOGGER.info("Creating text index from segment: {}, column: {}", segmentName, column);
         createTextIndexForColumn(segmentWriter, columnMetadata);
+      } else {
+        LOGGER.info("Skipping creation of text index from segment: {}, column: {}", segmentName, column);
       }
     }
   }
@@ -143,7 +165,8 @@ private boolean shouldCreateTextIndex(ColumnMetadata columnMetadata) {
     if (columnMetadata != null) {
       // Fail fast upon unsupported operations.
       checkUnsupportedOperationsForTextIndex(columnMetadata);
-      return true;
+      // skip creating text index if SKIP_EXISTING_SEGMENTS is set to true.
+      return IndexLoadingConfig.processExistingSegments(columnMetadata.getColumnName(), _columnProperties);
     }
     return false;
   }
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/SegmentPreProcessorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/SegmentPreProcessorTest.java
@@ -680,6 +680,48 @@ public void testEnableTextIndexOnNewColumnDictEncoded(SegmentVersion segmentVers
         false, DataType.STRING, 100000);
   }
 
+  /**
+   * Verifies that no text index is created on an existing raw column when its field config sets the
+   * skipExistingSegments property to "true".
+   */
+  @Test(dataProvider = "bothV1AndV3")
+  public void testSkipTextIndexCreationOnExistingSegmentForRawColumn(SegmentVersion segmentVersion)
+          throws Exception {
+    buildSegment(segmentVersion);
+
+    Map<String, String> properties = Map.of("skipExistingSegments", "true");
+    FieldConfig skipTextIndexConfig = new FieldConfig(EXISTING_STRING_COL_RAW, FieldConfig.EncodingType.RAW,
+            FieldConfig.IndexType.TEXT, null, null, null, properties);
+    _fieldConfigMap.put(EXISTING_STRING_COL_RAW, skipTextIndexConfig);
+
+    runPreProcessor(_schema);
+    validateIndexDoesNotExist(EXISTING_STRING_COL_RAW, StandardIndexes.text());
+  }
+
+  /**
+   * Verifies that the text index is still created on an existing raw column when its field config sets the
+   * skipExistingSegments property to "false".
+   */
+  @Test(dataProvider = "bothV1AndV3")
+  public void testDoNotSkipTextIndexCreationOnExistingSegmentForRawColumn(SegmentVersion segmentVersion)
+          throws Exception {
+    buildSegment(segmentVersion);
+
+    Map<String, String> properties = Map.of("skipExistingSegments", "false");
+    FieldConfig processTextIndexConfig = new FieldConfig(EXISTING_STRING_COL_RAW, FieldConfig.EncodingType.RAW,
+            FieldConfig.IndexType.TEXT, null, null, null, properties);
+    _fieldConfigMap.put(EXISTING_STRING_COL_RAW, processTextIndexConfig);
+
+    runPreProcessor(_schema);
+    validateIndexExists(EXISTING_STRING_COL_RAW, StandardIndexes.text());
+  }
+
   /**
    * Test to check text index creation during segment load after text index
    * creation is enabled on an existing raw column.