        slow_query_threshold=1.0  # Log queries > 1 second
    )

Index Maintenance Best Practices
--------------------------------

⭐ **Critical for Production**: Regular index maintenance ensures optimal performance, especially for vector indexes.

IVF Index Creation Timing
~~~~~~~~~~~~~~~~~~~~~~~~~

.. important::

    **Critical Issue: Index Creation Timing**

    IVF indexes should be created **AFTER** inserting initial data for optimal clustering:

.. code-block:: python

    # ✅ CORRECT ORDER:
    client.create_table(Document)
    client.batch_insert(Document, initial_data)  # Insert first
    client.vector_ops.create_ivf("documents", "idx", "embedding", lists=50)  # Index last

    # Then continue normal operations
    client.insert(Document, new_doc)  # ✅ IVF supports dynamic updates

.. code-block:: python

    # ❌ AVOID: Creating index on empty table
    client.create_table(Document)
    client.vector_ops.create_ivf("documents", "idx", "embedding", lists=50)
    client.batch_insert(Document, data)  # Poor initial clustering

**Why?** Initial data helps the IVF algorithm create better-balanced clusters.

**Key Difference from HNSW**:

* **IVF**: Insert data → Create index → Continue updates ✅ (dynamic)
* **HNSW**: Insert ALL data → Create index → Read-only 🚧 (static, updates coming soon)

IVF Index Health Monitoring
~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    from datetime import datetime

    def monitor_ivf_health(client, table_name, column_name, expected_lists):
        """
        Monitor IVF index health - CRITICAL for production vector search.

        Args:
            client: MatrixOne client
            table_name: Table with IVF index
            column_name: Vector column name
            expected_lists: Expected number of centroids
        """
        # ✅ GOOD: Get comprehensive IVF statistics
        stats = client.vector_ops.get_ivf_stats(table_name, column_name)

        distribution = stats['distribution']
        centroid_counts = distribution['centroid_count']

        # Calculate health metrics
        total_centroids = len(centroid_counts)
        total_vectors = sum(centroid_counts)
        min_count = min(centroid_counts) if centroid_counts else 0
        max_count = max(centroid_counts) if centroid_counts else 0
        avg_count = total_vectors / total_centroids if total_centroids > 0 else 0

        # ⭐ KEY METRIC: Balance ratio
        balance_ratio = max_count / min_count if min_count > 0 else float('inf')

        # Health assessment
        print(f"\n{'=' * 60}")
        print(f"IVF Health Report - {table_name}.{column_name}")
        print(f"Timestamp: {datetime.now().isoformat()}")
        print(f"{'=' * 60}")
        print(f"Total Centroids: {total_centroids} (expected: {expected_lists})")
        print(f"Total Vectors: {total_vectors}")
        print(f"Avg/Centroid: {avg_count:.2f}")
        print(f"Balance Ratio: {balance_ratio:.2f}")

        # Status assessment (threshold: <2.0 good, >2.5 rebuild)
        if balance_ratio < 2.0:
            status = "✅ HEALTHY"
            action = "Continue monitoring"
        elif balance_ratio < 2.5:
            status = "⚠️ FAIR"
            action = "Plan rebuild"
        else:
            status = "❌ CRITICAL"
            action = "Rebuild immediately"

        print(f"Status: {status}")
        print(f"Action: {action}")
        print(f"{'=' * 60}\n")

        return {
            'balance_ratio': balance_ratio,
            'total_vectors': total_vectors,
            'status': status,
            'action': action
        }

    # ✅ GOOD: Regular health checks (schedule daily/weekly)
    health = monitor_ivf_health(
        client,
        "documents",
        "embedding",
        expected_lists=100
    )

    # ✅ GOOD: Automated alerting
    if health['balance_ratio'] > 2.5:
        # Send alert (email, Slack, PagerDuty, etc.)
        print(f"🚨 ALERT: Index needs attention! Balance ratio: {health['balance_ratio']:.2f}")
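
If no external scheduler (cron, Airflow, and so on) is available, the health check can be driven by a plain loop in a dedicated maintenance process. The sketch below is illustrative only; it reuses ``monitor_ivf_health`` and the 2.5 threshold from the example above, and in most deployments a real scheduler is preferable.

.. code-block:: python

    import time

    def run_periodic_health_checks(client, interval_seconds=24 * 3600):
        """Run the IVF health check once per interval (daily by default)."""
        while True:
            health = monitor_ivf_health(client, "documents", "embedding", expected_lists=100)
            if health['balance_ratio'] > 2.5:
                # Hook in your own alerting (email, Slack, PagerDuty, etc.)
                print(f"🚨 ALERT: Balance ratio {health['balance_ratio']:.2f} exceeds 2.5")
            time.sleep(interval_seconds)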

IVF Index Rebuild Strategy
~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    import math
    import time

    def rebuild_ivf_index(client, table_name, column_name, index_name):
        """
        Rebuild IVF index with optimal parameters.

        When to rebuild:
        - Balance ratio > 2.5
        - After bulk inserts (>20% new data)
        - Query performance degradation
        - After major deletes or updates
        """
        print(f"Rebuilding IVF index: {table_name}.{column_name}")

        # ✅ GOOD: Get current stats before rebuild
        old_stats = client.vector_ops.get_ivf_stats(table_name, column_name)
        old_counts = old_stats['distribution']['centroid_count']
        total_vectors = sum(old_counts)
        old_balance = max(old_counts) / min(old_counts) if min(old_counts) > 0 else float('inf')

        print(f"Old stats: {total_vectors} vectors, balance {old_balance:.2f}")

        # ✅ GOOD: Calculate optimal lists parameter
        # Rule: lists = √N to 4×√N (where N = total vectors)
        optimal_lists = int(math.sqrt(total_vectors) * 2)  # Using 2×√N
        optimal_lists = max(10, min(optimal_lists, 1000))  # Clamp between 10-1000

        print(f"Calculated optimal lists: {optimal_lists}")

        # ✅ GOOD: Drop and recreate index
        try:
            # Drop old index
            client.vector_ops.drop(table_name, index_name)
            print("✓ Dropped old index")

            # Recreate with optimal parameters
            client.vector_ops.create_ivf(
                table_name,
                name=index_name,
                column=column_name,
                lists=optimal_lists,
                op_type="vector_l2_ops"
            )
            print(f"✓ Created new index with {optimal_lists} lists")

            # ✅ GOOD: Verify new index health
            time.sleep(2)  # Give index time to stabilize

            new_stats = client.vector_ops.get_ivf_stats(table_name, column_name)
            new_counts = new_stats['distribution']['centroid_count']
            new_balance = max(new_counts) / min(new_counts) if min(new_counts) > 0 else float('inf')

            improvement = (old_balance - new_balance) / old_balance * 100

            print("\nRebuild Results:")
            print(f"Old balance: {old_balance:.2f}")
            print(f"New balance: {new_balance:.2f}")
            print(f"Improvement: {improvement:.1f}%")

            if new_balance < 2.0:
                print("✅ Index is now healthy!")
            else:
                print("⚠️ Consider adjusting lists parameter")

        except Exception as e:
            print(f"❌ Rebuild failed: {e}")
            raise

    # Usage in production
    # ✅ GOOD: Schedule during low-traffic periods
    # ✅ GOOD: Check health first, rebuild only if needed
    health = monitor_ivf_health(client, "documents", "embedding", expected_lists=100)
    if health['balance_ratio'] > 2.5:
        rebuild_ivf_index(client, "documents", "embedding", "idx_embedding_ivf")
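
The docstring above also recommends rebuilding after bulk inserts add roughly 20% new data. Below is a minimal sketch of combining that criterion with the balance check, assuming you persist the vector count recorded at the last rebuild yourself (``baseline_vectors`` is an illustrative value, not something the SDK provides):

.. code-block:: python

    def should_rebuild(health, baseline_vectors, growth_threshold=0.20):
        """Rebuild if the index is unbalanced or has grown past the threshold."""
        if health['balance_ratio'] > 2.5:
            return True
        if baseline_vectors and (health['total_vectors'] - baseline_vectors) / baseline_vectors > growth_threshold:
            return True
        return False

    baseline_vectors = 40000  # vector count recorded at the last rebuild (example value)
    health = monitor_ivf_health(client, "documents", "embedding", expected_lists=100)
    if should_rebuild(health, baseline_vectors):
        rebuild_ivf_index(client, "documents", "embedding", "idx_embedding_ivf")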

IVF Index Parameter Selection
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    import math

    # ✅ GOOD: Calculate optimal lists (guideline: <1K: 10-20, 1K-100K: 50-200, >100K: √N to 4×√N)
    total_vectors = 50000
    optimal_lists = int(math.sqrt(total_vectors) * 2)  # Using 2×√N ≈ 447 lists

    client.vector_ops.create_ivf(
        "large_table",
        name="idx_vectors",
        column="embedding",
        lists=optimal_lists,
        op_type="vector_l2_ops"
    )
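
To keep the sizing rule in one place, the 2×√N calculation and the 10-1000 clamp used in the rebuild example can be wrapped in a small helper. This is a sketch of that convention, not an official sizing API:

.. code-block:: python

    import math

    def suggest_lists(total_vectors):
        """Suggest an IVF lists value: 2×√N, clamped to the 10-1000 range."""
        return max(10, min(int(math.sqrt(total_vectors) * 2), 1000))

    print(suggest_lists(500))        # 44  (tiny tables may do fine with 10-20)
    print(suggest_lists(50_000))     # 447
    print(suggest_lists(1_000_000))  # 1000 (clamped)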

Fulltext Index Maintenance
~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    from matrixone import FulltextParserType

    # ✅ GOOD: BM25 for most cases; choose the parser by content type
    client.fulltext_index.create("articles", "idx_content", ["title", "content"], algorithm="BM25")

    # For Chinese: NGRAM parser
    client.fulltext_index.create("chinese_docs", "idx_cn", "content", algorithm="BM25",
                                 parser=FulltextParserType.NGRAM)

    # For JSON: JSON parser (indexes values, not keys)
    client.fulltext_index.create("json_docs", "idx_json", "data", algorithm="BM25",
                                 parser=FulltextParserType.JSON)
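
When the content type is known up front, the parser choice can be captured in a small helper so every index follows the same convention. The mapping below simply mirrors the examples above (default parser for plain text, NGRAM for Chinese, JSON parser for JSON columns) and is illustrative rather than part of the SDK:

.. code-block:: python

    def fulltext_parser_kwargs(content_type):
        """Return parser keyword arguments for a given content type."""
        if content_type == "chinese":
            return {"parser": FulltextParserType.NGRAM}
        if content_type == "json":
            return {"parser": FulltextParserType.JSON}
        return {}  # default parser for plain text

    client.fulltext_index.create("articles", "idx_content", ["title", "content"],
                                 algorithm="BM25", **fulltext_parser_kwargs("text"))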

HNSW Index Considerations
~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    from sqlalchemy import BigInteger, Column
    from matrixone.sqlalchemy_ext import create_vector_column

    # ✅ GOOD: HNSW requires BigInteger primary key
    class Document(Base):
        __tablename__ = 'documents'
        id = Column(BigInteger, primary_key=True)  # Must be BigInteger
        embedding = create_vector_column(128, 'f32')

    # ✅ GOOD: Current workflow
    client.create_table(Document)
    client.batch_insert(Document, all_documents)  # Insert data first

    client.vector_ops.enable_hnsw()
    client.vector_ops.create_hnsw(Document, "idx_embedding", "embedding", m=16)

    # 🚧 Coming Soon: Dynamic updates after index creation
    # Current workaround: Drop index → Modify data → Recreate index
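
Until dynamic updates land, the workaround can be wrapped in a small routine that drops the index, applies the data changes, and recreates the index over the full dataset. This is a sketch composed from the ``drop`` and ``create_hnsw`` calls shown in this guide, not an official maintenance API:

.. code-block:: python

    def refresh_hnsw_index(client, new_documents):
        """Drop → modify data → recreate: the current HNSW update workaround."""
        # 1. Drop the existing HNSW index
        client.vector_ops.drop("documents", "idx_embedding")

        # 2. Apply data changes while no HNSW index exists
        client.batch_insert(Document, new_documents)

        # 3. Recreate the index over the full dataset
        client.vector_ops.create_hnsw(Document, "idx_embedding", "embedding", m=16)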

Batch Operation Size Optimization
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    # ✅ GOOD: Optimal batch sizes for different operations

    # For inserts: 1000-10000 rows per batch
    batch_size = 5000
    for i in range(0, len(large_dataset), batch_size):
        batch = large_dataset[i:i + batch_size]
        client.batch_insert("table_name", batch)
        print(f"Inserted batch {i // batch_size + 1}")

    # For vector data: smaller batches (vectors are larger)
    vector_batch_size = 1000
    for i in range(0, len(vector_data), vector_batch_size):
        batch = vector_data[i:i + vector_batch_size]
        client.batch_insert("vectors_table", batch)

    # ❌ AVOID: Batches that are too large (memory issues)
    # client.batch_insert("table", million_rows)  # May cause OOM

    # ❌ AVOID: Batches that are too small (performance issues)
    # for row in data:
    #     client.insert("table", row)  # Very slow!
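
The chunking pattern above can be factored into a small generator so the batch-size choice lives in one place. The helper itself is illustrative, not part of the SDK:

.. code-block:: python

    def chunked(rows, batch_size):
        """Yield successive slices of ``rows`` with at most ``batch_size`` items."""
        for start in range(0, len(rows), batch_size):
            yield rows[start:start + batch_size]

    for batch in chunked(large_dataset, 5000):   # regular rows
        client.batch_insert("table_name", batch)

    for batch in chunked(vector_data, 1000):     # vector rows: smaller batches
        client.batch_insert("vectors_table", batch)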

Error Handling Best Practices
------------------------------