@@ -956,17 +956,12 @@ def GITHUB_APP_CLIENT_ID(self):
956956 # Chunk size for elasticsearch reindex celery tasks
957957 ES_TASK_CHUNK_SIZE = 500
958958
959- # Info from Honza about this:
960- # The key to determine shard number is actually usually not the node count,
961- # but the size of your data.
962- # There are advantages to just having a single shard in an index since
963- # you don't have to do the distribute/collect steps when executing a search.
964- # If your data will allow it (not significantly larger than 40GB)
965- # I would recommend going to a single shard and one replica meaning
966- # any of the two nodes will be able to serve any search without talking to the other one.
967- # Scaling to more searches will then just mean adding a third node
968- # and a second replica resulting in immediate 50% bump in max search throughput.
969-
959+ # The number of shards depends on the size of the data; around 30GB per shard is a good rule of thumb.
960+ # Every time we need to re-index, check the size of the index and adjust the number of shards
961+ # if needed (the change lives in the ops repos). This is a static setting: it can't be changed
962+ # after the index is created, so changing it requires creating a new index and reindexing the data.
963+ # The number of replicas can be changed dynamically; one replica is a good default.
964+ # See https://www.elastic.co/docs/deploy-manage/production-guidance/optimize-performance/size-shards.
970965 ES_INDEXES = {
971966 "project" : {
972967 "name" : "project_index" ,
@@ -981,16 +976,6 @@ def GITHUB_APP_CLIENT_ID(self):
981976 },
982977 }
983978
984- # ANALYZER = 'analysis': {
985- # 'analyzer': {
986- # 'default_icu': {
987- # 'type': 'custom',
988- # 'tokenizer': 'icu_tokenizer',
989- # 'filter': ['word_delimiter', 'icu_folding', 'icu_normalizer'],
990- # }
991- # }
992- # }
993-
994979 # Disable auto refresh to increase indexing performance
995980 ELASTICSEARCH_DSL_AUTO_REFRESH = False
996981
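
To make the shard-sizing comment above concrete, here is a minimal sketch of how one might check an index against the ~30GB-per-shard guideline and adjust the replica count dynamically. This is not part of the PR: it assumes the elasticsearch-py 8.x client, a local cluster URL, and reuses the `project_index` name from `ES_INDEXES`; the shard count itself would still require creating a new index and reindexing, as the comment notes.

```python
# Sketch only: assumes elasticsearch-py 8.x and the "project_index" name from ES_INDEXES.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # assumed local cluster URL

# Check the primary store size against the ~30GB-per-shard guideline.
# The shard count is a static setting, so exceeding it means a new index + reindex.
stats = es.indices.stats(index="project_index", metric="store")
size_gb = stats["_all"]["primaries"]["store"]["size_in_bytes"] / 1024**3

settings = es.indices.get_settings(index="project_index")
current_shards = int(settings["project_index"]["settings"]["index"]["number_of_shards"])

print(f"project_index primaries: {size_gb:.1f} GB across {current_shards} shard(s)")
if size_gb / current_shards > 30:
    print("Consider a new index with more shards and a reindex (ops repos change).")

# Replicas, by contrast, are dynamic and can be changed in place on the live index.
es.indices.put_settings(index="project_index", settings={"index": {"number_of_replicas": 1}})
```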