Skip to content

Commit fcb40fb

Browse files
authored
Merge pull request #927 from NASA-IMPACT/926-improve-indexing-scripts
926 improve indexing scripts
2 parents fb0ded0 + c6bec35 commit fcb40fb

File tree

6 files changed

+370
-113
lines changed

6 files changed

+370
-113
lines changed
Lines changed: 34 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,60 +1,49 @@
1-
# rename this file to config.py and do not track it on git
2-
# examples of how to get the token are in the README.md
3-
# go down to the bottom, to the section called "this is the stuff you want to change"
1+
from sources_to_scrape import (
2+
sources_to_index_test_grid_20240809,
3+
)
44

5-
from sources_to_scrape import sources_to_delete_20231108
6-
7-
# API Config
85
tokens: dict[str, str] = {
96
"test_server": "token here",
107
"ren_server": "token here",
118
}
129

1310
AVAILABLE_INDEXERS_TEST = [
14-
"NodeIndexer1/identity0",
15-
"NodeWebapp1/identity0",
16-
"NodeWebapp2/identity0",
17-
"NodeWebapp3/identity0",
11+
"IndexerServerA/identity0",
12+
"IndexerServerB/identity0",
1813
]
1914

2015
AVAILABLE_INDEXERS_PROD = ["NodeINDEX1/identity0", "NodeINDEX2/identity0"]
2116

2217
TEST_SERVER_INDEXES = [ # this is the test server list
23-
"HELIO_Repository_1",
24-
"SDE_Acronyms",
25-
"SMD_ASTRO_Repository_1",
26-
"SMD_ASTRO_Repository_2",
27-
"SMD_EARTHSCIENCE_Repository_1",
28-
"SMD_GENELAB_Repository_1",
29-
"SMD_PLANETARY_Repository_1",
30-
"SMD_PLANETARY_Repository_2",
18+
# "sde_neural_test_index",
19+
"sde_index"
3120
]
3221

3322
PROD_SERVER_INDEXES = [
3423
# "EDP_Audit_1",
35-
"SMD_LSDA_Repository_1",
36-
# "EDP_UserMetadata_1",
37-
"SMD_NTRS_Repository_1",
38-
"GCMD_Repository_1",
39-
"SMD_PLANETARY_Repository_1",
40-
# "GCMD_Repository_1_Metadata",
41-
"SMD_PLANETARY_Repository_2",
42-
"GCMD_Repository_2",
43-
"STI_Repository_1",
44-
# "GCMD_Repository_3_Metadata",
45-
# "STI_Repository_1_Metadata",
46-
"HELIO_Repository_1",
47-
"STI_Repository_2",
24+
# "SMD_LSDA_Repository_1",
25+
# # "EDP_UserMetadata_1",
26+
# "SMD_NTRS_Repository_1",
27+
# "GCMD_Repository_1",
28+
# "SMD_PLANETARY_Repository_1",
29+
# # "GCMD_Repository_1_Metadata",
30+
# "SMD_PLANETARY_Repository_2",
31+
# "GCMD_Repository_2",
32+
# "STI_Repository_1",
33+
# # "GCMD_Repository_3_Metadata",
34+
# # "STI_Repository_1_Metadata",
35+
# "HELIO_Repository_1",
36+
# "STI_Repository_2",
4837
"SDE_Index",
4938
# "STI_Repository_2_Metadata",
50-
"SMD_ASTRO_Repository_1",
51-
"STI_Repository_3",
52-
"SMD_ASTRO_Repository_2",
53-
"STI_Repository_4",
54-
"SMD_EARTHSCIENCE_Repository_1",
55-
# "SinequaDoc",
56-
"SMD_GENELAB_Repository_1",
57-
"Test",
39+
# "SMD_ASTRO_Repository_1",
40+
# "STI_Repository_3",
41+
# "SMD_ASTRO_Repository_2",
42+
# "STI_Repository_4",
43+
# "SMD_EARTHSCIENCE_Repository_1",
44+
# # "SinequaDoc",
45+
# "SMD_GENELAB_Repository_1",
46+
# "Test",
5847
]
5948

6049

@@ -69,12 +58,14 @@
6958
},
7059
}
7160

72-
# this is the stuff you want to change
73-
collection_list: list[str] = sources_to_delete_20231108 # python list
74-
source = "SMD"
75-
batch_name: str = "delete_everywhere_20231108_test"
61+
# Job Creation Config
62+
collection_list: list[str] = sources_to_index_test_grid_20240809 # python list
63+
date = "20240809"
64+
source = "SDE"
7665
server = "test"
7766

7867
# auto assigned
68+
batch_delete_name: str = f"sources_to_delete_on_{server}_{date}"
69+
batch_index_name: str = f"sources_to_index_on_{server}_{date}"
7970
available_indexers = SERVER_INFO[server]["indexers"]
8071
indexes_to_delete_from = SERVER_INFO[server]["indexes"]

config_generation/delete_server_content.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from db_to_xml_file_based import XmlEditor
44

5-
from config import batch_name, collection_list, indexes_to_delete_from, source
5+
from config import batch_delete_name, collection_list, indexes_to_delete_from, source
66

77
COMMAND_FILES_PATH = "../sinequa_configs/commands/"
88
DELETE_COMMAND_TEMPLATE_PATH = "xmls/delete_template.xml"
@@ -13,5 +13,5 @@
1313
for index in indexes_to_delete_from:
1414
sql = f"delete from {index} where collection='/{source}/{collection}/'"
1515
command_file.update_or_add_element_value(element_name="SQL", element_value=sql, add_duplicate=True)
16-
file_name = f"{COMMAND_FILES_PATH}{batch_name}.xml"
16+
file_name = f"{COMMAND_FILES_PATH}{batch_delete_name}.xml"
1717
command_file._update_config_xml(file_name)

config_generation/generate_jobs.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from db_to_xml_file_based import XmlEditor
99

10-
from config import available_indexers, batch_name, collection_list, source
10+
from config import available_indexers, batch_index_name, collection_list, source
1111

1212

1313
class ParallelJobCreator:
@@ -36,7 +36,10 @@ def _create_job_name(self, collection_name):
3636
this code generates that file name as a string, and it will be passed to the function that
3737
creates the actual job file
3838
"""
39-
return f"collection.indexer.{collection_name}.xml"
39+
if source == "SDE":
40+
return f"collection.indexer.{collection_name}.xml"
41+
else:
42+
return f"collection.indexer.{source}.{collection_name}.xml"
4043

4144
def _create_joblist_name(self, index):
4245
"""
@@ -46,7 +49,7 @@ def _create_joblist_name(self, index):
4649
this code generates that file name as a string, and it will be passed to the function that
4750
creates the actual job file
4851
"""
49-
return f"parallel_indexing_list-{batch_name}-{index}.xml"
52+
return f"parallel_indexing_list-{batch_index_name}-{index}.xml"
5053

5154
def _create_collection_jobs(self):
5255
"""
@@ -89,7 +92,7 @@ def make_all_parallel_jobs(self):
8992
master = XmlEditor(self.joblist_template_path)
9093
master.update_or_add_element_value("RunJobsInParallel", "true")
9194
[master.add_job_list_item(job_name) for job_name in job_names]
92-
master._update_config_xml(f"{self.job_path_root}parallel_indexing_list-{batch_name}-master.xml")
95+
master._update_config_xml(f"{self.job_path_root}parallel_indexing_list-{batch_index_name}-master.xml")
9396

9497

9598
if __name__ == "__main__":

0 commit comments

Comments
 (0)