Skip to content

Commit 013d720

Browse files
committed
Updated scraper and indexer template creation. Updated scraper and indexer job creation.
1 parent 3f85f26 commit 013d720

File tree

5 files changed

+864
-28
lines changed

5 files changed

+864
-28
lines changed

config_generation/db_to_xml.py

Lines changed: 244 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -148,35 +148,265 @@ def convert_template_to_scraper(self, collection) -> None:
148148
scraper_config = self.update_config_xml()
149149
return scraper_config
150150

151-
def convert_template_to_plugin_indexer(self, scraper_editor) -> None:
151+
def convert_template_to_job(self, collection, job_type):
    """
    Point a job template at the given collection and return the updated config.

    Assumes this class has been instantiated with the job_template.xml.

    Args:
        collection: object exposing a ``config_folder`` attribute; its value
            is interpolated into the "Collection" element path.
        job_type: "scrapers" writes ``/scrapers/<config_folder>/`` and
            "indexer" writes ``/SDE/<config_folder>/``.

    Returns:
        Whatever ``self.update_config_xml()`` returns (the previous
        ``-> None`` annotation was incorrect: a value is always returned).
    """
    if job_type == "scrapers":
        collection_path = f"/scrapers/{collection.config_folder}/"
    elif job_type == "indexer":
        collection_path = f"/SDE/{collection.config_folder}/"
    else:
        # NOTE(review): any other job_type leaves the template's "Collection"
        # value untouched, as before -- confirm whether callers rely on this
        # or whether it should be rejected.
        collection_path = None

    if collection_path is not None:
        self.update_or_add_element_value("Collection", collection_path)

    return self.update_config_xml()
161+
162+
def convert_template_to_indexer(self, scraper_editor) -> None:
163+
"""
164+
assuming this class has been instantiated with the final_config_template.xml
154165
"""
155166

156167
transfer_fields = [
157-
"KeepHashFragmentInUrl",
168+
"connector",
169+
"description",
170+
"identity",
171+
"indexers",
172+
"index",
173+
"domain",
174+
"treeRoot",
175+
"Revision",
176+
"visibility",
177+
"ForceReindexation",
178+
"WorkerCount",
179+
"MaxWorkerPerHost",
180+
"EnableNeuralIndexing",
181+
"NeuralSearchSelectionQuery",
182+
"MaxLevel",
183+
"MaxToIndex",
184+
"MaxToCrawl",
185+
"MaxRedirection",
186+
"CrawlMaxSize",
187+
"CrawlTimeout",
188+
"NormalizeUrls",
158189
"CorrectDomainCookies",
159190
"IgnoreSessionCookies",
160-
"DownloadImages",
161-
"DownloadMedia",
162191
"DownloadCss",
163192
"DownloadFtp",
164193
"DownloadFile",
165194
"IndexJs",
166195
"FollowJs",
167196
"CrawlFlash",
168-
"NormalizeSecureSchemesWhenTestingVisited",
197+
"IndexEmptyPages",
198+
"CrawlWebsphereSeedlist",
199+
"KeepHashFragmentInUrl",
169200
"RetryCount",
170201
"RetryPause",
202+
"HttpCodesToRetry",
203+
"UseIfModifiedSince",
204+
"UseIfNoneMatch",
205+
"AcceptWeakETag",
206+
"ForcedEncoding",
207+
"UseCompression",
208+
"UseUnsafeHeaderParsing",
209+
"NormalizeSecureSchemesWhenTestingVisited",
210+
"ExactDeduplication",
211+
"NearDeduplication",
212+
"CrawlPauseDelay",
213+
"CrawlPauseCount",
214+
"UseRuntimeAutoRedirect",
215+
"RememberDnsFailure",
216+
"RememberConnectFailure",
217+
"RememberTrustFailure",
218+
"RememberProxyNameResolutionFailure",
219+
"UseRobotsNoFollow",
220+
"UseRobotsTxt",
221+
"RobotsTxtCaseSensitive",
222+
"LoadRobotsTxtSitemapUrls",
223+
"CheckSitemapUrlLastmodInRealtimeMode",
224+
"AddRobotsTxtAllowUrlsToSeedList",
225+
"UseCanonicalLinks",
226+
"UseRelNoFollow",
227+
"DownloadSelectionQuery",
228+
"FollowSelectionQuery",
229+
"IndexSelectionQuery",
230+
"LoadDefaultTags",
231+
"LoadDefaultJsTransforms",
232+
"DisplayLongProperties",
233+
"LongPropertyLimit",
234+
"UsePerformanceMetrics",
235+
"LogPerformanceMetricsPeriodically",
236+
"LogPerformanceMetricsPeriod",
237+
"PasswordRepository",
238+
"StoreDocumentCache",
239+
"AuditEnabled",
240+
"SaveDeniedDocs",
241+
"SavePropertiesToRegistry",
242+
"CollectionStateNative",
243+
"XPathNavigatorNative",
244+
"StatusMaxOk",
245+
"DelApiSecret",
246+
"RealTimeIncrementalState",
247+
"RealTimeInfoOnError",
248+
"ConversionProxies",
249+
"ConversionPlan",
171250
"AddBaseHref",
172251
"AddMetaContentType",
173-
"NormalizeUrls",
252+
"Throttle",
253+
"DocumentClass",
254+
"ConnectorLanguage",
255+
"ClearHttpRequestCanonicalizeAsFilePath",
256+
"IndexZipContent",
257+
"IndexPdfAttachments",
258+
"IndexOleAttachments",
259+
"IndexMsgContent",
260+
"IndexMsgAttachments",
261+
"IndexOftContent",
262+
"IndexOftAttachments",
263+
"IndexEmlContent",
264+
"IndexEmlAttachments",
265+
"IndexPstContent",
266+
"IndexOstContent",
267+
"IndexPstMsg",
268+
"IndexPstMsgAttachments",
269+
"IndexPstContact",
270+
"IndexPstCalendar",
271+
"IndexPstNote",
272+
"IndexPstTask",
273+
"IndexPstDocument",
274+
"PstUseSafeId",
275+
"IndexArchivesExtensions",
276+
"ArchiveItemsUseArchiveVersion",
277+
"UseShortAttachmentId",
278+
"UseExtendedExtensionGuesser",
279+
"AlwaysScanContainerFiles",
280+
"XmpExtensions",
281+
"MediaExtensions",
282+
"ExiftoolExtensions",
283+
"EarlySelectionQuery",
284+
"SelectionQuery",
285+
"AttachmentSelectionQuery",
286+
"ArchiveItemSelectionQuery",
287+
"EngineConnectionWait",
288+
"FetchCollectionDataDirectlyFromEngine",
289+
"CalculateGraphBoost",
290+
"GraphBoostColumn",
291+
"GraphBoostEMColumn",
292+
"GraphBoostIterations",
293+
"GraphBoostPower",
294+
"GraphBoostAdd",
295+
"UseFieldPermissions",
296+
"ShardIndexes",
297+
"ShardingStrategy",
298+
"ShardSelections",
299+
"CurationType",
300+
"CurationIdPattern",
301+
"RunIndexMiningInIndexer",
302+
"Namespace",
174303
]
175304

176305
double_transfer_fields = [
306+
("UrlAccess", "UseDefaultCredentials"),
307+
("UrlAccess", "UseDefaultNetworkCredentials"),
308+
("UrlAccess", "User"),
309+
("UrlAccess", "Password"),
310+
("UrlAccess", "Domain"),
311+
("UrlAccess", "UseRfc1945"),
312+
("UrlAccess", "Timeout"),
313+
("UrlAccess", "ChangeConnectionGroupNameOnTimeout"),
314+
("UrlAccess", "AllowAuthenticatedConnectionSharing"),
315+
("UrlAccess", "PreAuthenticate"),
316+
("UrlAccess", "HttpVersion"),
317+
("UrlAccess", "KeepAlive"),
318+
("UrlAccess", "SecurityProtocol"),
319+
("UrlAccess", "UserAgent"),
320+
("UrlAccess", "ClientCertificateFile"),
321+
("UrlAccess", "ClientCertificatePassword"),
322+
("UrlAccess", "ClientCertificateStorage"),
177323
("UrlAccess", "AllowXPathCookies"),
178-
("UrlAccess", "UseBrowserForWebRequests"),
179324
("UrlAccess", "UseHttpClientForWebRequests"),
325+
("UrlAccess", "ThrottleManagerCode"),
326+
("UrlAccess", "UseBrowserForWebRequests"),
327+
("UrlAccess", "BrowserForWebRequestsReadinessThreshold"),
328+
("UrlAccess", "BrowserForWebRequestsInitialDelay"),
329+
("UrlAccess", "BrowserForWebRequestsMaxTotalDelay"),
330+
("UrlAccess", "BrowserForWebRequestsMaxResourcesDelay"),
331+
("UrlAccess", "BrowserForWebRequestsLogLevel"),
332+
("UrlAccess", "BrowserForWebRequestsViewportWidth"),
333+
("UrlAccess", "BrowserForWebRequestsViewportHeight"),
334+
("UrlAccess", "BrowserForWebRequestsAdditionalJavascript"),
335+
("UrlAccess", "WebConnectionPluginName"),
336+
("UrlAccess", "PostLoginUrl"),
337+
("UrlAccess", "PostLoginData"),
338+
("UrlAccess", "GetBeforePostLogin"),
339+
("UrlAccess", "PostLoginAutoRedirect"),
340+
("UrlAccess", "ReLoginCount"),
341+
("UrlAccess", "ReLoginDelay"),
342+
("UrlAccess", "DetectHtmlLoginPattern"),
343+
("UrlAccess", "FtpUser"),
344+
("UrlAccess", "FtpPassword"),
345+
("UrlAccess", "FtpDomain"),
346+
("UrlAccess", "FtpUseBinary"),
347+
("UrlAccess", "FtpUsePassive"),
348+
("UrlAccess", "FtpReadWriteTimeout"),
349+
("UrlAccess", "FtpTimeout"),
350+
("UrlAccess", "FtpEnableSsl"),
351+
("UrlAccess", "FileUser"),
352+
("UrlAccess", "FilePassword"),
353+
("UrlAccess", "FileDomain"),
354+
("UrlAccess", "FileTimeout"),
355+
("UrlAccess", "ProxyAutoDetect"),
356+
("UrlAccess", "ProxyAddress"),
357+
("UrlAccess", "ProxyBypassOnLocal"),
358+
("UrlAccess", "ProxyServer"),
359+
("UrlAccess", "ProxyPort"),
360+
("UrlAccess", "ProxyUseDefaultCredentials"),
361+
("UrlAccess", "ProxyUseDefaultNetworkCredentials"),
362+
("UrlAccess", "ProxyUser"),
363+
("UrlAccess", "ProxyPassword"),
364+
("UrlAccess", "ProxyDomain"),
365+
("IndexerClient", "Simulate"),
366+
("IndexerClient", "SimulateGetCollectionState"),
367+
("IndexerClient", "QueueMaxCount"),
368+
("IndexerClient", "SendingThreadFactor"),
369+
("IndexerClient", "DirectFileAccess"),
370+
("IndexerClient", "UseCompression"),
371+
("IndexerClient", "SessionIsFinishedWait"),
372+
("IndexerClient", "SendTimeout"),
373+
("IndexerClient", "RetryConnectCount"),
374+
("IndexerClient", "RetryConnectDelay"),
375+
("IndexerClient", "RetryTimeout"),
376+
("IndexerClient", "RetrySleep"),
377+
("IndexerClient", "DeactivationTimeout"),
378+
("Indexation", "SimulateLemma"),
379+
("Indexation", "SimulateEngine"),
380+
("Indexation", "SimulateCache"),
381+
("Indexation", "SimulateLemmaMin"),
382+
("Indexation", "SimulateLemmaMax"),
383+
("Indexation", "CollectionStateParallelRowFetch"),
384+
("Indexation", "EngineMetaEnabled"),
385+
("Indexation", "ThumbnailHeight"),
386+
("Indexation", "ThumbnailWidth"),
387+
("Indexation", "ThumbnailSmallTimeout"),
388+
("Indexation", "ThumbnailMediumTimeout"),
389+
("Indexation", "ThumbnailLargeTimeout"),
390+
("Indexation", "SynchThumbnailGen"),
391+
("Indexation", "StoreInCollectionCache"),
392+
("Indexation", "GetFilePropertiesFromConverter"),
393+
("PdfGen", "ConverterType"),
394+
("PdfGen", "TimeoutSmall"),
395+
("PdfGen", "TimeoutMedium"),
396+
("PdfGen", "TimeoutLarge"),
397+
]
398+
399+
triple_transfer_fields = [
400+
("UrlAccess", "BrowserLogin", "Activate"),
401+
("UrlAccess", "BrowserLogin", "RemoteDebuggingPort"),
402+
("UrlAccess", "BrowserLogin", "BrowserLogLevel"),
403+
("UrlAccess", "BrowserLogin", "ShowDevTools"),
404+
("UrlAccess", "BrowserLogin", "SuccessCondition"),
405+
("UrlAccess", "BrowserLogin", "CookieFilter"),
406+
("UrlAccess", "AmazonS3", "AccessKey"),
407+
("UrlAccess", "AmazonS3", "SecretKey"),
408+
("UrlAccess", "AmazonS3", "RegionEndpoint"),
409+
("UrlAccess", "AmazonS3", "ServiceURL"),
180410
]
181411

182412
for field in transfer_fields:
@@ -187,18 +417,15 @@ def convert_template_to_plugin_indexer(self, scraper_editor) -> None:
187417
f"{parent}/{child}", scraper_editor.get_tag_value(f"{parent}/{child}", strict=True)
188418
)
189419

420+
for grandparent, parent, child in triple_transfer_fields:
421+
self.update_or_add_element_value(
422+
f"{grandparent}/{parent}/{child}",
423+
scraper_editor.get_tag_value(f"{grandparent}/{parent}/{child}", strict=True),
424+
)
425+
190426
scraper_config = self.update_config_xml()
191427
return scraper_config
192428

193-
def convert_template_to_indexer(self, collection) -> None:
194-
"""
195-
assuming this class has been instantiated with the indexer_template.xml
196-
"""
197-
self.update_or_add_element_value("Collection", f"/SDE/{collection.config_folder}/")
198-
indexer_config = self.update_config_xml()
199-
200-
return indexer_config
201-
202429
def _mapping_exists(self, new_mapping: ET.Element):
203430
"""
204431
Check if the mapping with given parameters already exists in the XML tree

config_generation/generate_jobs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def _create_job_name(self, collection_name):
3939
if source == "SDE":
4040
return f"collection.indexer.{collection_name}.xml"
4141
else:
42-
return f"collection.indexer.{source}.{collection_name}.xml"
42+
return f"collection.{source}.{collection_name}.xml"
4343

4444
def _create_joblist_name(self, index):
4545
"""

0 commit comments

Comments
 (0)