@@ -148,35 +148,265 @@ def convert_template_to_scraper(self, collection) -> None:
148
148
scraper_config = self .update_config_xml ()
149
149
return scraper_config
150
150
151
- def convert_template_to_plugin_indexer (self , scraper_editor ) -> None :
151
def convert_template_to_job(self, collection, job_type):
    """
    Point this job config at the given collection and serialize it.

    Assumes this instance was created from job_template.xml.

    Args:
        collection: object exposing a ``config_folder`` attribute; its value
            becomes the last path segment written to the ``Collection``
            element.
        job_type: ``"scrapers"`` writes ``/scrapers/<config_folder>/``;
            ``"indexer"`` writes ``/SDE/<config_folder>/``.

    Returns:
        Whatever ``self.update_config_xml()`` returns for the updated config.
        (The previous ``-> None`` annotation contradicted this and was
        removed.)

    NOTE(review): any other ``job_type`` leaves the ``Collection`` element
    untouched and the config is still returned -- confirm that silently
    passing the template value through is intended.
    """
    collection_path = None
    if job_type == "scrapers":
        collection_path = f"/scrapers/{collection.config_folder}/"
    elif job_type == "indexer":
        collection_path = f"/SDE/{collection.config_folder}/"

    # Only touch the element for a recognized job type (original behavior).
    if collection_path is not None:
        self.update_or_add_element_value("Collection", collection_path)

    return self.update_config_xml()
161
+
162
+ def convert_template_to_indexer (self , scraper_editor ) -> None :
163
+ """
164
+ assuming this class has been instantiated with the final_config_template.xml
154
165
"""
155
166
156
167
transfer_fields = [
157
- "KeepHashFragmentInUrl" ,
168
+ "connector" ,
169
+ "description" ,
170
+ "identity" ,
171
+ "indexers" ,
172
+ "index" ,
173
+ "domain" ,
174
+ "treeRoot" ,
175
+ "Revision" ,
176
+ "visibility" ,
177
+ "ForceReindexation" ,
178
+ "WorkerCount" ,
179
+ "MaxWorkerPerHost" ,
180
+ "EnableNeuralIndexing" ,
181
+ "NeuralSearchSelectionQuery" ,
182
+ "MaxLevel" ,
183
+ "MaxToIndex" ,
184
+ "MaxToCrawl" ,
185
+ "MaxRedirection" ,
186
+ "CrawlMaxSize" ,
187
+ "CrawlTimeout" ,
188
+ "NormalizeUrls" ,
158
189
"CorrectDomainCookies" ,
159
190
"IgnoreSessionCookies" ,
160
- "DownloadImages" ,
161
- "DownloadMedia" ,
162
191
"DownloadCss" ,
163
192
"DownloadFtp" ,
164
193
"DownloadFile" ,
165
194
"IndexJs" ,
166
195
"FollowJs" ,
167
196
"CrawlFlash" ,
168
- "NormalizeSecureSchemesWhenTestingVisited" ,
197
+ "IndexEmptyPages" ,
198
+ "CrawlWebsphereSeedlist" ,
199
+ "KeepHashFragmentInUrl" ,
169
200
"RetryCount" ,
170
201
"RetryPause" ,
202
+ "HttpCodesToRetry" ,
203
+ "UseIfModifiedSince" ,
204
+ "UseIfNoneMatch" ,
205
+ "AcceptWeakETag" ,
206
+ "ForcedEncoding" ,
207
+ "UseCompression" ,
208
+ "UseUnsafeHeaderParsing" ,
209
+ "NormalizeSecureSchemesWhenTestingVisited" ,
210
+ "ExactDeduplication" ,
211
+ "NearDeduplication" ,
212
+ "CrawlPauseDelay" ,
213
+ "CrawlPauseCount" ,
214
+ "UseRuntimeAutoRedirect" ,
215
+ "RememberDnsFailure" ,
216
+ "RememberConnectFailure" ,
217
+ "RememberTrustFailure" ,
218
+ "RememberProxyNameResolutionFailure" ,
219
+ "UseRobotsNoFollow" ,
220
+ "UseRobotsTxt" ,
221
+ "RobotsTxtCaseSensitive" ,
222
+ "LoadRobotsTxtSitemapUrls" ,
223
+ "CheckSitemapUrlLastmodInRealtimeMode" ,
224
+ "AddRobotsTxtAllowUrlsToSeedList" ,
225
+ "UseCanonicalLinks" ,
226
+ "UseRelNoFollow" ,
227
+ "DownloadSelectionQuery" ,
228
+ "FollowSelectionQuery" ,
229
+ "IndexSelectionQuery" ,
230
+ "LoadDefaultTags" ,
231
+ "LoadDefaultJsTransforms" ,
232
+ "DisplayLongProperties" ,
233
+ "LongPropertyLimit" ,
234
+ "UsePerformanceMetrics" ,
235
+ "LogPerformanceMetricsPeriodically" ,
236
+ "LogPerformanceMetricsPeriod" ,
237
+ "PasswordRepository" ,
238
+ "StoreDocumentCache" ,
239
+ "AuditEnabled" ,
240
+ "SaveDeniedDocs" ,
241
+ "SavePropertiesToRegistry" ,
242
+ "CollectionStateNative" ,
243
+ "XPathNavigatorNative" ,
244
+ "StatusMaxOk" ,
245
+ "DelApiSecret" ,
246
+ "RealTimeIncrementalState" ,
247
+ "RealTimeInfoOnError" ,
248
+ "ConversionProxies" ,
249
+ "ConversionPlan" ,
171
250
"AddBaseHref" ,
172
251
"AddMetaContentType" ,
173
- "NormalizeUrls" ,
252
+ "Throttle" ,
253
+ "DocumentClass" ,
254
+ "ConnectorLanguage" ,
255
+ "ClearHttpRequestCanonicalizeAsFilePath" ,
256
+ "IndexZipContent" ,
257
+ "IndexPdfAttachments" ,
258
+ "IndexOleAttachments" ,
259
+ "IndexMsgContent" ,
260
+ "IndexMsgAttachments" ,
261
+ "IndexOftContent" ,
262
+ "IndexOftAttachments" ,
263
+ "IndexEmlContent" ,
264
+ "IndexEmlAttachments" ,
265
+ "IndexPstContent" ,
266
+ "IndexOstContent" ,
267
+ "IndexPstMsg" ,
268
+ "IndexPstMsgAttachments" ,
269
+ "IndexPstContact" ,
270
+ "IndexPstCalendar" ,
271
+ "IndexPstNote" ,
272
+ "IndexPstTask" ,
273
+ "IndexPstDocument" ,
274
+ "PstUseSafeId" ,
275
+ "IndexArchivesExtensions" ,
276
+ "ArchiveItemsUseArchiveVersion" ,
277
+ "UseShortAttachmentId" ,
278
+ "UseExtendedExtensionGuesser" ,
279
+ "AlwaysScanContainerFiles" ,
280
+ "XmpExtensions" ,
281
+ "MediaExtensions" ,
282
+ "ExiftoolExtensions" ,
283
+ "EarlySelectionQuery" ,
284
+ "SelectionQuery" ,
285
+ "AttachmentSelectionQuery" ,
286
+ "ArchiveItemSelectionQuery" ,
287
+ "EngineConnectionWait" ,
288
+ "FetchCollectionDataDirectlyFromEngine" ,
289
+ "CalculateGraphBoost" ,
290
+ "GraphBoostColumn" ,
291
+ "GraphBoostEMColumn" ,
292
+ "GraphBoostIterations" ,
293
+ "GraphBoostPower" ,
294
+ "GraphBoostAdd" ,
295
+ "UseFieldPermissions" ,
296
+ "ShardIndexes" ,
297
+ "ShardingStrategy" ,
298
+ "ShardSelections" ,
299
+ "CurationType" ,
300
+ "CurationIdPattern" ,
301
+ "RunIndexMiningInIndexer" ,
302
+ "Namespace" ,
174
303
]
175
304
176
305
double_transfer_fields = [
306
+ ("UrlAccess" , "UseDefaultCredentials" ),
307
+ ("UrlAccess" , "UseDefaultNetworkCredentials" ),
308
+ ("UrlAccess" , "User" ),
309
+ ("UrlAccess" , "Password" ),
310
+ ("UrlAccess" , "Domain" ),
311
+ ("UrlAccess" , "UseRfc1945" ),
312
+ ("UrlAccess" , "Timeout" ),
313
+ ("UrlAccess" , "ChangeConnectionGroupNameOnTimeout" ),
314
+ ("UrlAccess" , "AllowAuthenticatedConnectionSharing" ),
315
+ ("UrlAccess" , "PreAuthenticate" ),
316
+ ("UrlAccess" , "HttpVersion" ),
317
+ ("UrlAccess" , "KeepAlive" ),
318
+ ("UrlAccess" , "SecurityProtocol" ),
319
+ ("UrlAccess" , "UserAgent" ),
320
+ ("UrlAccess" , "ClientCertificateFile" ),
321
+ ("UrlAccess" , "ClientCertificatePassword" ),
322
+ ("UrlAccess" , "ClientCertificateStorage" ),
177
323
("UrlAccess" , "AllowXPathCookies" ),
178
- ("UrlAccess" , "UseBrowserForWebRequests" ),
179
324
("UrlAccess" , "UseHttpClientForWebRequests" ),
325
+ ("UrlAccess" , "ThrottleManagerCode" ),
326
+ ("UrlAccess" , "UseBrowserForWebRequests" ),
327
+ ("UrlAccess" , "BrowserForWebRequestsReadinessThreshold" ),
328
+ ("UrlAccess" , "BrowserForWebRequestsInitialDelay" ),
329
+ ("UrlAccess" , "BrowserForWebRequestsMaxTotalDelay" ),
330
+ ("UrlAccess" , "BrowserForWebRequestsMaxResourcesDelay" ),
331
+ ("UrlAccess" , "BrowserForWebRequestsLogLevel" ),
332
+ ("UrlAccess" , "BrowserForWebRequestsViewportWidth" ),
333
+ ("UrlAccess" , "BrowserForWebRequestsViewportHeight" ),
334
+ ("UrlAccess" , "BrowserForWebRequestsAdditionalJavascript" ),
335
+ ("UrlAccess" , "WebConnectionPluginName" ),
336
+ ("UrlAccess" , "PostLoginUrl" ),
337
+ ("UrlAccess" , "PostLoginData" ),
338
+ ("UrlAccess" , "GetBeforePostLogin" ),
339
+ ("UrlAccess" , "PostLoginAutoRedirect" ),
340
+ ("UrlAccess" , "ReLoginCount" ),
341
+ ("UrlAccess" , "ReLoginDelay" ),
342
+ ("UrlAccess" , "DetectHtmlLoginPattern" ),
343
+ ("UrlAccess" , "FtpUser" ),
344
+ ("UrlAccess" , "FtpPassword" ),
345
+ ("UrlAccess" , "FtpDomain" ),
346
+ ("UrlAccess" , "FtpUseBinary" ),
347
+ ("UrlAccess" , "FtpUsePassive" ),
348
+ ("UrlAccess" , "FtpReadWriteTimeout" ),
349
+ ("UrlAccess" , "FtpTimeout" ),
350
+ ("UrlAccess" , "FtpEnableSsl" ),
351
+ ("UrlAccess" , "FileUser" ),
352
+ ("UrlAccess" , "FilePassword" ),
353
+ ("UrlAccess" , "FileDomain" ),
354
+ ("UrlAccess" , "FileTimeout" ),
355
+ ("UrlAccess" , "ProxyAutoDetect" ),
356
+ ("UrlAccess" , "ProxyAddress" ),
357
+ ("UrlAccess" , "ProxyBypassOnLocal" ),
358
+ ("UrlAccess" , "ProxyServer" ),
359
+ ("UrlAccess" , "ProxyPort" ),
360
+ ("UrlAccess" , "ProxyUseDefaultCredentials" ),
361
+ ("UrlAccess" , "ProxyUseDefaultNetworkCredentials" ),
362
+ ("UrlAccess" , "ProxyUser" ),
363
+ ("UrlAccess" , "ProxyPassword" ),
364
+ ("UrlAccess" , "ProxyDomain" ),
365
+ ("IndexerClient" , "Simulate" ),
366
+ ("IndexerClient" , "SimulateGetCollectionState" ),
367
+ ("IndexerClient" , "QueueMaxCount" ),
368
+ ("IndexerClient" , "SendingThreadFactor" ),
369
+ ("IndexerClient" , "DirectFileAccess" ),
370
+ ("IndexerClient" , "UseCompression" ),
371
+ ("IndexerClient" , "SessionIsFinishedWait" ),
372
+ ("IndexerClient" , "SendTimeout" ),
373
+ ("IndexerClient" , "RetryConnectCount" ),
374
+ ("IndexerClient" , "RetryConnectDelay" ),
375
+ ("IndexerClient" , "RetryTimeout" ),
376
+ ("IndexerClient" , "RetrySleep" ),
377
+ ("IndexerClient" , "DeactivationTimeout" ),
378
+ ("Indexation" , "SimulateLemma" ),
379
+ ("Indexation" , "SimulateEngine" ),
380
+ ("Indexation" , "SimulateCache" ),
381
+ ("Indexation" , "SimulateLemmaMin" ),
382
+ ("Indexation" , "SimulateLemmaMax" ),
383
+ ("Indexation" , "CollectionStateParallelRowFetch" ),
384
+ ("Indexation" , "EngineMetaEnabled" ),
385
+ ("Indexation" , "ThumbnailHeight" ),
386
+ ("Indexation" , "ThumbnailWidth" ),
387
+ ("Indexation" , "ThumbnailSmallTimeout" ),
388
+ ("Indexation" , "ThumbnailMediumTimeout" ),
389
+ ("Indexation" , "ThumbnailLargeTimeout" ),
390
+ ("Indexation" , "SynchThumbnailGen" ),
391
+ ("Indexation" , "StoreInCollectionCache" ),
392
+ ("Indexation" , "GetFilePropertiesFromConverter" ),
393
+ ("PdfGen" , "ConverterType" ),
394
+ ("PdfGen" , "TimeoutSmall" ),
395
+ ("PdfGen" , "TimeoutMedium" ),
396
+ ("PdfGen" , "TimeoutLarge" ),
397
+ ]
398
+
399
+ triple_transfer_fields = [
400
+ ("UrlAccess" , "BrowserLogin" , "Activate" ),
401
+ ("UrlAccess" , "BrowserLogin" , "RemoteDebuggingPort" ),
402
+ ("UrlAccess" , "BrowserLogin" , "BrowserLogLevel" ),
403
+ ("UrlAccess" , "BrowserLogin" , "ShowDevTools" ),
404
+ ("UrlAccess" , "BrowserLogin" , "SuccessCondition" ),
405
+ ("UrlAccess" , "BrowserLogin" , "CookieFilter" ),
406
+ ("UrlAccess" , "AmazonS3" , "AccessKey" ),
407
+ ("UrlAccess" , "AmazonS3" , "SecretKey" ),
408
+ ("UrlAccess" , "AmazonS3" , "RegionEndpoint" ),
409
+ ("UrlAccess" , "AmazonS3" , "ServiceURL" ),
180
410
]
181
411
182
412
for field in transfer_fields :
@@ -187,18 +417,15 @@ def convert_template_to_plugin_indexer(self, scraper_editor) -> None:
187
417
f"{ parent } /{ child } " , scraper_editor .get_tag_value (f"{ parent } /{ child } " , strict = True )
188
418
)
189
419
420
+ for grandparent , parent , child in triple_transfer_fields :
421
+ self .update_or_add_element_value (
422
+ f"{ grandparent } /{ parent } /{ child } " ,
423
+ scraper_editor .get_tag_value (f"{ grandparent } /{ parent } /{ child } " , strict = True ),
424
+ )
425
+
190
426
scraper_config = self .update_config_xml ()
191
427
return scraper_config
192
428
193
- def convert_template_to_indexer (self , collection ) -> None :
194
- """
195
- assuming this class has been instantiated with the indexer_template.xml
196
- """
197
- self .update_or_add_element_value ("Collection" , f"/SDE/{ collection .config_folder } /" )
198
- indexer_config = self .update_config_xml ()
199
-
200
- return indexer_config
201
-
202
429
def _mapping_exists (self , new_mapping : ET .Element ):
203
430
"""
204
431
Check if the mapping with given parameters already exists in the XML tree
0 commit comments