@@ -148,14 +148,11 @@ def convert_template_to_scraper(self, collection) -> None:
148
148
scraper_config = self .update_config_xml ()
149
149
return scraper_config
150
150
151
def convert_template_to_job(self, collection, job_source) -> None:
    """Point the job template's "Collection" element at a collection.

    Assumes this class has been instantiated with the job_template.xml.

    The collection path is built as ``/<job_source>/<config_folder>/``.
    The previous revision selected the first segment from a ``job_type``
    flag ("scrapers" -> ``/scrapers/``, "indexer" -> ``/SDE/``); the caller
    now passes that segment directly.

    Args:
        collection: object exposing a ``config_folder`` attribute, used as
            the second path segment.
        job_source: first path segment of the collection path, inserted
            verbatim (no slashes are added or stripped beyond the ones in
            the template below).

    Returns:
        The value returned by ``self.update_config_xml()``.
    """
    # NOTE(review): the signature is annotated ``-> None`` but the method
    # returns the updated config; the concrete type is not visible from
    # here -- confirm and correct the annotation.
    collection_path = f"/{job_source}/{collection.config_folder}/"
    self.update_or_add_element_value("Collection", collection_path)
    return self.update_config_xml()
161
158
@@ -165,164 +162,10 @@ def convert_template_to_indexer(self, scraper_editor) -> None:
165
162
"""
166
163
167
164
transfer_fields = [
168
- "connector" ,
169
- "description" ,
170
- "identity" ,
171
- "indexers" ,
172
- "index" ,
173
- "domain" ,
174
- "treeRoot" ,
175
- "Revision" ,
176
- "visibility" ,
177
- "ForceReindexation" ,
178
- "WorkerCount" ,
179
- "MaxWorkerPerHost" ,
180
- "EnableNeuralIndexing" ,
181
- "NeuralSearchSelectionQuery" ,
182
- "MaxLevel" ,
183
- "MaxToIndex" ,
184
- "MaxToCrawl" ,
185
- "MaxRedirection" ,
186
- "CrawlMaxSize" ,
187
- "CrawlTimeout" ,
188
- "NormalizeUrls" ,
189
- "CorrectDomainCookies" ,
190
- "IgnoreSessionCookies" ,
191
- "DownloadCss" ,
192
- "DownloadFtp" ,
193
- "DownloadFile" ,
194
- "IndexJs" ,
195
- "FollowJs" ,
196
- "CrawlFlash" ,
197
- "IndexEmptyPages" ,
198
- "CrawlWebsphereSeedlist" ,
199
- "KeepHashFragmentInUrl" ,
200
- "RetryCount" ,
201
- "RetryPause" ,
202
- "HttpCodesToRetry" ,
203
- "UseIfModifiedSince" ,
204
- "UseIfNoneMatch" ,
205
- "AcceptWeakETag" ,
206
- "ForcedEncoding" ,
207
- "UseCompression" ,
208
- "UseUnsafeHeaderParsing" ,
209
- "NormalizeSecureSchemesWhenTestingVisited" ,
210
- "ExactDeduplication" ,
211
- "NearDeduplication" ,
212
- "CrawlPauseDelay" ,
213
- "CrawlPauseCount" ,
214
- "UseRuntimeAutoRedirect" ,
215
- "RememberDnsFailure" ,
216
- "RememberConnectFailure" ,
217
- "RememberTrustFailure" ,
218
- "RememberProxyNameResolutionFailure" ,
219
- "UseRobotsNoFollow" ,
220
- "UseRobotsTxt" ,
221
- "RobotsTxtCaseSensitive" ,
222
- "LoadRobotsTxtSitemapUrls" ,
223
- "CheckSitemapUrlLastmodInRealtimeMode" ,
224
- "AddRobotsTxtAllowUrlsToSeedList" ,
225
- "UseCanonicalLinks" ,
226
- "UseRelNoFollow" ,
227
- "DownloadSelectionQuery" ,
228
- "FollowSelectionQuery" ,
229
- "IndexSelectionQuery" ,
230
- "LoadDefaultTags" ,
231
- "LoadDefaultJsTransforms" ,
232
- "DisplayLongProperties" ,
233
- "LongPropertyLimit" ,
234
- "UsePerformanceMetrics" ,
235
- "LogPerformanceMetricsPeriodically" ,
236
- "LogPerformanceMetricsPeriod" ,
237
- "PasswordRepository" ,
238
- "StoreDocumentCache" ,
239
- "AuditEnabled" ,
240
- "SaveDeniedDocs" ,
241
- "SavePropertiesToRegistry" ,
242
- "CollectionStateNative" ,
243
- "XPathNavigatorNative" ,
244
- "StatusMaxOk" ,
245
- "DelApiSecret" ,
246
- "RealTimeIncrementalState" ,
247
- "RealTimeInfoOnError" ,
248
- "ConversionProxies" ,
249
- "ConversionPlan" ,
250
- "AddBaseHref" ,
251
- "AddMetaContentType" ,
252
165
"Throttle" ,
253
- "DocumentClass" ,
254
- "ConnectorLanguage" ,
255
- "ClearHttpRequestCanonicalizeAsFilePath" ,
256
- "IndexZipContent" ,
257
- "IndexPdfAttachments" ,
258
- "IndexOleAttachments" ,
259
- "IndexMsgContent" ,
260
- "IndexMsgAttachments" ,
261
- "IndexOftContent" ,
262
- "IndexOftAttachments" ,
263
- "IndexEmlContent" ,
264
- "IndexEmlAttachments" ,
265
- "IndexPstContent" ,
266
- "IndexOstContent" ,
267
- "IndexPstMsg" ,
268
- "IndexPstMsgAttachments" ,
269
- "IndexPstContact" ,
270
- "IndexPstCalendar" ,
271
- "IndexPstNote" ,
272
- "IndexPstTask" ,
273
- "IndexPstDocument" ,
274
- "PstUseSafeId" ,
275
- "IndexArchivesExtensions" ,
276
- "ArchiveItemsUseArchiveVersion" ,
277
- "UseShortAttachmentId" ,
278
- "UseExtendedExtensionGuesser" ,
279
- "AlwaysScanContainerFiles" ,
280
- "XmpExtensions" ,
281
- "MediaExtensions" ,
282
- "ExiftoolExtensions" ,
283
- "EarlySelectionQuery" ,
284
- "SelectionQuery" ,
285
- "AttachmentSelectionQuery" ,
286
- "ArchiveItemSelectionQuery" ,
287
- "EngineConnectionWait" ,
288
- "FetchCollectionDataDirectlyFromEngine" ,
289
- "CalculateGraphBoost" ,
290
- "GraphBoostColumn" ,
291
- "GraphBoostEMColumn" ,
292
- "GraphBoostIterations" ,
293
- "GraphBoostPower" ,
294
- "GraphBoostAdd" ,
295
- "UseFieldPermissions" ,
296
- "ShardIndexes" ,
297
- "ShardingStrategy" ,
298
- "ShardSelections" ,
299
- "CurationType" ,
300
- "CurationIdPattern" ,
301
- "RunIndexMiningInIndexer" ,
302
- "Namespace" ,
303
166
]
304
167
305
168
double_transfer_fields = [
306
- ("UrlAccess" , "UseDefaultCredentials" ),
307
- ("UrlAccess" , "UseDefaultNetworkCredentials" ),
308
- ("UrlAccess" , "User" ),
309
- ("UrlAccess" , "Password" ),
310
- ("UrlAccess" , "Domain" ),
311
- ("UrlAccess" , "UseRfc1945" ),
312
- ("UrlAccess" , "Timeout" ),
313
- ("UrlAccess" , "ChangeConnectionGroupNameOnTimeout" ),
314
- ("UrlAccess" , "AllowAuthenticatedConnectionSharing" ),
315
- ("UrlAccess" , "PreAuthenticate" ),
316
- ("UrlAccess" , "HttpVersion" ),
317
- ("UrlAccess" , "KeepAlive" ),
318
- ("UrlAccess" , "SecurityProtocol" ),
319
- ("UrlAccess" , "UserAgent" ),
320
- ("UrlAccess" , "ClientCertificateFile" ),
321
- ("UrlAccess" , "ClientCertificatePassword" ),
322
- ("UrlAccess" , "ClientCertificateStorage" ),
323
- ("UrlAccess" , "AllowXPathCookies" ),
324
- ("UrlAccess" , "UseHttpClientForWebRequests" ),
325
- ("UrlAccess" , "ThrottleManagerCode" ),
326
169
("UrlAccess" , "UseBrowserForWebRequests" ),
327
170
("UrlAccess" , "BrowserForWebRequestsReadinessThreshold" ),
328
171
("UrlAccess" , "BrowserForWebRequestsInitialDelay" ),
@@ -332,68 +175,15 @@ def convert_template_to_indexer(self, scraper_editor) -> None:
332
175
("UrlAccess" , "BrowserForWebRequestsViewportWidth" ),
333
176
("UrlAccess" , "BrowserForWebRequestsViewportHeight" ),
334
177
("UrlAccess" , "BrowserForWebRequestsAdditionalJavascript" ),
335
- ("UrlAccess" , "WebConnectionPluginName" ),
336
178
("UrlAccess" , "PostLoginUrl" ),
337
179
("UrlAccess" , "PostLoginData" ),
338
180
("UrlAccess" , "GetBeforePostLogin" ),
339
181
("UrlAccess" , "PostLoginAutoRedirect" ),
340
182
("UrlAccess" , "ReLoginCount" ),
341
183
("UrlAccess" , "ReLoginDelay" ),
342
184
("UrlAccess" , "DetectHtmlLoginPattern" ),
343
- ("UrlAccess" , "FtpUser" ),
344
- ("UrlAccess" , "FtpPassword" ),
345
- ("UrlAccess" , "FtpDomain" ),
346
- ("UrlAccess" , "FtpUseBinary" ),
347
- ("UrlAccess" , "FtpUsePassive" ),
348
- ("UrlAccess" , "FtpReadWriteTimeout" ),
349
- ("UrlAccess" , "FtpTimeout" ),
350
- ("UrlAccess" , "FtpEnableSsl" ),
351
- ("UrlAccess" , "FileUser" ),
352
- ("UrlAccess" , "FilePassword" ),
353
- ("UrlAccess" , "FileDomain" ),
354
- ("UrlAccess" , "FileTimeout" ),
355
- ("UrlAccess" , "ProxyAutoDetect" ),
356
- ("UrlAccess" , "ProxyAddress" ),
357
- ("UrlAccess" , "ProxyBypassOnLocal" ),
358
- ("UrlAccess" , "ProxyServer" ),
359
- ("UrlAccess" , "ProxyPort" ),
360
- ("UrlAccess" , "ProxyUseDefaultCredentials" ),
361
- ("UrlAccess" , "ProxyUseDefaultNetworkCredentials" ),
362
- ("UrlAccess" , "ProxyUser" ),
363
- ("UrlAccess" , "ProxyPassword" ),
364
- ("UrlAccess" , "ProxyDomain" ),
365
- ("IndexerClient" , "Simulate" ),
366
- ("IndexerClient" , "SimulateGetCollectionState" ),
367
- ("IndexerClient" , "QueueMaxCount" ),
368
- ("IndexerClient" , "SendingThreadFactor" ),
369
- ("IndexerClient" , "DirectFileAccess" ),
370
- ("IndexerClient" , "UseCompression" ),
371
- ("IndexerClient" , "SessionIsFinishedWait" ),
372
- ("IndexerClient" , "SendTimeout" ),
373
- ("IndexerClient" , "RetryConnectCount" ),
374
- ("IndexerClient" , "RetryConnectDelay" ),
375
185
("IndexerClient" , "RetryTimeout" ),
376
186
("IndexerClient" , "RetrySleep" ),
377
- ("IndexerClient" , "DeactivationTimeout" ),
378
- ("Indexation" , "SimulateLemma" ),
379
- ("Indexation" , "SimulateEngine" ),
380
- ("Indexation" , "SimulateCache" ),
381
- ("Indexation" , "SimulateLemmaMin" ),
382
- ("Indexation" , "SimulateLemmaMax" ),
383
- ("Indexation" , "CollectionStateParallelRowFetch" ),
384
- ("Indexation" , "EngineMetaEnabled" ),
385
- ("Indexation" , "ThumbnailHeight" ),
386
- ("Indexation" , "ThumbnailWidth" ),
387
- ("Indexation" , "ThumbnailSmallTimeout" ),
388
- ("Indexation" , "ThumbnailMediumTimeout" ),
389
- ("Indexation" , "ThumbnailLargeTimeout" ),
390
- ("Indexation" , "SynchThumbnailGen" ),
391
- ("Indexation" , "StoreInCollectionCache" ),
392
- ("Indexation" , "GetFilePropertiesFromConverter" ),
393
- ("PdfGen" , "ConverterType" ),
394
- ("PdfGen" , "TimeoutSmall" ),
395
- ("PdfGen" , "TimeoutMedium" ),
396
- ("PdfGen" , "TimeoutLarge" ),
397
187
]
398
188
399
189
triple_transfer_fields = [
@@ -403,10 +193,6 @@ def convert_template_to_indexer(self, scraper_editor) -> None:
403
193
("UrlAccess" , "BrowserLogin" , "ShowDevTools" ),
404
194
("UrlAccess" , "BrowserLogin" , "SuccessCondition" ),
405
195
("UrlAccess" , "BrowserLogin" , "CookieFilter" ),
406
- ("UrlAccess" , "AmazonS3" , "AccessKey" ),
407
- ("UrlAccess" , "AmazonS3" , "SecretKey" ),
408
- ("UrlAccess" , "AmazonS3" , "RegionEndpoint" ),
409
- ("UrlAccess" , "AmazonS3" , "ServiceURL" ),
410
196
]
411
197
412
198
for field in transfer_fields :
0 commit comments