Skip to content

Commit a44d3e3

Browse files
committed
Updated indexing templates. convert_template_to_job and transfer fields updated.
1 parent 3f95184 commit a44d3e3

File tree

4 files changed: +243 -450 lines changed

config_generation/db_to_xml.py

Lines changed: 2 additions & 216 deletions
Original file line number | Diff line number | Diff line change
@@ -148,14 +148,11 @@ def convert_template_to_scraper(self, collection) -> None:
148148
scraper_config = self.update_config_xml()
149149
return scraper_config
150150

151-
def convert_template_to_job(self, collection, job_type) -> None:
151+
def convert_template_to_job(self, collection, job_source) -> None:
152152
"""
153153
assuming this class has been instantiated with the job_template.xml
154154
"""
155-
if job_type == "scrapers":
156-
self.update_or_add_element_value("Collection", f"/scrapers/{collection.config_folder}/")
157-
elif job_type == "indexer":
158-
self.update_or_add_element_value("Collection", f"/SDE/{collection.config_folder}/")
155+
self.update_or_add_element_value("Collection", f"/{job_source}/{collection.config_folder}/")
159156
job_config = self.update_config_xml()
160157
return job_config
161158

@@ -165,164 +162,10 @@ def convert_template_to_indexer(self, scraper_editor) -> None:
165162
"""
166163

167164
transfer_fields = [
168-
"connector",
169-
"description",
170-
"identity",
171-
"indexers",
172-
"index",
173-
"domain",
174-
"treeRoot",
175-
"Revision",
176-
"visibility",
177-
"ForceReindexation",
178-
"WorkerCount",
179-
"MaxWorkerPerHost",
180-
"EnableNeuralIndexing",
181-
"NeuralSearchSelectionQuery",
182-
"MaxLevel",
183-
"MaxToIndex",
184-
"MaxToCrawl",
185-
"MaxRedirection",
186-
"CrawlMaxSize",
187-
"CrawlTimeout",
188-
"NormalizeUrls",
189-
"CorrectDomainCookies",
190-
"IgnoreSessionCookies",
191-
"DownloadCss",
192-
"DownloadFtp",
193-
"DownloadFile",
194-
"IndexJs",
195-
"FollowJs",
196-
"CrawlFlash",
197-
"IndexEmptyPages",
198-
"CrawlWebsphereSeedlist",
199-
"KeepHashFragmentInUrl",
200-
"RetryCount",
201-
"RetryPause",
202-
"HttpCodesToRetry",
203-
"UseIfModifiedSince",
204-
"UseIfNoneMatch",
205-
"AcceptWeakETag",
206-
"ForcedEncoding",
207-
"UseCompression",
208-
"UseUnsafeHeaderParsing",
209-
"NormalizeSecureSchemesWhenTestingVisited",
210-
"ExactDeduplication",
211-
"NearDeduplication",
212-
"CrawlPauseDelay",
213-
"CrawlPauseCount",
214-
"UseRuntimeAutoRedirect",
215-
"RememberDnsFailure",
216-
"RememberConnectFailure",
217-
"RememberTrustFailure",
218-
"RememberProxyNameResolutionFailure",
219-
"UseRobotsNoFollow",
220-
"UseRobotsTxt",
221-
"RobotsTxtCaseSensitive",
222-
"LoadRobotsTxtSitemapUrls",
223-
"CheckSitemapUrlLastmodInRealtimeMode",
224-
"AddRobotsTxtAllowUrlsToSeedList",
225-
"UseCanonicalLinks",
226-
"UseRelNoFollow",
227-
"DownloadSelectionQuery",
228-
"FollowSelectionQuery",
229-
"IndexSelectionQuery",
230-
"LoadDefaultTags",
231-
"LoadDefaultJsTransforms",
232-
"DisplayLongProperties",
233-
"LongPropertyLimit",
234-
"UsePerformanceMetrics",
235-
"LogPerformanceMetricsPeriodically",
236-
"LogPerformanceMetricsPeriod",
237-
"PasswordRepository",
238-
"StoreDocumentCache",
239-
"AuditEnabled",
240-
"SaveDeniedDocs",
241-
"SavePropertiesToRegistry",
242-
"CollectionStateNative",
243-
"XPathNavigatorNative",
244-
"StatusMaxOk",
245-
"DelApiSecret",
246-
"RealTimeIncrementalState",
247-
"RealTimeInfoOnError",
248-
"ConversionProxies",
249-
"ConversionPlan",
250-
"AddBaseHref",
251-
"AddMetaContentType",
252165
"Throttle",
253-
"DocumentClass",
254-
"ConnectorLanguage",
255-
"ClearHttpRequestCanonicalizeAsFilePath",
256-
"IndexZipContent",
257-
"IndexPdfAttachments",
258-
"IndexOleAttachments",
259-
"IndexMsgContent",
260-
"IndexMsgAttachments",
261-
"IndexOftContent",
262-
"IndexOftAttachments",
263-
"IndexEmlContent",
264-
"IndexEmlAttachments",
265-
"IndexPstContent",
266-
"IndexOstContent",
267-
"IndexPstMsg",
268-
"IndexPstMsgAttachments",
269-
"IndexPstContact",
270-
"IndexPstCalendar",
271-
"IndexPstNote",
272-
"IndexPstTask",
273-
"IndexPstDocument",
274-
"PstUseSafeId",
275-
"IndexArchivesExtensions",
276-
"ArchiveItemsUseArchiveVersion",
277-
"UseShortAttachmentId",
278-
"UseExtendedExtensionGuesser",
279-
"AlwaysScanContainerFiles",
280-
"XmpExtensions",
281-
"MediaExtensions",
282-
"ExiftoolExtensions",
283-
"EarlySelectionQuery",
284-
"SelectionQuery",
285-
"AttachmentSelectionQuery",
286-
"ArchiveItemSelectionQuery",
287-
"EngineConnectionWait",
288-
"FetchCollectionDataDirectlyFromEngine",
289-
"CalculateGraphBoost",
290-
"GraphBoostColumn",
291-
"GraphBoostEMColumn",
292-
"GraphBoostIterations",
293-
"GraphBoostPower",
294-
"GraphBoostAdd",
295-
"UseFieldPermissions",
296-
"ShardIndexes",
297-
"ShardingStrategy",
298-
"ShardSelections",
299-
"CurationType",
300-
"CurationIdPattern",
301-
"RunIndexMiningInIndexer",
302-
"Namespace",
303166
]
304167

305168
double_transfer_fields = [
306-
("UrlAccess", "UseDefaultCredentials"),
307-
("UrlAccess", "UseDefaultNetworkCredentials"),
308-
("UrlAccess", "User"),
309-
("UrlAccess", "Password"),
310-
("UrlAccess", "Domain"),
311-
("UrlAccess", "UseRfc1945"),
312-
("UrlAccess", "Timeout"),
313-
("UrlAccess", "ChangeConnectionGroupNameOnTimeout"),
314-
("UrlAccess", "AllowAuthenticatedConnectionSharing"),
315-
("UrlAccess", "PreAuthenticate"),
316-
("UrlAccess", "HttpVersion"),
317-
("UrlAccess", "KeepAlive"),
318-
("UrlAccess", "SecurityProtocol"),
319-
("UrlAccess", "UserAgent"),
320-
("UrlAccess", "ClientCertificateFile"),
321-
("UrlAccess", "ClientCertificatePassword"),
322-
("UrlAccess", "ClientCertificateStorage"),
323-
("UrlAccess", "AllowXPathCookies"),
324-
("UrlAccess", "UseHttpClientForWebRequests"),
325-
("UrlAccess", "ThrottleManagerCode"),
326169
("UrlAccess", "UseBrowserForWebRequests"),
327170
("UrlAccess", "BrowserForWebRequestsReadinessThreshold"),
328171
("UrlAccess", "BrowserForWebRequestsInitialDelay"),
@@ -332,68 +175,15 @@ def convert_template_to_indexer(self, scraper_editor) -> None:
332175
("UrlAccess", "BrowserForWebRequestsViewportWidth"),
333176
("UrlAccess", "BrowserForWebRequestsViewportHeight"),
334177
("UrlAccess", "BrowserForWebRequestsAdditionalJavascript"),
335-
("UrlAccess", "WebConnectionPluginName"),
336178
("UrlAccess", "PostLoginUrl"),
337179
("UrlAccess", "PostLoginData"),
338180
("UrlAccess", "GetBeforePostLogin"),
339181
("UrlAccess", "PostLoginAutoRedirect"),
340182
("UrlAccess", "ReLoginCount"),
341183
("UrlAccess", "ReLoginDelay"),
342184
("UrlAccess", "DetectHtmlLoginPattern"),
343-
("UrlAccess", "FtpUser"),
344-
("UrlAccess", "FtpPassword"),
345-
("UrlAccess", "FtpDomain"),
346-
("UrlAccess", "FtpUseBinary"),
347-
("UrlAccess", "FtpUsePassive"),
348-
("UrlAccess", "FtpReadWriteTimeout"),
349-
("UrlAccess", "FtpTimeout"),
350-
("UrlAccess", "FtpEnableSsl"),
351-
("UrlAccess", "FileUser"),
352-
("UrlAccess", "FilePassword"),
353-
("UrlAccess", "FileDomain"),
354-
("UrlAccess", "FileTimeout"),
355-
("UrlAccess", "ProxyAutoDetect"),
356-
("UrlAccess", "ProxyAddress"),
357-
("UrlAccess", "ProxyBypassOnLocal"),
358-
("UrlAccess", "ProxyServer"),
359-
("UrlAccess", "ProxyPort"),
360-
("UrlAccess", "ProxyUseDefaultCredentials"),
361-
("UrlAccess", "ProxyUseDefaultNetworkCredentials"),
362-
("UrlAccess", "ProxyUser"),
363-
("UrlAccess", "ProxyPassword"),
364-
("UrlAccess", "ProxyDomain"),
365-
("IndexerClient", "Simulate"),
366-
("IndexerClient", "SimulateGetCollectionState"),
367-
("IndexerClient", "QueueMaxCount"),
368-
("IndexerClient", "SendingThreadFactor"),
369-
("IndexerClient", "DirectFileAccess"),
370-
("IndexerClient", "UseCompression"),
371-
("IndexerClient", "SessionIsFinishedWait"),
372-
("IndexerClient", "SendTimeout"),
373-
("IndexerClient", "RetryConnectCount"),
374-
("IndexerClient", "RetryConnectDelay"),
375185
("IndexerClient", "RetryTimeout"),
376186
("IndexerClient", "RetrySleep"),
377-
("IndexerClient", "DeactivationTimeout"),
378-
("Indexation", "SimulateLemma"),
379-
("Indexation", "SimulateEngine"),
380-
("Indexation", "SimulateCache"),
381-
("Indexation", "SimulateLemmaMin"),
382-
("Indexation", "SimulateLemmaMax"),
383-
("Indexation", "CollectionStateParallelRowFetch"),
384-
("Indexation", "EngineMetaEnabled"),
385-
("Indexation", "ThumbnailHeight"),
386-
("Indexation", "ThumbnailWidth"),
387-
("Indexation", "ThumbnailSmallTimeout"),
388-
("Indexation", "ThumbnailMediumTimeout"),
389-
("Indexation", "ThumbnailLargeTimeout"),
390-
("Indexation", "SynchThumbnailGen"),
391-
("Indexation", "StoreInCollectionCache"),
392-
("Indexation", "GetFilePropertiesFromConverter"),
393-
("PdfGen", "ConverterType"),
394-
("PdfGen", "TimeoutSmall"),
395-
("PdfGen", "TimeoutMedium"),
396-
("PdfGen", "TimeoutLarge"),
397187
]
398188

399189
triple_transfer_fields = [
@@ -403,10 +193,6 @@ def convert_template_to_indexer(self, scraper_editor) -> None:
403193
("UrlAccess", "BrowserLogin", "ShowDevTools"),
404194
("UrlAccess", "BrowserLogin", "SuccessCondition"),
405195
("UrlAccess", "BrowserLogin", "CookieFilter"),
406-
("UrlAccess", "AmazonS3", "AccessKey"),
407-
("UrlAccess", "AmazonS3", "SecretKey"),
408-
("UrlAccess", "AmazonS3", "RegionEndpoint"),
409-
("UrlAccess", "AmazonS3", "ServiceURL"),
410196
]
411197

412198
for field in transfer_fields:

config_generation/xmls/initial_config_template.xml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,14 +6,14 @@
66
<indexers></indexers>
77
<index></index>
88
<domain></domain>
9-
<treeRoot>treeroot</treeRoot>
9+
<treeRoot></treeRoot>
1010
<Revision>1</Revision>
1111
<visibility></visibility>
1212
<ForceReindexation>false</ForceReindexation>
1313
<Plugin></Plugin>
1414
<WorkerCount>6</WorkerCount>
1515
<MaxWorkerPerHost></MaxWorkerPerHost>
16-
<Url>url</Url>
16+
<Url></Url>
1717
<UrlList></UrlList>
1818
<DynamicUrlList></DynamicUrlList>
1919
<IncludedExtensions>html;htm;xlsx;xls;xlsm;doc;docx;ppt;pdf</IncludedExtensions>

0 commit comments

Comments
 (0)