@@ -36,13 +36,13 @@ def test_indexing_finished_triggers_full_text_fetch(self, mock_fetch):
36
36
37
37
mock_fetch .assert_called_once_with (self .collection .id , "lrm_dev" )
38
38
39
- @patch ("sde_collections.models.collection.Collection.create_plugin_config " )
40
- def test_ready_for_curation_triggers_plugin_config (self , mock_plugin ):
41
- """When status changes to READY_FOR_CURATION, it should create plugin config"""
39
+ @patch ("sde_collections.models.collection.Collection.create_indexer_config " )
40
+ def test_ready_for_curation_triggers_indexer_config (self , mock_indexer ):
41
+ """When status changes to READY_FOR_CURATION, it should create indexer config"""
42
42
self .collection .workflow_status = WorkflowStatusChoices .READY_FOR_CURATION
43
43
self .collection .save ()
44
44
45
- mock_plugin .assert_called_once_with (overwrite = True )
45
+ mock_indexer .assert_called_once_with (overwrite = True )
46
46
47
47
@patch ("sde_collections.models.collection.Collection.promote_to_curated" )
48
48
def test_curated_triggers_promotion (self , mock_promote ):
@@ -119,28 +119,292 @@ def test_full_text_import_workflow(self, MockGitHub, MockApi):
119
119
# Include all the fields that convert_template_to_plugin_indexer checks for
120
120
mock_xml = """<?xml version="1.0" encoding="UTF-8"?>
121
121
<Sinequa>
122
- <KeepHashFragmentInUrl>false</KeepHashFragmentInUrl>
122
+ <connector>crawler2</connector>
123
+ <description></description>
124
+ <identity></identity>
125
+ <indexers></indexers>
126
+ <index></index>
127
+ <domain></domain>
128
+ <treeRoot></treeRoot>
129
+ <Revision>1</Revision>
130
+ <visibility></visibility>
131
+ <ForceReindexation>false</ForceReindexation>
132
+ <Url></Url>
133
+ <Plugin>SMD_Plugins/Sinequa.Plugin.WebCrawler_Index_URLList</Plugin>
134
+ <WorkerCount>3</WorkerCount>
135
+ <MaxWorkerPerHost></MaxWorkerPerHost>
136
+ <UrlList></UrlList>
137
+ <DynamicUrlList></DynamicUrlList>
138
+ <IncludedExtensions></IncludedExtensions>
139
+ <ExcludedExtensions></ExcludedExtensions>
140
+ <IncludedFilenames></IncludedFilenames>
141
+ <ExcludedFilenames></ExcludedFilenames>
142
+ <IncludedFolders></IncludedFolders>
143
+ <ExcludedFolders></ExcludedFolders>
144
+ <EnableNeuralIndexing>true</EnableNeuralIndexing>
145
+ <NeuralSearchSelectionQuery></NeuralSearchSelectionQuery>
146
+ <UrlStayInside>true</UrlStayInside>
147
+ <MaxLevel></MaxLevel>
148
+ <MaxToIndex></MaxToIndex>
149
+ <MaxToCrawl></MaxToCrawl>
150
+ <MaxRedirection></MaxRedirection>
151
+ <CrawlMaxSize></CrawlMaxSize>
152
+ <CrawlTimeout></CrawlTimeout>
153
+ <NormalizeUrls>true</NormalizeUrls>
123
154
<CorrectDomainCookies>false</CorrectDomainCookies>
124
155
<IgnoreSessionCookies>false</IgnoreSessionCookies>
125
- <DownloadImages>false </DownloadImages>
126
- <DownloadMedia>false </DownloadMedia>
156
+ <DownloadImages>true </DownloadImages>
157
+ <DownloadMedia>true </DownloadMedia>
127
158
<DownloadCss>false</DownloadCss>
128
159
<DownloadFtp>true</DownloadFtp>
129
160
<DownloadFile>true</DownloadFile>
130
161
<IndexJs>false</IndexJs>
131
162
<FollowJs>true</FollowJs>
132
163
<CrawlFlash>true</CrawlFlash>
133
- <NormalizeUrls>true</NormalizeUrls>
134
- <NormalizeSecureSchemesWhenTestingVisited>True</NormalizeSecureSchemesWhenTestingVisited>
164
+ <IndexEmptyPages>true</IndexEmptyPages>
165
+ <CrawlWebsphereSeedlist>true</CrawlWebsphereSeedlist>
166
+ <KeepHashFragmentInUrl>false</KeepHashFragmentInUrl>
167
+ <RetryCount></RetryCount>
168
+ <RetryPause></RetryPause>
169
+ <HttpCodesToRetry></HttpCodesToRetry>
170
+ <UseIfModifiedSince>true</UseIfModifiedSince>
171
+ <UseIfNoneMatch>no</UseIfNoneMatch>
172
+ <AcceptWeakETag>false</AcceptWeakETag>
173
+ <ForcedEncoding></ForcedEncoding>
174
+ <UseCompression>false</UseCompression>
175
+ <UseUnsafeHeaderParsing>false</UseUnsafeHeaderParsing>
176
+ <NormalizeSecureSchemesWhenTestingVisited>false</NormalizeSecureSchemesWhenTestingVisited>
177
+ <ExactDeduplication>false</ExactDeduplication>
178
+ <NearDeduplication>false</NearDeduplication>
179
+ <CrawlPauseDelay></CrawlPauseDelay>
180
+ <CrawlPauseCount></CrawlPauseCount>
181
+ <UseRuntimeAutoRedirect>false</UseRuntimeAutoRedirect>
182
+ <RememberDnsFailure>true</RememberDnsFailure>
183
+ <RememberConnectFailure>true</RememberConnectFailure>
184
+ <RememberTrustFailure>true</RememberTrustFailure>
185
+ <RememberProxyNameResolutionFailure>false</RememberProxyNameResolutionFailure>
186
+ <UseRobotsNoIndex>false</UseRobotsNoIndex>
187
+ <UseRobotsNoFollow>true</UseRobotsNoFollow>
188
+ <UseRobotsTxt>false</UseRobotsTxt>
189
+ <RobotsTxtCaseSensitive>false</RobotsTxtCaseSensitive>
190
+ <LoadRobotsTxtSitemapUrls>false</LoadRobotsTxtSitemapUrls>
191
+ <CheckSitemapUrlLastmodInRealtimeMode>false</CheckSitemapUrlLastmodInRealtimeMode>
192
+ <AddRobotsTxtAllowUrlsToSeedList>false</AddRobotsTxtAllowUrlsToSeedList>
193
+ <UseCanonicalLinks>false</UseCanonicalLinks>
194
+ <UseRelNoFollow>false</UseRelNoFollow>
195
+ <DownloadSelectionQuery></DownloadSelectionQuery>
196
+ <FollowSelectionQuery></FollowSelectionQuery>
197
+ <IndexSelectionQuery></IndexSelectionQuery>
198
+ <LoadDefaultTags>true</LoadDefaultTags>
199
+ <LoadDefaultJsTransforms>true</LoadDefaultJsTransforms>
135
200
<UrlAccess>
201
+ <UseDefaultCredentials>true</UseDefaultCredentials>
202
+ <UseDefaultNetworkCredentials>false</UseDefaultNetworkCredentials>
203
+ <User></User>
204
+ <Password></Password>
205
+ <Domain></Domain>
206
+ <UseRfc1945>false</UseRfc1945>
207
+ <Timeout></Timeout>
208
+ <ChangeConnectionGroupNameOnTimeout>false</ChangeConnectionGroupNameOnTimeout>
209
+ <AllowAuthenticatedConnectionSharing>true</AllowAuthenticatedConnectionSharing>
210
+ <PreAuthenticate>false</PreAuthenticate>
211
+ <HttpVersion></HttpVersion>
212
+ <KeepAlive>true</KeepAlive>
213
+ <SecurityProtocol></SecurityProtocol>
214
+ <UserAgent></UserAgent>
215
+ <ClientCertificateFile></ClientCertificateFile>
216
+ <ClientCertificatePassword></ClientCertificatePassword>
217
+ <ClientCertificateStorage></ClientCertificateStorage>
136
218
<AllowXPathCookies>false</AllowXPathCookies>
137
- <UseBrowserForWebRequests>true</UseBrowserForWebRequests>
138
219
<UseHttpClientForWebRequests>false</UseHttpClientForWebRequests>
220
+ <ThrottleManagerCode>expBackoff+headers</ThrottleManagerCode>
221
+ <UseBrowserForWebRequests>false</UseBrowserForWebRequests>
222
+ <BrowserForWebRequestsReadinessThreshold></BrowserForWebRequestsReadinessThreshold>
223
+ <BrowserForWebRequestsInitialDelay></BrowserForWebRequestsInitialDelay>
224
+ <BrowserForWebRequestsMaxTotalDelay></BrowserForWebRequestsMaxTotalDelay>
225
+ <BrowserForWebRequestsMaxResourcesDelay></BrowserForWebRequestsMaxResourcesDelay>
226
+ <BrowserForWebRequestsLogLevel></BrowserForWebRequestsLogLevel>
227
+ <BrowserForWebRequestsViewportWidth></BrowserForWebRequestsViewportWidth>
228
+ <BrowserForWebRequestsViewportHeight></BrowserForWebRequestsViewportHeight>
229
+ <BrowserForWebRequestsAdditionalJavascript></BrowserForWebRequestsAdditionalJavascript>
230
+ <WebConnectionPluginName></WebConnectionPluginName>
231
+ <PostLoginUrl></PostLoginUrl>
232
+ <PostLoginData></PostLoginData>
233
+ <GetBeforePostLogin>false</GetBeforePostLogin>
234
+ <PostLoginAutoRedirect>true</PostLoginAutoRedirect>
235
+ <ReLoginCount></ReLoginCount>
236
+ <ReLoginDelay></ReLoginDelay>
237
+ <DetectHtmlLoginPattern></DetectHtmlLoginPattern>
238
+ <BrowserLogin>
239
+ <Activate>false</Activate>
240
+ <RemoteDebuggingPort></RemoteDebuggingPort>
241
+ <BrowserLogLevel></BrowserLogLevel>
242
+ <ShowDevTools>false</ShowDevTools>
243
+ <SuccessCondition></SuccessCondition>
244
+ <CookieFilter></CookieFilter>
245
+ </BrowserLogin>
246
+ <FtpUser></FtpUser>
247
+ <FtpPassword></FtpPassword>
248
+ <FtpDomain></FtpDomain>
249
+ <FtpUseBinary>true</FtpUseBinary>
250
+ <FtpUsePassive>true</FtpUsePassive>
251
+ <FtpReadWriteTimeout></FtpReadWriteTimeout>
252
+ <FtpTimeout></FtpTimeout>
253
+ <FtpEnableSsl>false</FtpEnableSsl>
254
+ <FileUser></FileUser>
255
+ <FilePassword></FilePassword>
256
+ <FileDomain></FileDomain>
257
+ <FileTimeout></FileTimeout>
258
+ <AmazonS3>
259
+ <AccessKey></AccessKey>
260
+ <SecretKey></SecretKey>
261
+ <RegionEndpoint>eu-west-1</RegionEndpoint>
262
+ <ServiceURL></ServiceURL>
263
+ </AmazonS3>
264
+ <ProxyAutoDetect>true</ProxyAutoDetect>
265
+ <ProxyAddress></ProxyAddress>
266
+ <ProxyBypassOnLocal>true</ProxyBypassOnLocal>
267
+ <ProxyServer></ProxyServer>
268
+ <ProxyPort></ProxyPort>
269
+ <ProxyUseDefaultCredentials>true</ProxyUseDefaultCredentials>
270
+ <ProxyUseDefaultNetworkCredentials>false</ProxyUseDefaultNetworkCredentials>
271
+ <ProxyUser></ProxyUser>
272
+ <ProxyPassword></ProxyPassword>
273
+ <ProxyDomain></ProxyDomain>
139
274
</UrlAccess>
140
- <RetryCount></RetryCount>
141
- <RetryPause></RetryPause>
142
- <AddBaseHref></AddBaseHref>
143
- <AddMetaContentType></AddMetaContentType>
275
+ <System>
276
+ <LogLevel>INFO</LogLevel>
277
+ </System>
278
+ <DisplayLongProperties>false</DisplayLongProperties>
279
+ <LongPropertyLimit></LongPropertyLimit>
280
+ <UsePerformanceMetrics>true</UsePerformanceMetrics>
281
+ <LogPerformanceMetricsPeriodically>false</LogPerformanceMetricsPeriodically>
282
+ <LogPerformanceMetricsPeriod></LogPerformanceMetricsPeriod>
283
+ <PasswordRepository></PasswordRepository>
284
+ <StoreDocumentCache></StoreDocumentCache>
285
+ <AuditEnabled>false</AuditEnabled>
286
+ <SaveDeniedDocs>false</SaveDeniedDocs>
287
+ <SavePropertiesToRegistry>false</SavePropertiesToRegistry>
288
+ <CollectionStateNative>false</CollectionStateNative>
289
+ <HtmlNavigatorNative>true</HtmlNavigatorNative>
290
+ <XPathNavigatorNative>false</XPathNavigatorNative>
291
+ <StatusMaxOk></StatusMaxOk>
292
+ <DelApiSecret></DelApiSecret>
293
+ <IndexerClient>
294
+ <Simulate>false</Simulate>
295
+ <SimulateGetCollectionState>false</SimulateGetCollectionState>
296
+ <QueueMaxCount></QueueMaxCount>
297
+ <SendingThreadFactor></SendingThreadFactor>
298
+ <DirectFileAccess>false</DirectFileAccess>
299
+ <UseCompression>false</UseCompression>
300
+ <SessionIsFinishedWait>false</SessionIsFinishedWait>
301
+ <SendTimeout></SendTimeout>
302
+ <RetryConnectCount></RetryConnectCount>
303
+ <RetryConnectDelay></RetryConnectDelay>
304
+ <RetryTimeout></RetryTimeout>
305
+ <RetrySleep></RetrySleep>
306
+ <DeactivationTimeout></DeactivationTimeout>
307
+ </IndexerClient>
308
+ <Indexation>
309
+ <SimulateLemma>false</SimulateLemma>
310
+ <SimulateEngine>false</SimulateEngine>
311
+ <SimulateCache>false</SimulateCache>
312
+ <SimulateLemmaMin></SimulateLemmaMin>
313
+ <SimulateLemmaMax></SimulateLemmaMax>
314
+ <CollectionStateParallelRowFetch></CollectionStateParallelRowFetch>
315
+ <EngineMetaEnabled>true</EngineMetaEnabled>
316
+ <ThumbnailHeight></ThumbnailHeight>
317
+ <ThumbnailWidth></ThumbnailWidth>
318
+ <ThumbnailSmallTimeout></ThumbnailSmallTimeout>
319
+ <ThumbnailMediumTimeout></ThumbnailMediumTimeout>
320
+ <ThumbnailLargeTimeout></ThumbnailLargeTimeout>
321
+ <SynchThumbnailGen>false</SynchThumbnailGen>
322
+ <StoreInCollectionCache>false</StoreInCollectionCache>
323
+ <GetFilePropertiesFromConverter>false</GetFilePropertiesFromConverter>
324
+ </Indexation>
325
+ <CollectDocumentProperties>true</CollectDocumentProperties>
326
+ <DocCountLimitOnCollectProperties></DocCountLimitOnCollectProperties>
327
+ <ForceBlobSend>false</ForceBlobSend>
328
+ <ContinueOnError>true</ContinueOnError>
329
+ <DoDelete>true</DoDelete>
330
+ <DeleteOnError>false</DeleteOnError>
331
+ <DeleteOnNetworkOrServerError>false</DeleteOnNetworkOrServerError>
332
+ <DeleteOnEnumerationError>false</DeleteOnEnumerationError>
333
+ <AcceptDeleteAll>false</AcceptDeleteAll>
334
+ <DeleteMaxPercentThreshold></DeleteMaxPercentThreshold>
335
+ <DeleteMaxThreshold></DeleteMaxThreshold>
336
+ <DeleteMinRemainingThreshold></DeleteMinRemainingThreshold>
337
+ <SaveCollectionState>false</SaveCollectionState>
338
+ <IncrementalState>false</IncrementalState>
339
+ <RealTimeIncrementalState>true</RealTimeIncrementalState>
340
+ <RealTimeInfoOnError>false</RealTimeInfoOnError>
341
+ <ConversionProxies></ConversionProxies>
342
+ <ConversionPlan></ConversionPlan>
343
+ <AddBaseHref>true</AddBaseHref>
344
+ <AddMetaContentType>false</AddMetaContentType>
345
+ <Throttle></Throttle>
346
+ <DocumentClass></DocumentClass>
347
+ <ConnectorLanguage></ConnectorLanguage>
348
+ <ClearHttpRequestCanonicalizeAsFilePath>true</ClearHttpRequestCanonicalizeAsFilePath>
349
+ <PdfGen>
350
+ <ConverterType></ConverterType>
351
+ <TimeoutSmall></TimeoutSmall>
352
+ <TimeoutMedium></TimeoutMedium>
353
+ <TimeoutLarge></TimeoutLarge>
354
+ </PdfGen>
355
+ <IndexZipContent>false</IndexZipContent>
356
+ <IndexPdfAttachments>false</IndexPdfAttachments>
357
+ <IndexOleAttachments>false</IndexOleAttachments>
358
+ <IndexMsgContent>false</IndexMsgContent>
359
+ <IndexMsgAttachments>false</IndexMsgAttachments>
360
+ <IndexOftContent>false</IndexOftContent>
361
+ <IndexOftAttachments>false</IndexOftAttachments>
362
+ <IndexEmlContent>false</IndexEmlContent>
363
+ <IndexEmlAttachments>false</IndexEmlAttachments>
364
+ <IndexPstContent>false</IndexPstContent>
365
+ <IndexOstContent>false</IndexOstContent>
366
+ <IndexPstMsg>true</IndexPstMsg>
367
+ <IndexPstMsgAttachments>true</IndexPstMsgAttachments>
368
+ <IndexPstContact>false</IndexPstContact>
369
+ <IndexPstCalendar>false</IndexPstCalendar>
370
+ <IndexPstNote>false</IndexPstNote>
371
+ <IndexPstTask>false</IndexPstTask>
372
+ <IndexPstDocument>true</IndexPstDocument>
373
+ <PstUseSafeId>false</PstUseSafeId>
374
+ <IndexArchivesExtensions></IndexArchivesExtensions>
375
+ <ArchiveItemsUseArchiveVersion>false</ArchiveItemsUseArchiveVersion>
376
+ <UseShortAttachmentId>false</UseShortAttachmentId>
377
+ <UseExtendedExtensionGuesser>false</UseExtendedExtensionGuesser>
378
+ <AlwaysScanContainerFiles>false</AlwaysScanContainerFiles>
379
+ <XmpExtensions></XmpExtensions>
380
+ <MediaExtensions></MediaExtensions>
381
+ <ExiftoolExtensions></ExiftoolExtensions>
382
+ <EarlySelectionQuery></EarlySelectionQuery>
383
+ <SelectionQuery></SelectionQuery>
384
+ <AttachmentSelectionQuery></AttachmentSelectionQuery>
385
+ <ArchiveItemSelectionQuery></ArchiveItemSelectionQuery>
386
+ <EngineConnectionWait></EngineConnectionWait>
387
+ <FetchCollectionDataDirectlyFromEngine></FetchCollectionDataDirectlyFromEngine>
388
+ <CalculateGraphBoost>false</CalculateGraphBoost>
389
+ <GraphBoostColumn></GraphBoostColumn>
390
+ <GraphBoostEMColumn></GraphBoostEMColumn>
391
+ <GraphBoostIterations></GraphBoostIterations>
392
+ <GraphBoostPower></GraphBoostPower>
393
+ <GraphBoostAdd></GraphBoostAdd>
394
+ <UseFieldPermissions>false</UseFieldPermissions>
395
+ <ShardIndexes></ShardIndexes>
396
+ <ShardingStrategy></ShardingStrategy>
397
+ <ShardSelections></ShardSelections>
398
+ <CurationType></CurationType>
399
+ <CurationIdPattern></CurationIdPattern>
400
+ <RunIndexMiningInIndexer>false</RunIndexMiningInIndexer>
401
+ <Namespace></Namespace>
402
+ <Mapping>
403
+ <Name>id</Name>
404
+ <Value>doc.url1</Value>
405
+ </Mapping>
406
+ <UrlRefererStayInside>false</UrlRefererStayInside>
407
+ <FollowLinks>false</FollowLinks>
144
408
</Sinequa>"""
145
409
mock_file_contents .decoded_content = mock_xml .encode ("utf-8" )
146
410
mock_github ._get_file_contents .return_value = mock_file_contents
0 commit comments