Skip to content

Commit 96430de

Browse files
author
Your Name
committed
Added a separate get_total_count() function
1 parent 1d4ba0c commit 96430de

File tree

5 files changed

+44
-18
lines changed

5 files changed

+44
-18
lines changed

sde_collections/sinequa_api.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ def get_full_texts(
257257
if total_count is None:
258258
total_count = response.get("TotalRowCount", 0)
259259

260-
yield (self._process_rows_to_records(rows), total_count)
260+
yield (self._process_rows_to_records(rows))
261261

262262
current_offset += len(rows)
263263

@@ -275,6 +275,35 @@ def get_full_texts(
275275
print(f"Reducing batch size to {current_batch_size} and retrying...")
276276
continue
277277

278+
def get_total_count(self, collection_config_folder: str, source: str = None) -> int:
    """
    Return the total number of records Sinequa reports for a collection.

    Issues a zero-row query against the configured index and reads the
    ``TotalRowCount`` field of the response, so no record data is fetched.

    Args:
        collection_config_folder (str): The collection folder to query
            (e.g., "EARTHDATA", "CASEI").
        source (str, optional): The source to query. When falsy, the value
            returned by ``self._get_source_name()`` is used (per the original
            note: "scrapers" for dev servers, "SDE" for other servers).

    Returns:
        int: The total number of records in the collection. A missing or
        null ``TotalRowCount`` is reported as 0.

    Raises:
        ValueError: If the server configuration does not define an index.
    """
    if not source:
        source = self._get_source_name()

    if (index := self.config.get("index")) is None:
        raise ValueError(
            f"Configuration error: Index not defined for server '{self.server_name}'. "
            "Please update server configuration with the required index."
        )

    # COUNT 0 asks for no rows: only the response metadata is of interest here.
    # NOTE(review): index, source and folder are interpolated unescaped; this
    # presumably relies on them coming from trusted configuration - confirm.
    sql = f"SELECT * FROM {index} WHERE collection = '/{source}/{collection_config_folder}/' SKIP 0 COUNT 0"

    response = self._execute_sql_query(sql)

    # Honour the declared ``-> int`` contract: the key may be absent, null, or
    # (depending on how the payload was decoded) a numeric string.
    return int(response.get("TotalRowCount") or 0)
306+
278307
@staticmethod
279308
def _process_full_text_response(batch_data: dict):
280309
if "Rows" not in batch_data or not isinstance(batch_data["Rows"], list):

sde_collections/tasks.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -174,13 +174,13 @@ def fetch_and_replace_full_text(collection_id, server_name):
174174
# Step 1: Delete existing DumpUrl entries
175175
deleted_count, _ = DumpUrl.objects.filter(collection=collection).delete()
176176
print(f"Deleted {deleted_count} old records.")
177-
total_server_count = 0
178177
try:
178+
total_server_count = api.get_total_count(collection.config_folder)
179+
print(f"Total records on the server: {total_server_count}")
180+
179181
# Step 2: Process data in batches
180182
total_processed = 0
181-
for batch, total_count in api.get_full_texts(collection.config_folder):
182-
if total_server_count == 0:
183-
total_server_count = total_count
183+
for batch in api.get_full_texts(collection.config_folder):
184184
with transaction.atomic():
185185
DumpUrl.objects.bulk_create(
186186
[

sde_collections/tests/test_import_fulltexts.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def test_fetch_and_replace_full_text(disconnect_signals):
3030
]
3131

3232
def mock_generator():
33-
yield (mock_batch, len(mock_batch))
33+
yield (mock_batch)
3434

3535
with patch("sde_collections.sinequa_api.Api.get_full_texts") as mock_get_full_texts, patch(
3636
"sde_collections.utils.slack_utils.send_detailed_import_notification"
@@ -61,7 +61,7 @@ def mock_batch_generator():
6161
total_records = 20000
6262

6363
for start in range(0, total_records, batch_size):
64-
yield (create_batch(start, min(batch_size, total_records - start)), total_records)
64+
yield (create_batch(start, min(batch_size, total_records - start)))
6565

6666
with patch("sde_collections.sinequa_api.Api.get_full_texts") as mock_get_full_texts, patch(
6767
"sde_collections.utils.slack_utils.send_detailed_import_notification"

sde_collections/tests/test_sinequa_api.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -170,17 +170,16 @@ def test_get_full_texts_pagination(self, mock_execute_sql, api_instance):
170170

171171
# Collect all batches from the iterator
172172
batches = list(api_instance.get_full_texts("test_folder"))
173-
records_batches = [batch[0] for batch in batches]
174-
assert len(records_batches) == 2 # Should have two batches
175-
assert len(records_batches[0]) == 2 # First batch has 2 records
176-
assert len(records_batches[1]) == 1 # Second batch has 1 record
173+
assert len(batches) == 2 # Should have two batches
174+
assert len(batches[0]) == 2 # First batch has 2 records
175+
assert len(batches[1]) == 1 # Second batch has 1 record
177176

178177
# Verify content of batches
179-
assert records_batches[0] == [
178+
assert batches[0] == [
180179
{"url": "http://example.com/1", "full_text": "Text 1", "title": "Title 1"},
181180
{"url": "http://example.com/2", "full_text": "Text 2", "title": "Title 2"},
182181
]
183-
assert records_batches[1] == [{"url": "http://example.com/3", "full_text": "Text 3", "title": "Title 3"}]
182+
assert batches[1] == [{"url": "http://example.com/3", "full_text": "Text 3", "title": "Title 3"}]
184183

185184
def test_get_full_texts_missing_index(self, api_instance):
186185
"""
@@ -249,12 +248,11 @@ def test_get_full_texts_batch_size_reduction(self, mock_execute_sql, api_instanc
249248
]
250249

251250
batches = list(api_instance.get_full_texts("test_folder", batch_size=100, min_batch_size=1))
252-
records_batches = [batch[0] for batch in batches]
253251

254252
# Verify the batches were processed correctly after size reduction
255253
assert len(batches) == 1
256-
assert len(records_batches[0]) == 1
257-
assert records_batches[0][0]["url"] == "http://example.com/1"
254+
assert len(batches[0]) == 1
255+
assert batches[0][0]["url"] == "http://example.com/1"
258256

259257
# Verify batch size reduction logic
260258
assert mock_execute_sql.call_count == 2

sde_collections/tests/test_workflow_status_triggers.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,7 @@ def test_full_text_import_workflow(self, MockGitHub, MockApi, MockSlackNotificat
149149

150150
# Setup mock API
151151
mock_api = Mock()
152-
total_count = len(self.api_response)
153-
mock_api.get_full_texts.return_value = iter([(self.api_response, total_count)])
152+
mock_api.get_full_texts.return_value = iter([self.api_response])
154153
MockApi.return_value = mock_api
155154

156155
# Setup initial workflow state

0 commit comments

Comments
 (0)