Skip to content

Commit 3d11f2e

Browse files
Merge branch 'dev' into 1030-resolve-0-value-document-type-in-nasa_science
2 parents a4f3cb3 + 16aae30 commit 3d11f2e

File tree

7 files changed

+82
-10
lines changed

7 files changed

+82
-10
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,14 @@ For each PR made, an entry should be added to this changelog. It should contain
5151
- Changes:
5252
- Added `obj.document_type != 0` as a condition in the `get_document_type` method within the `CuratedURLAPISerializer`
5353

54+
- 1014-add-logs-when-importing-urls-so-we-know-how-many-were-expected-how-many-succeeded-and-how-many-failed
55+
- Description: When URLs of a given collection are imported into COSMOS, a Slack notification is sent. This notification includes the name of the collection imported,count of the existing curated URLs, total URLs count as per the server, URLs successfully imported from the server, delta URLs identified and delta URLs marked for deletion.
56+
- Changes:
57+
- The get_full_texts() function in sde_collections/sinequa_api.py is updated to yeild total_count along with rows.
58+
- fetch_and_replace_full_text() function in sde_collections/tasks.py captures the total_server_count and triggers send_detailed_import_notification().
59+
- Added a function send_detailed_import_notification() in sde_collections/utils/slack_utils.py to structure the notification to be sent.
60+
- Updated the associated tests effected due to inclusion of this functionality.
61+
5462
- 3228-bugfix-preserve-scroll-position--document-type-selection-behavior-on-individual-urls
5563
- Description: Upon selecting a document type on any individual URL, the page refreshes and returns to the top. This is not necessarily a bug but an inconvenience, especially when working at the bottom of the page. Fix the JS code.
5664
- Changes:

sde_collections/sinequa_api.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ def get_full_texts(
257257
if total_count is None:
258258
total_count = response.get("TotalRowCount", 0)
259259

260-
yield self._process_rows_to_records(rows)
260+
yield (self._process_rows_to_records(rows))
261261

262262
current_offset += len(rows)
263263

@@ -275,6 +275,35 @@ def get_full_texts(
275275
print(f"Reducing batch size to {current_batch_size} and retrying...")
276276
continue
277277

278+
def get_total_count(self, collection_config_folder: str, source: str = None) -> int:
279+
"""
280+
Retrieves the total count of records for a given collection using Sinequa's TotalRowCount metadata.
281+
282+
Args:
283+
collection_config_folder (str): The collection folder to query (e.g., "EARTHDATA", "CASEI").
284+
source (str, optional): The source to query. If None, defaults to "scrapers" for dev servers
285+
or "SDE" for other servers.
286+
287+
Returns:
288+
int: The total number of records in the collection.
289+
"""
290+
if not source:
291+
source = self._get_source_name()
292+
293+
if (index := self.config.get("index")) is None:
294+
raise ValueError(
295+
f"Configuration error: Index not defined for server '{self.server_name}'. "
296+
"Please update server configuration with the required index."
297+
)
298+
299+
# Minimal query to get only metadata, no data retrieval
300+
sql = f"SELECT * FROM {index} WHERE collection = '/{source}/{collection_config_folder}/' SKIP 0 COUNT 0"
301+
302+
response = self._execute_sql_query(sql)
303+
304+
# Extract TotalRowCount from metadata
305+
return response.get("TotalRowCount", 0)
306+
278307
@staticmethod
279308
def _process_full_text_response(batch_data: dict):
280309
if "Rows" not in batch_data or not isinstance(batch_data["Rows"], list):

sde_collections/tasks.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,9 @@
1414
ReindexingStatusChoices,
1515
WorkflowStatusChoices,
1616
)
17+
from sde_collections.utils import slack_utils
1718

18-
from .models.delta_url import DumpUrl
19+
from .models.delta_url import CuratedUrl, DeltaUrl, DumpUrl
1920
from .sinequa_api import Api
2021
from .utils.github_helper import GitHubHandler
2122

@@ -173,8 +174,10 @@ def fetch_and_replace_full_text(collection_id, server_name):
173174
# Step 1: Delete existing DumpUrl entries
174175
deleted_count, _ = DumpUrl.objects.filter(collection=collection).delete()
175176
print(f"Deleted {deleted_count} old records.")
176-
177177
try:
178+
total_server_count = api.get_total_count(collection.config_folder)
179+
print(f"Total records on the server: {total_server_count}")
180+
178181
# Step 2: Process data in batches
179182
total_processed = 0
180183
for batch in api.get_full_texts(collection.config_folder):
@@ -193,9 +196,15 @@ def fetch_and_replace_full_text(collection_id, server_name):
193196
total_processed += len(batch)
194197
print(f"Processed batch of {len(batch)} records. Total: {total_processed}")
195198

199+
dump_count = DumpUrl.objects.filter(collection=collection).count()
200+
196201
# Step 3: Migrate dump URLs to delta URLs
197202
collection.migrate_dump_to_delta()
198203

204+
curated_count = CuratedUrl.objects.filter(collection=collection).count()
205+
delta_count = DeltaUrl.objects.filter(collection=collection).count()
206+
marked_for_deletion_count = DeltaUrl.objects.filter(collection=collection, to_delete=True).count()
207+
199208
# Step 4: Update statuses if needed
200209
collection.refresh_from_db()
201210

@@ -215,6 +224,9 @@ def fetch_and_replace_full_text(collection_id, server_name):
215224
collection.reindexing_status = ReindexingStatusChoices.REINDEXING_READY_FOR_CURATION
216225
collection.save()
217226

227+
slack_utils.send_detailed_import_notification(
228+
collection.name, total_server_count, curated_count, dump_count, delta_count, marked_for_deletion_count
229+
)
218230
return f"Successfully processed {total_processed} records and updated the database."
219231

220232
except Exception as e:

sde_collections/tests/test_import_fulltexts.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,11 @@ def test_fetch_and_replace_full_text(disconnect_signals):
3030
]
3131

3232
def mock_generator():
33-
yield mock_batch
33+
yield (mock_batch)
3434

35-
with patch("sde_collections.sinequa_api.Api.get_full_texts") as mock_get_full_texts:
35+
with patch("sde_collections.sinequa_api.Api.get_full_texts") as mock_get_full_texts, patch(
36+
"sde_collections.sinequa_api.Api.get_total_count", return_value=2
37+
), patch("sde_collections.utils.slack_utils.send_detailed_import_notification"):
3638
mock_get_full_texts.return_value = mock_generator()
3739

3840
fetch_and_replace_full_text(collection.id, "lrm_dev")
@@ -59,9 +61,11 @@ def mock_batch_generator():
5961
total_records = 20000
6062

6163
for start in range(0, total_records, batch_size):
62-
yield create_batch(start, min(batch_size, total_records - start))
64+
yield (create_batch(start, min(batch_size, total_records - start)))
6365

64-
with patch("sde_collections.sinequa_api.Api.get_full_texts") as mock_get_full_texts:
66+
with patch("sde_collections.sinequa_api.Api.get_full_texts") as mock_get_full_texts, patch(
67+
"sde_collections.sinequa_api.Api.get_total_count", return_value=20000
68+
), patch("sde_collections.utils.slack_utils.send_detailed_import_notification"):
6569
mock_get_full_texts.return_value = mock_batch_generator()
6670

6771
# Execute the task

sde_collections/tests/test_sinequa_api.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,6 @@ def test_get_full_texts_pagination(self, mock_execute_sql, api_instance):
170170

171171
# Collect all batches from the iterator
172172
batches = list(api_instance.get_full_texts("test_folder"))
173-
174173
assert len(batches) == 2 # Should have two batches
175174
assert len(batches[0]) == 2 # First batch has 2 records
176175
assert len(batches[1]) == 1 # Second batch has 1 record

sde_collections/tests/test_workflow_status_triggers.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,9 +108,10 @@ def setUp(self):
108108
{"url": "http://example.com/2", "title": "Title 2", "full_text": "Content 2"},
109109
]
110110

111+
@patch("sde_collections.utils.slack_utils.send_detailed_import_notification")
111112
@patch("sde_collections.tasks.Api")
112113
@patch("sde_collections.models.collection.GitHubHandler")
113-
def test_full_text_import_workflow(self, MockGitHub, MockApi):
114+
def test_full_text_import_workflow(self, MockGitHub, MockApi, MockSlackNotification):
114115
"""Test the full process of importing full text data"""
115116
# Setup mock GitHub handler with proper XML content
116117
mock_github = Mock()
@@ -412,7 +413,7 @@ def test_full_text_import_workflow(self, MockGitHub, MockApi):
412413

413414
# Setup mock API
414415
mock_api = Mock()
415-
mock_api.get_full_texts.return_value = [self.api_response]
416+
mock_api.get_full_texts.return_value = iter([self.api_response])
416417
MockApi.return_value = mock_api
417418

418419
# Setup initial workflow state

sde_collections/utils/slack_utils.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,25 @@ def format_slack_message(name, details, collection_id):
5959
return message_template.format(name=linked_name)
6060

6161

62+
def send_detailed_import_notification(
63+
collection_name, total_server_count, curated_count, dump_count, delta_count, marked_for_deletion_count
64+
):
65+
message = (
66+
f"'{collection_name}' brought into COSMOS.\n"
67+
f"Prior Curated: {curated_count}\n"
68+
f"Server Count: {total_server_count}\n"
69+
f"URLs Imported: {dump_count}\n"
70+
f"New Deltas: {delta_count}\n"
71+
f"Marked For Deletion: {marked_for_deletion_count}\n"
72+
)
73+
74+
webhook_url = settings.SLACK_WEBHOOK_URL
75+
payload = {"text": message}
76+
response = requests.post(webhook_url, json=payload)
77+
if response.status_code != 200:
78+
print(f"Error sending Slack message: {response.text}")
79+
80+
6281
def send_slack_message(message):
6382
webhook_url = settings.SLACK_WEBHOOK_URL
6483
payload = {"text": message}

0 commit comments

Comments
 (0)