Commit b91405d

refactor sinequa_api wrapper, test suites, and full_text import
1 parent d7620ad commit b91405d

4 files changed: +269 additions, -229 deletions

sde_collections/sinequa_api.py

Lines changed: 106 additions & 73 deletions

@@ -1,12 +1,10 @@
 import json
+from collections.abc import Iterator
 from typing import Any
 
 import requests
 import urllib3
 from django.conf import settings
-from django.db import transaction
-
-from .models.delta_url import DumpUrl
 
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
@@ -138,85 +136,99 @@ def query(self, page: int, collection_config_folder: str | None = None, source:
 
         return self.process_response(url, payload)
 
-    def sql_query(self, sql: str, collection) -> Any:
+    def _execute_sql_query(self, sql: str) -> dict:
+        """
+        Executes a SQL query against the Sinequa API.
+
+        Args:
+            sql (str): The SQL query to execute
+
+        Returns:
+            dict: The JSON response from the API containing 'Rows' and 'TotalRowCount'
+
+        Raises:
+            ValueError: If no token is available for authentication
+        """
         token = self._get_token()
         if not token:
             raise ValueError("Authentication error: Token is required for SQL endpoint access")
 
-        page = 0
-        page_size = 5000  # Number of records per page
-        skip_records = 0
+        url = f"{self.base_url}/api/v1/engine.sql"
+        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"}
+        raw_payload = json.dumps(
+            {
+                "method": "engine.sql",
+                "sql": sql,
+                "pretty": True,
+            }
+        )
 
-        while True:
-            paginated_sql = f"{sql} SKIP {skip_records} COUNT {page_size}"
-            url = f"{self.base_url}/api/v1/engine.sql"
-            headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"}
-            raw_payload = json.dumps(
-                {
-                    "method": "engine.sql",
-                    "sql": paginated_sql,
-                    "pretty": True,
-                }
-            )
+        return self.process_response(url, headers=headers, raw_data=raw_payload)
 
-            response = self.process_response(url, headers=headers, raw_data=raw_payload)
-            batch_data = response.get("Rows", [])
-            total_row_count = response.get("TotalRowCount", 0)
-            processed_response = self._process_full_text_response(response)
-            self.process_and_update_data(processed_response, collection)
-            print(f"Batch {page + 1} has been processed and updated")
+    def _process_rows_to_records(self, rows: list) -> list[dict]:
+        """
+        Converts raw SQL row data into structured record dictionaries.
 
-            # Check if all rows have been fetched
-            if len(batch_data) == 0 or (skip_records + page_size) >= total_row_count:
-                break
+        Args:
+            rows (list): List of rows, where each row is [url, full_text, title]
 
-            page += 1
-            skip_records += page_size
-
-        return f"All {total_row_count} records have been processed and updated."
-
-    def process_and_update_data(self, batch_data, collection):
-        for record in batch_data:
-            try:
-                with transaction.atomic():
-                    url = record["url"]
-                    scraped_text = record.get("full_text", "")
-                    scraped_title = record.get("title", "")
-                    DumpUrl.objects.update_or_create(
-                        url=url,
-                        defaults={
-                            "scraped_text": scraped_text,
-                            "scraped_title": scraped_title,
-                            "collection": collection,
-                        },
-                    )
-            except KeyError as e:
-                print(f"Missing key in data: {str(e)}")
-            except Exception as e:
-                print(f"Error processing record: {str(e)}")
-
-    def get_full_texts(self, collection_config_folder: str, source: str = None, collection=None) -> Any:
+        Returns:
+            list[dict]: List of processed records with url, full_text, and title keys
+
+        Raises:
+            ValueError: If any row doesn't contain exactly 3 elements
+        """
+        processed_records = []
+        for idx, row in enumerate(rows):
+            if len(row) != 3:
+                raise ValueError(
+                    f"Invalid row format at index {idx}: Expected exactly three elements (url, full_text, title). "
+                    f"Received {len(row)} elements."
+                )
+            processed_records.append({"url": row[0], "full_text": row[1], "title": row[2]})
+        return processed_records
+
+    def get_full_texts(self, collection_config_folder: str, source: str = None) -> Iterator[dict]:
         """
-        Retrieves the full texts, URLs, and titles for a specified collection.
+        Retrieves and yields batches of text records from the SQL database for a given collection.
+        Uses pagination to handle large datasets efficiently.
 
-        Returns:
-            dict: A JSON response containing the results of the SQL query,
-                  where each item has 'url', 'text', and 'title'.
-
-        Example:
-            Calling get_full_texts("example_collection") might return:
-            [
-                {
-                    'url': 'http://example.com/article1',
-                    'text': 'Here is the full text of the first article...',
-                    'title': 'Article One Title'
-                },
-                {
-                    'url': 'http://example.com/article2',
-                    'text': 'Here is the full text of the second article...',
-                    'title': 'Article Two Title'
-                }
-            ]
+        Args:
+            collection_config_folder (str): The collection folder to query (e.g., "EARTHDATA", "SMD")
+            source (str, optional): The source to query. If None, defaults to "scrapers" for dev servers
+                or "SDE" for other servers.
+
+        Yields:
+            list[dict]: Batches of records, where each record is a dictionary containing:
+                {
+                    "url": str,        # The URL of the document
+                    "full_text": str,  # The full text content of the document
+                    "title": str       # The title of the document
+                }
+
+        Raises:
+            ValueError: If the server's index is not defined in its configuration
+
+        Example batch:
+            [
+                {
+                    "url": "https://example.nasa.gov/doc1",
+                    "full_text": "This is the content of doc1...",
+                    "title": "Document 1 Title"
+                },
+                {
+                    "url": "https://example.nasa.gov/doc2",
+                    "full_text": "This is the content of doc2...",
+                    "title": "Document 2 Title"
+                }
+            ]
+
+        Note:
+            - Results are paginated in batches of 5000 records
+            - Each batch is processed into clean dictionaries before being yielded
+            - The iterator will stop when either:
+                1. No more rows are returned from the query
+                2. The total count of records has been reached
         """
 
        if not source:
@@ -229,7 +241,28 @@ def get_full_texts(self, collection_config_folder: str, source: str = None, coll
             )
 
         sql = f"SELECT url1, text, title FROM {index} WHERE collection = '/{source}/{collection_config_folder}/'"
-        return self.sql_query(sql, collection)
+
+        page = 0
+        page_size = 5000
+        total_processed = 0
+
+        while True:
+            paginated_sql = f"{sql} SKIP {total_processed} COUNT {page_size}"
+            response = self._execute_sql_query(paginated_sql)
+
+            rows = response.get("Rows", [])
+            if not rows:  # Stop if we get an empty batch
+                break
+
+            yield self._process_rows_to_records(rows)
+
+            total_processed += len(rows)
+            total_count = response.get("TotalRowCount", 0)
+
+            if total_processed >= total_count:  # Stop if we've processed all records
+                break
+
+            page += 1
 
     @staticmethod
     def _process_full_text_response(batch_data: dict):
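
With this refactor, get_full_texts no longer writes to the database: it is a generator that yields one list of {url, full_text, title} dicts per 5000-row page fetched through engine.sql, with the pagination loop kept inside the wrapper. A minimal consumption sketch follows; the server name, collection folder, and printed fields are illustrative placeholders rather than values defined by this commit, while the Api class and its server_name constructor argument mirror how tasks.py uses it.

# Sketch only: consuming the refactored generator outside of the Celery task.
# "example_server" and "EARTHDATA" are placeholder arguments.
from sde_collections.sinequa_api import Api

api = Api("example_server")  # server_name, as constructed in tasks.py
for batch in api.get_full_texts("EARTHDATA"):
    # Each batch is a list of up to 5000 dicts with url/full_text/title keys.
    for record in batch:
        print(record["url"], record["title"], len(record["full_text"]))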

sde_collections/tasks.py

Lines changed: 34 additions & 16 deletions

@@ -7,6 +7,7 @@
 from django.conf import settings
 from django.core import management
 from django.core.management.commands import loaddata
+from django.db import transaction
 
 from config import celery_app
 
@@ -147,26 +148,43 @@ def resolve_title_pattern(title_pattern_id):
 @celery_app.task(soft_time_limit=600)
 def fetch_and_replace_full_text(collection_id, server_name):
     """
-    Task to initiate fetching and replacing full text and metadata for all URLs associated with a specified collection
-    from a given server.
-    Args:
-        collection_id (int): The identifier for the collection in the database.
-        server_name (str): The name of the server.
-
-    Returns:
-        str: A message indicating the result of the operation, including the number of URLs processed.
+    Task to fetch and replace full text and metadata for a collection.
+    Handles data in batches to manage memory usage.
     """
     collection = Collection.objects.get(id=collection_id)
     api = Api(server_name)
 
-    # Step 1: Delete all existing DumpUrl entries for the collection
+    # Step 1: Delete existing DumpUrl entries
     deleted_count, _ = DumpUrl.objects.filter(collection=collection).delete()
     print(f"Deleted {deleted_count} old records.")
 
-    # Step 2: Fetch and process new data
-    result_message = api.get_full_texts(collection.config_folder, collection=collection)
-
-    # Step 3: Migrate DumpUrl to DeltaUrl
-    collection.migrate_dump_to_delta()
-
-    return result_message
+    # Step 2: Process data in batches
+    total_processed = 0
+
+    try:
+        for batch in api.get_full_texts(collection.config_folder):
+            # Use bulk_create for efficiency, with a transaction per batch
+            with transaction.atomic():
+                DumpUrl.objects.bulk_create(
+                    [
+                        DumpUrl(
+                            url=record["url"],
+                            collection=collection,
+                            scraped_text=record["full_text"],
+                            scraped_title=record["title"],
+                        )
+                        for record in batch
+                    ]
+                )
+
+            total_processed += len(batch)
+            print(f"Processed batch of {len(batch)} records. Total: {total_processed}")
+
+        # Step 3: Migrate dump URLs to delta URLs
+        collection.migrate_dump_to_delta()
+
+        return f"Successfully processed {total_processed} records and updated the database."
+
+    except Exception as e:
+        print(f"Error processing records: {str(e)}")
+        raise
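
Wrapping each bulk_create in its own transaction.atomic() block bounds the work per commit: only one 5000-record batch is held in memory at a time, and a failure rolls back just the current batch while earlier batches stay committed before the exception is re-raised. As a minimal sketch (the collection id and server name below are placeholders), the task would be queued through Celery's standard delay call:

# Placeholder arguments; delay() enqueues the task on the Celery worker.
from sde_collections.tasks import fetch_and_replace_full_text

fetch_and_replace_full_text.delay(collection_id=123, server_name="dev")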
