@@ -188,15 +188,26 @@ def _process_rows_to_records(self, rows: list) -> list[dict]:
            processed_records.append({"url": row[0], "full_text": row[1], "title": row[2]})
        return processed_records

-    def get_full_texts(self, collection_config_folder: str, source: str = None) -> Iterator[dict]:
+    def get_full_texts(
+        self,
+        collection_config_folder: str,
+        source: str = None,
+        start_at: int = 0,
+        batch_size: int = 500,
+        min_batch_size: int = 1,
+    ) -> Iterator[dict]:
        """
        Retrieves and yields batches of text records from the SQL database for a given collection.
-        Uses pagination to handle large datasets efficiently.
+        Uses pagination to handle large datasets efficiently. If a query fails, it automatically
+        halves the batch size and retries until the query succeeds or the minimum batch size is reached.

        Args:
-            collection_config_folder (str): The collection folder to query (e.g., "EARTHDATA", "SMD")
+            collection_config_folder (str): The collection folder to query (e.g., "EARTHDATA", "CASEI")
            source (str, optional): The source to query. If None, defaults to "scrapers" for dev servers
                or "SDE" for other servers.
+            start_at (int, optional): Starting offset for records. Defaults to 0.
+            batch_size (int, optional): Initial number of records per batch. Defaults to 500.
+            min_batch_size (int, optional): Minimum batch size before giving up. Defaults to 1.

        Yields:
            list[dict]: Batches of records, where each record is a dictionary containing:
@@ -208,29 +219,16 @@ def get_full_texts(self, collection_config_folder: str, source: str = None) -> Iterator[dict]:

        Raises:
            ValueError: If the server's index is not defined in its configuration
-
-        Example batch:
-            [
-                {
-                    "url": "https://example.nasa.gov/doc1",
-                    "full_text": "This is the content of doc1...",
-                    "title": "Document 1 Title"
-                },
-                {
-                    "url": "https://example.nasa.gov/doc2",
-                    "full_text": "This is the content of doc2...",
-                    "title": "Document 2 Title"
-                }
-            ]
+            ValueError: If the batch size reaches the minimum without a successful query

        Note:
-            - Results are paginated in batches of 5000 records
+            - Results are paginated with adaptive batch sizing
            - Each batch is processed into clean dictionaries before being yielded
            - The iterator will stop when either:
                1. No more rows are returned from the query
                2. The total count of records has been reached
+            - Batch size is halved when a query fails, and subsequent queries keep the reduced size
        """
-
        if not source:
            source = self._get_source_name()
@@ -240,29 +238,42 @@ def get_full_texts(self, collection_config_folder: str, source: str = None) -> Iterator[dict]:
                "Please update server configuration with the required index."
            )

- sql = f"SELECT url1, text, title FROM { index } WHERE collection = '/{ source } /{ collection_config_folder } /'"
241
+ base_sql = f"SELECT url1, text, title FROM { index } WHERE collection = '/{ source } /{ collection_config_folder } /'"
244
242
245
- page = 0
246
- page_size = 5000
247
- total_processed = 0
243
+ current_offset = start_at
244
+ current_batch_size = batch_size
245
+ total_count = None
248
246
249
247
        while True:
-            paginated_sql = f"{sql} SKIP {total_processed} COUNT {page_size}"
-            response = self._execute_sql_query(paginated_sql)
+            sql = f"{base_sql} SKIP {current_offset} COUNT {current_batch_size}"
+
+            try:
+                response = self._execute_sql_query(sql)
+                rows = response.get("Rows", [])
+
+                if not rows:  # Stop if we get an empty batch
+                    break
+
+                if total_count is None:
+                    total_count = response.get("TotalRowCount", 0)

-            rows = response.get("Rows", [])
-            if not rows:  # Stop if we get an empty batch
-                break
+                yield self._process_rows_to_records(rows)

-            yield self._process_rows_to_records(rows)
+                current_offset += len(rows)

-            total_processed += len(rows)
-            total_count = response.get("TotalRowCount", 0)
+                if total_count and current_offset >= total_count:  # Stop if we've processed all records
+                    break

-            if total_processed >= total_count:  # Stop if we've processed all records
-                break
+            except (requests.RequestException, ValueError) as e:
+                if current_batch_size <= min_batch_size:
+                    raise ValueError(
+                        f"Failed to process batch even at minimum size {min_batch_size}. " f"Last error: {str(e)}"
+                    )

-            page += 1
+                # Halve the batch size and retry
+                current_batch_size = max(current_batch_size // 2, min_batch_size)
+                print(f"Reducing batch size to {current_batch_size} and retrying...")
+                continue


    @staticmethod
    def _process_full_text_response(batch_data: dict):
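For illustration, a minimal usage sketch of the new signature (not part of the diff; `client` stands for a hypothetical instance of the class that defines get_full_texts, and the collection name is only an example):

    # Hypothetical caller: stream every record in a collection, letting the
    # generator shrink the batch size automatically when a query fails.
    records = []
    for batch in client.get_full_texts("EARTHDATA", batch_size=500, min_batch_size=10):
        records.extend(batch)  # each batch is a list of {"url", "full_text", "title"} dicts
    print(f"Fetched {len(records)} records")

Because a failure only halves current_batch_size and the loop retries the same offset, the caller sees one uninterrupted stream of batches; a ValueError surfaces only if queries still fail at min_batch_size.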