1
1
import json
2
+ from collections .abc import Iterator
2
3
from typing import Any
3
4
4
5
import requests
@@ -61,15 +62,14 @@ class Api:
61
62
def __init__ (self , server_name : str = None , user : str = None , password : str = None , token : str = None ) -> None :
62
63
self .server_name = server_name
63
64
if server_name not in server_configs :
64
- raise ValueError (f"Server name '{ server_name } ' is not in server_configs " )
65
+ raise ValueError (f"Invalid server configuration: '{ server_name } ' is not a recognized server name " )
65
66
66
67
self .config = server_configs [server_name ]
67
68
self .app_name : str = self .config ["app_name" ]
68
69
self .query_name : str = self .config ["query_name" ]
69
70
self .base_url : str = self .config ["base_url" ]
70
71
self .dev_servers = ["xli" , "lrm_dev" , "lrm_qa" ]
71
72
72
- # Store provided values only
73
73
self ._provided_user = user
74
74
self ._provided_password = password
75
75
self ._provided_token = token
@@ -113,7 +113,8 @@ def query(self, page: int, collection_config_folder: str | None = None, source:
113
113
password = self ._get_password ()
114
114
if not user or not password :
115
115
raise ValueError (
116
- "User and password are required for the query endpoint on the following servers: {self.dev_servers}"
116
+ f"Authentication error: Missing credentials for dev server '{ self .server_name } '. "
117
+ f"Both username and password are required for servers: { ', ' .join (self .dev_servers )} "
117
118
)
118
119
authentication = f"?Password={ password } &User={ user } "
119
120
url = f"{ url } { authentication } "
@@ -135,11 +136,22 @@ def query(self, page: int, collection_config_folder: str | None = None, source:
135
136
136
137
return self .process_response (url , payload )
137
138
138
- def sql_query (self , sql : str ) -> Any :
139
- """Executes an SQL query on the configured server using token-based authentication."""
139
+ def _execute_sql_query (self , sql : str ) -> dict :
140
+ """
141
+ Executes a SQL query against the Sinequa API.
142
+
143
+ Args:
144
+ sql (str): The SQL query to execute
145
+
146
+ Returns:
147
+ dict: The JSON response from the API containing 'Rows' and 'TotalRowCount'
148
+
149
+ Raises:
150
+ ValueError: If no token is available for authentication
151
+ """
140
152
token = self ._get_token ()
141
153
if not token :
142
- raise ValueError ("A token is required to use the SQL endpoint" )
154
+ raise ValueError ("Authentication error: Token is required for SQL endpoint access " )
143
155
144
156
url = f"{ self .base_url } /api/v1/engine.sql"
145
157
headers = {"Content-Type" : "application/json" , "Authorization" : f"Bearer { token } " }
@@ -153,42 +165,120 @@ def sql_query(self, sql: str) -> Any:
153
165
154
166
return self .process_response (url , headers = headers , raw_data = raw_payload )
155
167
156
- def get_full_texts (self , collection_config_folder : str , source : str = None ) -> Any :
168
+ def _process_rows_to_records (self , rows : list ) -> list [ dict ] :
157
169
"""
158
- Retrieves the full texts, URLs, and titles for a specified collection.
170
+ Converts raw SQL row data into structured record dictionaries.
171
+
172
+ Args:
173
+ rows (list): List of rows, where each row is [url, full_text, title]
159
174
160
175
Returns:
161
- dict: A JSON response containing the results of the SQL query,
162
- where each item has 'url', 'text', and 'title'.
163
-
164
- Example:
165
- Calling get_full_texts("example_collection") might return:
166
- [
167
- {
168
- 'url': 'http://example.com/article1',
169
- 'text': 'Here is the full text of the first article...',
170
- 'title': 'Article One Title'
171
- },
172
- {
173
- 'url': 'http://example.com/article2',
174
- 'text': 'Here is the full text of the second article...',
175
- 'title': 'Article Two Title'
176
- }
177
- ]
176
+ list[dict]: List of processed records with url, full_text, and title keys
177
+
178
+ Raises:
179
+ ValueError: If any row doesn't contain exactly 3 elements
180
+ """
181
+ processed_records = []
182
+ for idx , row in enumerate (rows ):
183
+ if len (row ) != 3 :
184
+ raise ValueError (
185
+ f"Invalid row format at index { idx } : Expected exactly three elements (url, full_text, title). "
186
+ f"Received { len (row )} elements."
187
+ )
188
+ processed_records .append ({"url" : row [0 ], "full_text" : row [1 ], "title" : row [2 ]})
189
+ return processed_records
190
+
191
+ def get_full_texts (self , collection_config_folder : str , source : str = None ) -> Iterator [dict ]:
192
+ """
193
+ Retrieves and yields batches of text records from the SQL database for a given collection.
194
+ Uses pagination to handle large datasets efficiently.
195
+
196
+ Args:
197
+ collection_config_folder (str): The collection folder to query (e.g., "EARTHDATA", "SMD")
198
+ source (str, optional): The source to query. If None, defaults to "scrapers" for dev servers
199
+ or "SDE" for other servers.
200
+
201
+ Yields:
202
+ list[dict]: Batches of records, where each record is a dictionary containing:
203
+ {
204
+ "url": str, # The URL of the document
205
+ "full_text": str, # The full text content of the document
206
+ "title": str # The title of the document
207
+ }
208
+
209
+ Raises:
210
+ ValueError: If the server's index is not defined in its configuration
211
+
212
+ Example batch:
213
+ [
214
+ {
215
+ "url": "https://example.nasa.gov/doc1",
216
+ "full_text": "This is the content of doc1...",
217
+ "title": "Document 1 Title"
218
+ },
219
+ {
220
+ "url": "https://example.nasa.gov/doc2",
221
+ "full_text": "This is the content of doc2...",
222
+ "title": "Document 2 Title"
223
+ }
224
+ ]
225
+
226
+ Note:
227
+ - Results are paginated in batches of 5000 records
228
+ - Each batch is processed into clean dictionaries before being yielded
229
+ - The iterator will stop when either:
230
+ 1. No more rows are returned from the query
231
+ 2. The total count of records has been reached
178
232
"""
179
233
180
234
if not source :
181
235
source = self ._get_source_name ()
182
236
183
237
if (index := self .config .get ("index" )) is None :
184
- raise ValueError ("Index not defined for this server" )
238
+ raise ValueError (
239
+ f"Configuration error: Index not defined for server '{ self .server_name } '. "
240
+ "Please update server configuration with the required index."
241
+ )
185
242
186
243
sql = f"SELECT url1, text, title FROM { index } WHERE collection = '/{ source } /{ collection_config_folder } /'"
187
- full_text_response = self .sql_query (sql )
188
- return self ._process_full_text_response (full_text_response )
244
+
245
+ page = 0
246
+ page_size = 5000
247
+ total_processed = 0
248
+
249
+ while True :
250
+ paginated_sql = f"{ sql } SKIP { total_processed } COUNT { page_size } "
251
+ response = self ._execute_sql_query (paginated_sql )
252
+
253
+ rows = response .get ("Rows" , [])
254
+ if not rows : # Stop if we get an empty batch
255
+ break
256
+
257
+ yield self ._process_rows_to_records (rows )
258
+
259
+ total_processed += len (rows )
260
+ total_count = response .get ("TotalRowCount" , 0 )
261
+
262
+ if total_processed >= total_count : # Stop if we've processed all records
263
+ break
264
+
265
+ page += 1
189
266
190
267
@staticmethod
191
- def _process_full_text_response (full_text_response : str ):
192
- return [
193
- {"url" : url , "full_text" : full_text , "title" : title } for url , full_text , title in full_text_response ["Rows" ]
194
- ]
268
+ def _process_full_text_response (batch_data : dict ):
269
+ if "Rows" not in batch_data or not isinstance (batch_data ["Rows" ], list ):
270
+ raise ValueError (
271
+ "Invalid response format: Expected 'Rows' key with list data in Sinequa server response. "
272
+ f"Received: { type (batch_data .get ('Rows' , None ))} "
273
+ )
274
+
275
+ processed_data = []
276
+ for idx , row in enumerate (batch_data ["Rows" ]):
277
+ if len (row ) != 3 :
278
+ raise ValueError (
279
+ f"Invalid row format at index { idx } : Expected exactly three elements (url, full_text, title). "
280
+ f"Received { len (row )} elements."
281
+ )
282
+ url , full_text , title = row
283
+ processed_data .append ({"url" : url , "full_text" : full_text , "title" : title })
284
+ return processed_data
0 commit comments