Skip to content

Commit 03c7a8f

Browse files
committed
change to a cleaner output for api.get_full_texts
1 parent 9139328 commit 03c7a8f

File tree

2 files changed

+21
-19
lines changed

2 files changed

+21
-19
lines changed

sde_collections/sinequa_api.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -162,26 +162,23 @@ def get_full_texts(self, collection_config_folder: str, source: str = None) -> A
162162
Retrieves the full texts, URLs, and titles for a specified collection.
163163
164164
Returns:
165-
dict: A JSON response containing the results of the SQL query in an expected format under the 'Rows' key,
166-
where each item has 'url1', 'text', and 'title' .
165+
list[dict]: The rows returned by the SQL query, converted to a list of dictionaries,
166+
where each item has 'url', 'full_text', and 'title'.
167167
168168
Example:
169169
Calling get_full_texts("example_collection") might return:
170-
{
171-
'Rows': [
170+
[
172171
{
173-
'url1': 'http://example.com/article1',
172+
'url': 'http://example.com/article1',
174173
'full_text': 'Here is the full text of the first article...',
175174
'title': 'Article One Title'
176175
},
177176
{
178-
'url1': 'http://example.com/article2',
177+
'url': 'http://example.com/article2',
179178
'full_text': 'Here is the full text of the second article...',
180179
'title': 'Article Two Title'
181180
}
182181
]
183-
}
184-
185182
"""
186183

187184
if not source:
@@ -191,4 +188,11 @@ def get_full_texts(self, collection_config_folder: str, source: str = None) -> A
191188
raise ValueError("Index not defined for this server")
192189

193190
sql = f"SELECT url1, text, title FROM {index} WHERE collection = '/{source}/{collection_config_folder}/'"
194-
return self.sql_query(sql)
191+
full_text_response = self.sql_query(sql)
192+
return self._process_full_text_response(full_text_response)
193+
194+
@staticmethod
195+
def _process_full_text_response(full_text_response: str):
196+
return [
197+
{"url": url, "full_text": full_text, "title": title} for url, full_text, title in full_text_response["Rows"]
198+
]

sde_collections/tasks.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -160,18 +160,16 @@ def fetch_and_update_full_text(collection_id, server_name):
160160
"""
161161
collection = Collection.objects.get(id=collection_id)
162162
api = Api(server_name)
163-
full_texts = api.get_full_texts(collection.config_folder)
163+
documents = api.get_full_texts(collection.config_folder)
164164

165-
records = full_texts.get("Rows", [])
166-
if not records:
167-
return "No records found in the response."
168-
169-
for record in records:
170-
url, full_text, title = record
171-
if not (url and full_text and title):
165+
for doc in documents:
166+
# Skip the document if any of url, full_text, or title is missing or empty.
167+
if not (doc["url"] and doc["full_text"] and doc["title"]):
172168
continue
173169

174170
CandidateURL.objects.update_or_create(
175-
url=url, collection=collection, defaults={"scraped_text": full_text, "scraped_title": title}
171+
url=doc["url"],
172+
collection=collection,
173+
defaults={"scraped_text": doc["full_text"], "scraped_title": doc["title"]},
176174
)
177-
return f"Successfully processed {len(records)} records and updated the database."
175+
return f"Successfully processed {len(documents)} records and updated the database."

0 commit comments

Comments
 (0)