Skip to content

Commit 52ab3a5

Browse files
authored
Merge pull request #1077 from NASA-IMPACT/1071-retrieve-full-texts-from-sinequa-dev-servers
Retrieve Full-Texts from Sinequa Dev Servers
2 parents 88eeb0d + e74dfdc commit 52ab3a5

File tree

8 files changed

+244
-42
lines changed

8 files changed

+244
-42
lines changed

.envs/.local/.django

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,17 @@ SINEQUA_CONFIGS_REPO_WEBAPP_PR_BRANCH='dummy_branch'
3333
# Slack Webhook
3434
# ------------------------------------------------------------------------------
3535
SLACK_WEBHOOK_URL=''
36-
LRM_USER=''
37-
LRM_PASSWORD=''
36+
37+
#Server Credentials
38+
#--------------------------------------------------------------------------------
39+
LRM_DEV_USER=''
40+
LRM_DEV_PASSWORD=''
3841
XLI_USER=''
3942
XLI_PASSWORD=''
4043
LRM_QA_USER=''
4144
LRM_QA_PASSWORD=''
45+
46+
#Server Tokens
47+
#--------------------------------------------------------------------------------
48+
LRM_DEV_TOKEN=''
49+
XLI_TOKEN=''

config/settings/base.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,9 @@
343343
SLACK_WEBHOOK_URL = env("SLACK_WEBHOOK_URL")
344344
XLI_USER = env("XLI_USER")
345345
XLI_PASSWORD = env("XLI_PASSWORD")
346-
LRM_USER = env("LRM_USER")
347-
LRM_PASSWORD = env("LRM_PASSWORD")
346+
LRM_DEV_USER = env("LRM_DEV_USER")
347+
LRM_DEV_PASSWORD = env("LRM_DEV_PASSWORD")
348348
LRM_QA_USER = env("LRM_QA_USER")
349349
LRM_QA_PASSWORD = env("LRM_QA_PASSWORD")
350+
LRM_DEV_TOKEN = env("LRM_DEV_TOKEN")
351+
XLI_TOKEN = env("XLI_TOKEN")

sde_collections/admin.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,23 @@
66
from .models.candidate_url import CandidateURL, ResolvedTitle
77
from .models.collection import Collection, WorkflowHistory
88
from .models.pattern import DivisionPattern, IncludePattern, TitlePattern
9-
from .tasks import import_candidate_urls_from_api
9+
from .tasks import fetch_and_update_full_text, import_candidate_urls_from_api
10+
11+
12+
def fetch_and_update_text_for_server(modeladmin, request, queryset, server_name):
    """Queue a full-text fetch for each selected collection and notify the admin user.

    For every collection in ``queryset`` the ``fetch_and_update_full_text``
    task is dispatched with ``.delay`` using the collection id and
    ``server_name``. A confirmation naming the upper-cased server is then
    shown through ``modeladmin.message_user``.
    """
    # NOTE(review): indentation is not visible in this view; the confirmation
    # is assumed to be sent once, after every task has been queued -- confirm.
    for selected_collection in queryset:
        fetch_and_update_full_text.delay(selected_collection.id, server_name)

    confirmation = f"Started importing URLs from {server_name.upper()} Server"
    modeladmin.message_user(request, confirmation)
16+
17+
18+
@admin.action(description="Import candidate URLs from LRM Dev Server with Full Text")
def fetch_full_text_lrm_dev_action(modeladmin, request, queryset):
    """Admin action: start the full-text fetch for the selection against the "lrm_dev" server."""
    fetch_and_update_text_for_server(modeladmin, request, queryset, server_name="lrm_dev")
21+
22+
23+
@admin.action(description="Import candidate URLs from XLI Server with Full Text")
def fetch_full_text_lis_action(modeladmin, request, queryset):
    """Admin action: start the full-text fetch for the selection against the "xli" server.

    The function name still says "lis", but the server key passed on is "xli".
    """
    fetch_and_update_text_for_server(modeladmin, request, queryset, server_name="xli")
1026

1127

1228
@admin.action(description="Generate deployment message")
@@ -109,7 +125,7 @@ def import_candidate_urls_from_api_caller(modeladmin, request, queryset, server_
109125
messages.add_message(
110126
request,
111127
messages.INFO,
112-
f"Started importing URLs from the API for: {collection_names} from {server_name.title()}",
128+
f"Started importing URLs from the API for: {collection_names} from {server_name.upper()} Server",
113129
)
114130

115131

@@ -133,19 +149,19 @@ def import_candidate_urls_secret_production(modeladmin, request, queryset):
133149
import_candidate_urls_from_api_caller(modeladmin, request, queryset, "secret_production")
134150

135151

136-
@admin.action(description="Import candidate URLs from XLI Server")
def import_candidate_urls_xli_server(modeladmin, request, queryset):
    """Admin action: import candidate URLs for the selection using the "xli" server key."""
    target_server = "xli"
    import_candidate_urls_from_api_caller(modeladmin, request, queryset, target_server)
139155

140156

141157
@admin.action(description="Import candidate URLs from LRM Dev Server")
def import_candidate_urls_lrm_dev_server(modeladmin, request, queryset):
    """Admin action: import candidate URLs for the selection using the "lrm_dev" server key."""
    target_server = "lrm_dev"
    import_candidate_urls_from_api_caller(modeladmin, request, queryset, target_server)
144160

145161

146162
@admin.action(description="Import candidate URLs from LRM QA Server")
def import_candidate_urls_lrm_qa_server(modeladmin, request, queryset):
    """Admin action: import candidate URLs for the selection using the "lrm_qa" server key."""
    target_server = "lrm_qa"
    import_candidate_urls_from_api_caller(modeladmin, request, queryset, target_server)
149165

150166

151167
class ExportCsvMixin:
@@ -236,9 +252,11 @@ class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin):
236252
import_candidate_urls_production,
237253
import_candidate_urls_secret_test,
238254
import_candidate_urls_secret_production,
239-
import_candidate_urls_lis_server,
255+
import_candidate_urls_xli_server,
240256
import_candidate_urls_lrm_dev_server,
241257
import_candidate_urls_lrm_qa_server,
258+
fetch_full_text_lrm_dev_action,
259+
fetch_full_text_lis_action,
242260
]
243261
ordering = ("cleaning_order",)
244262

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Generated by Django 4.2.9 on 2024-10-21 23:10
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Add the ``scraped_text`` text field (blank and null allowed) to ``candidateurl``."""

    # Must run after migration 0058 of the same app.
    dependencies = [
        ("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"),
    ]

    operations = [
        migrations.AddField(
            model_name="candidateurl",
            name="scraped_text",
            field=models.TextField(null=True, blank=True),
        ),
    ]
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Generated by Django 4.2.9 on 2024-11-07 17:34
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Alter ``candidateurl.scraped_text``: add an empty-string default, help text and verbose name."""

    # Must run after the migration that introduced ``scraped_text``.
    dependencies = [
        ("sde_collections", "0059_candidateurl_scraped_text"),
    ]

    operations = [
        migrations.AlterField(
            model_name="candidateurl",
            name="scraped_text",
            field=models.TextField(
                verbose_name="Scraped Text",
                help_text="This is the text scraped by Sinequa",
                default="",
                null=True,
                blank=True,
            ),
        ),
    ]

sde_collections/models/candidate_url.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ class CandidateURL(models.Model):
3535
blank=True,
3636
help_text="This is the original title scraped by Sinequa",
3737
)
38+
scraped_text = models.TextField(
39+
"Scraped Text",
40+
default="",
41+
null=True,
42+
blank=True,
43+
help_text="This is the text scraped by Sinequa",
44+
)
3845
generated_title = models.CharField(
3946
"Generated Title",
4047
default="",

sde_collections/sinequa_api.py

Lines changed: 123 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
from typing import Any
23

34
import requests
@@ -16,33 +17,39 @@
1617
"app_name": "nasa-sba-smd",
1718
"query_name": "query-smd-primary",
1819
"base_url": "https://sciencediscoveryengine.test.nasa.gov",
20+
"index": "sde_index",
1921
},
2022
"production": {
2123
"app_name": "nasa-sba-smd",
2224
"query_name": "query-smd-primary",
2325
"base_url": "https://sciencediscoveryengine.nasa.gov",
26+
"index": "sde_index",
2427
},
2528
"secret_test": {
2629
"app_name": "nasa-sba-sde",
2730
"query_name": "query-sde-primary",
2831
"base_url": "https://sciencediscoveryengine.test.nasa.gov",
32+
"index": "sde_index",
2933
},
3034
"secret_production": {
3135
"app_name": "nasa-sba-sde",
3236
"query_name": "query-sde-primary",
3337
"base_url": "https://sciencediscoveryengine.nasa.gov",
38+
"index": "sde_index",
3439
},
35-
"lis_server": {
40+
"xli": {
3641
"app_name": "nasa-sba-smd",
3742
"query_name": "query-smd-primary",
3843
"base_url": "http://sde-xli.nasa-impact.net",
44+
"index": "sde_index",
3945
},
40-
"lrm_dev_server": {
46+
"lrm_dev": {
4147
"app_name": "sde-init-check",
4248
"query_name": "query-init-check",
4349
"base_url": "https://sde-lrm.nasa-impact.net",
50+
"index": "sde_init_check",
4451
},
45-
"lrm_qa_server": {
52+
"lrm_qa": {
4653
"app_name": "sde-init-check",
4754
"query_name": "query-init-check",
4855
"base_url": "https://sde-qa.nasa-impact.net",
@@ -51,37 +58,61 @@
5158

5259

5360
class Api:
54-
def __init__(
    self,
    server_name: str | None = None,
    user: str | None = None,
    password: str | None = None,
    token: str | None = None,
) -> None:
    """Bind the client to one entry of ``server_configs``.

    Args:
        server_name: Key into ``server_configs``.
        user: Optional user; when falsy, ``_get_user`` falls back to Django settings.
        password: Optional password; when falsy, ``_get_password`` falls back to Django settings.
        token: Optional token; when falsy, ``_get_token`` falls back to Django settings.

    Raises:
        ValueError: If ``server_name`` is not a key of ``server_configs``
            (this includes the default ``None``).
    """
    self.server_name = server_name
    if server_name not in server_configs:
        raise ValueError(f"Server name '{server_name}' is not in server_configs")

    self.config = server_configs[server_name]
    self.app_name: str = self.config["app_name"]
    self.query_name: str = self.config["query_name"]
    self.base_url: str = self.config["base_url"]
    # Servers whose query endpoint needs user/password and whose source is "scrapers".
    self.dev_servers = ["xli", "lrm_dev", "lrm_qa"]

    # Store provided values only; settings are looked up lazily by the _get_* helpers.
    self._provided_user = user
    self._provided_password = password
    self._provided_token = token
76+
77+
def _get_user(self) -> str | None:
78+
"""Retrieve the user, using the provided value or defaulting to Django settings."""
79+
return self._provided_user or getattr(settings, f"{self.server_name}_USER".upper(), None)
80+
81+
def _get_password(self) -> str | None:
82+
"""Retrieve the password, using the provided value or defaulting to Django settings."""
83+
return self._provided_password or getattr(settings, f"{self.server_name}_PASSWORD".upper(), None)
84+
85+
def _get_token(self) -> str | None:
86+
"""Retrieve the token, using the provided value or defaulting to Django settings."""
87+
return self._provided_token or getattr(settings, f"{self.server_name}_TOKEN".upper(), None)
88+
89+
def _get_source_name(self) -> str:
90+
"""by default, the source is /SDE/. However for the various dev servers, the source is tends to be /scrapers/"""
91+
return "scrapers" if self.server_name in self.dev_servers else "SDE"
6592

6693
def process_response(self, url: str, payload: dict[str, Any]) -> Any:
    """POST ``payload`` as JSON to ``url`` and return the decoded JSON body.

    Args:
        url: Fully built endpoint URL.
        payload: Request body, sent via the ``json=`` argument.

    Returns:
        The decoded JSON body of a 200 response.

    Raises:
        requests.HTTPError: For 4xx/5xx responses (via ``raise_for_status``)
            and for any other non-200 status, which would otherwise fall
            through and silently return ``None``.
    """
    # NOTE(review): verify=False disables TLS certificate verification and no
    # timeout is passed -- confirm both are intentional.
    response = requests.post(url, headers={}, json=payload, verify=False)

    if response.status_code == requests.codes.ok:
        return response.json()

    # Raises for 4xx/5xx; a no-op for any other status.
    response.raise_for_status()
    # The URL is deliberately left out of the message: it may carry credentials.
    raise requests.HTTPError(f"Unexpected status code {response.status_code}", response=response)
101+
def query(self, page: int, collection_config_folder: str = None, source: str = None) -> Any:
102+
url = f"{self.base_url}/api/v1/search.query"
103+
if self.server_name in self.dev_servers:
104+
user = self._get_user()
105+
password = self._get_password()
75106

76-
def query(self, page: int, collection_config_folder: str = "") -> Any:
77-
if self.server_name == "lis_server":
78-
url = f"{self.base_url}/api/v1/search.query?Password={self.xli_password}&User={self.xli_user}"
79-
elif self.server_name == "lrm_dev_server":
80-
url = f"{self.base_url}/api/v1/search.query?Password={self.lrm_password}&User={self.lrm_user}"
81-
elif self.server_name == "lrm_qa_server":
82-
url = f"{self.base_url}/api/v1/search.query?Password={self.lrm_qa_password}&User={self.lrm_qa_user}"
107+
if not user or not password:
108+
raise ValueError(
109+
"User and password are required for the query endpoint on the following servers: {self.dev_servers}"
110+
)
111+
authentication = f"?Password={password}&User={user}"
112+
url = f"{url}{authentication}"
83113
else:
84114
url = f"{self.base_url}/api/v1/search.query"
115+
85116
payload = {
86117
"app": self.app_name,
87118
"query": {
@@ -94,11 +125,73 @@ def query(self, page: int, collection_config_folder: str = "") -> Any:
94125
}
95126

96127
if collection_config_folder:
97-
if self.server_name in ["lis_server", "lrm_dev_server", "lrm_qa_server"]:
98-
payload["query"]["advanced"]["collection"] = f"/scrapers/{collection_config_folder}/"
99-
else:
100-
payload["query"]["advanced"]["collection"] = f"/SDE/{collection_config_folder}/"
128+
source = source if source else self._get_source_name()
129+
payload["query"]["advanced"]["collection"] = f"/{source}/{collection_config_folder}/"
130+
131+
return self.process_response(url, payload)
132+
133+
def sql_query(self, sql: str) -> Any:
    """Execute an SQL query on the configured server using token-based authentication.

    Args:
        sql: SQL text sent to the ``/api/v1/engine.sql`` endpoint.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: If no token was provided and none is found in settings.
        RuntimeError: If the HTTP request fails; the underlying
            ``requests`` exception is chained as the cause.
    """
    token = self._get_token()
    if not token:
        raise ValueError("A token is required to use the SQL endpoint")

    url = f"{self.base_url}/api/v1/engine.sql"
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"}
    payload = json.dumps(
        {
            "method": "engine.sql",
            "sql": sql,
            "pretty": True,
            "log": False,
            "output": "json",
            "resolveIndexList": "false",
            "engines": "default",
        }
    )

    try:
        response = requests.post(url, headers=headers, data=payload, timeout=10)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        # Chain explicitly so the original request failure stays attached as the cause.
        raise RuntimeError(f"Api request to SQL endpoint failed: {e}") from e
158+
159+
def get_full_texts(self, collection_config_folder: str, source: str = None) -> Any:
160+
"""
161+
Retrieves the full texts, URLs, and titles for a specified collection.
162+
163+
Returns:
164+
dict: A JSON response containing the results of the SQL query,
165+
where each item has 'url', 'text', and 'title'.
166+
167+
Example:
168+
Calling get_full_texts("example_collection") might return:
169+
[
170+
{
171+
'url': 'http://example.com/article1',
172+
'text': 'Here is the full text of the first article...',
173+
'title': 'Article One Title'
174+
},
175+
{
176+
'url': 'http://example.com/article2',
177+
'text': 'Here is the full text of the second article...',
178+
'title': 'Article Two Title'
179+
}
180+
]
181+
"""
182+
183+
if not source:
184+
source = self._get_source_name()
185+
186+
if (index := self.config.get("index")) is None:
187+
raise ValueError("Index not defined for this server")
101188

102-
response = self.process_response(url, payload)
189+
sql = f"SELECT url1, text, title FROM {index} WHERE collection = '/{source}/{collection_config_folder}/'"
190+
full_text_response = self.sql_query(sql)
191+
return self._process_full_text_response(full_text_response)
103192

104-
return response
193+
@staticmethod
194+
def _process_full_text_response(full_text_response: str):
195+
return [
196+
{"url": url, "full_text": full_text, "title": title} for url, full_text, title in full_text_response["Rows"]
197+
]

0 commit comments

Comments
 (0)