Commit 742d52a

Add SearchApi integration
1 parent 5951179 commit 742d52a

4 files changed: 111 additions, 3 deletions

pipelines/examples/agents/react_example.py

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@

 # yapf: disable
 parser = argparse.ArgumentParser()
-parser.add_argument("--search_api_key", default=None, type=str, help="The Serper.dev or SerpAPI key.")
+parser.add_argument("--search_api_key", default=None, type=str, help="The Serper.dev, SerpAPI or SearchApi.io key.")
 parser.add_argument('--llm_name', choices=['THUDM/chatglm-6b', "THUDM/chatglm-6b-v1.1", "gpt-3.5-turbo", "gpt-4"], default="THUDM/chatglm-6b-v1.1", help="The chatbot models ")
 parser.add_argument("--api_key", default=None, type=str, help="The API Key.")
 args = parser.parse_args()
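With this change, the ReAct example can presumably be pointed at SearchApi.io as well, e.g. `python react_example.py --search_api_key <your SearchApi.io key> --api_key <your LLM API key>`; the bracketed values are placeholders and the remaining arguments keep their defaults.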

pipelines/examples/agents/react_example_cn.py

Lines changed: 2 additions & 2 deletions
@@ -60,15 +60,15 @@
 parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to run dense_qa system, defaults to gpu.")
 parser.add_argument("--index_name", default='dureader_index', type=str, help="The ann index name of ANN.")
 parser.add_argument("--search_engine", choices=['faiss', 'milvus'], default="faiss", help="The type of ANN search engine.")
-parser.add_argument("--retriever", choices=['dense', 'SerperDev', 'SerpAPI'], default="dense", help="The type of Retriever.")
+parser.add_argument("--retriever", choices=['dense', 'SerperDev', 'SerpAPI', 'SearchApi'], default="dense", help="The type of Retriever.")
 parser.add_argument("--max_seq_len_query", default=64, type=int, help="The maximum total length of query after tokenization.")
 parser.add_argument("--max_seq_len_passage", default=256, type=int, help="The maximum total length of passage after tokenization.")
 parser.add_argument("--retriever_batch_size", default=16, type=int, help="The batch size of retriever to extract passage embedding for building ANN index.")
 parser.add_argument("--query_embedding_model", default="rocketqa-zh-base-query-encoder", type=str, help="The query_embedding_model path")
 parser.add_argument("--passage_embedding_model", default="rocketqa-zh-base-query-encoder", type=str, help="The passage_embedding_model path")
 parser.add_argument("--params_path", default="checkpoints/model_40/model_state.pdparams", type=str, help="The checkpoint path")
 parser.add_argument("--embedding_dim", default=768, type=int, help="The embedding_dim of index")
-parser.add_argument("--search_api_key", default=None, type=str, help="The Serper.dev or SerpAPI key.")
+parser.add_argument("--search_api_key", default=None, type=str, help="The Serper.dev, SerpAPI or SearchApi.io key.")
 parser.add_argument('--embed_title', default=False, type=bool, help="The title to be embedded into embedding")
 parser.add_argument('--model_type', choices=['ernie_search', 'ernie', 'bert', 'neural_search'], default="ernie", help="the ernie model types")
 parser.add_argument('--llm_name', choices=['ernie-bot', 'THUDM/chatglm-6b', "gpt-3.5-turbo", "gpt-4"], default="THUDM/chatglm-6b", help="The chatbot models ")
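With `SearchApi` registered as a retriever choice, the Chinese ReAct example can presumably switch web retrieval to the hosted engine with something like `python react_example_cn.py --retriever SearchApi --search_api_key <your SearchApi.io key>`; the key is a placeholder and all other arguments keep their defaults.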

pipelines/pipelines/nodes/search_engine/providers.py

Lines changed: 107 additions & 0 deletions
@@ -239,3 +239,110 @@ def search(self, query: str, **kwargs) -> List[Document]:
         logger.debug("Serper.dev API returned %s documents for the query '%s'", len(documents), query)
         result_docs = documents[:top_k]
         return self.score_results(result_docs, len(answer_box) > 0)
+
+
+class SearchApi(SearchEngine):
+    """
+    SearchApi is a real-time search engine that provides an API to access search results from Google, Google Scholar, YouTube,
+    YouTube transcripts and more. See the [SearchApi website](https://www.searchapi.io/) for more details.
+    """
+
+    def __init__(
+        self,
+        api_key: str,
+        top_k: Optional[int] = 10,
+        engine: Optional[str] = "google",
+        search_engine_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        """
+        :param api_key: API key for SearchApi.
+        :param top_k: Number of results to return.
+        :param engine: Search engine to use, for example google, google_scholar, youtube, youtube_transcripts.
+        See the [SearchApi documentation](https://www.searchapi.io/docs/google) for the full list of supported engines.
+        :param search_engine_kwargs: Additional parameters passed to the SearchApi.
+        See the [SearchApi documentation](https://www.searchapi.io/docs/google) for the full list of supported parameters.
+        """
+        super().__init__()
+        self.params_dict: Dict[str, Union[str, int, float]] = {}
+        self.api_key = api_key
+        self.kwargs = search_engine_kwargs if search_engine_kwargs else {}
+        self.engine = engine
+        self.top_k = top_k
+
+    def search(self, query: str, **kwargs) -> List[Document]:
+        """
+        :param query: Query string.
+        :param kwargs: Additional parameters passed to the SearchApi. For example, you can set 'location' to 'New York,United States'
+        to localize search to the specific location.
+        :return: List[Document]
+        """
+        kwargs = {**self.kwargs, **kwargs}
+        top_k = kwargs.pop("top_k", self.top_k)
+        url = "https://www.searchapi.io/api/v1/search"
+
+        params = {"q": query, **kwargs}
+        headers = {"Authorization": f"Bearer {self.api_key}", "X-SearchApi-Source": "PaddleNLP"}
+
+        if self.engine:
+            params["engine"] = self.engine
+        response = requests.get(url, params=params, headers=headers, timeout=90)
+
+        if response.status_code != 200:
+            raise Exception(f"Error while querying {self.__class__.__name__}: {response.text}")
+
+        json_content = json.loads(response.text)
+        documents = []
+        has_answer_box = False
+
+        if json_content.get("answer_box"):
+            if json_content["answer_box"].get("organic_result"):
+                title = json_content["answer_box"].get("organic_result").get("title", "")
+                link = json_content["answer_box"].get("organic_result").get("link", "")
+            if json_content["answer_box"].get("type") == "population_graph":
+                title = json_content["answer_box"].get("place", "")
+                link = json_content["answer_box"].get("explore_more_link", "")
+
+            title = json_content["answer_box"].get("title", "")
+            link = json_content["answer_box"].get("link")
+            content = json_content["answer_box"].get("answer") or json_content["answer_box"].get("snippet")
+
+            if link and content:
+                has_answer_box = True
+                documents.append(Document.from_dict({"title": title, "content": content, "link": link}))
+
+        if json_content.get("knowledge_graph"):
+            if json_content["knowledge_graph"].get("source"):
+                link = json_content["knowledge_graph"].get("source").get("link", "")
+
+            link = json_content["knowledge_graph"].get("website", "")
+            content = json_content["knowledge_graph"].get("description")
+
+            if link and content:
+                documents.append(
+                    Document.from_dict(
+                        {"title": json_content["knowledge_graph"].get("title", ""), "content": content, "link": link}
+                    )
+                )
+
+        documents += [
+            Document.from_dict({"title": c["title"], "content": c.get("snippet", ""), "link": c["link"]})
+            for c in json_content["organic_results"]
+        ]
+
+        if json_content.get("related_questions"):
+            for question in json_content["related_questions"]:
+                if question.get("source"):
+                    link = question.get("source").get("link", "")
+                else:
+                    link = ""
+
+                content = question.get("answer", "")
+
+                if link and content:
+                    documents.append(
+                        Document.from_dict({"title": question.get("question", ""), "content": content, "link": link})
+                    )
+
+        logger.debug("SearchApi returned %s documents for the query '%s'", len(documents), query)
+        result_docs = documents[:top_k]
+        return self.score_results(result_docs, has_answer_box)
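For context, here is a minimal usage sketch of the new provider class as committed above. The import path and the assumption that "title" and "link" end up in each Document's meta are guesses about this codebase, and the API key is a placeholder:

```python
# Minimal sketch, not part of the commit: exercise the new SearchApi provider directly.
# Assumptions: the module is importable as pipelines.nodes.search_engine.providers, and
# Document.from_dict stores extra keys such as "title" and "link" in Document.meta.
from pipelines.nodes.search_engine.providers import SearchApi

engine = SearchApi(api_key="<your SearchApi.io key>", top_k=5, engine="google")
docs = engine.search("Who climbed Mount Everest first?")
for doc in docs:
    # Each result is a scored pipelines Document built from answer_box / knowledge_graph /
    # organic_results / related_questions entries in the SearchApi response.
    print(doc.meta.get("title"), "->", doc.meta.get("link"))
```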

pipelines/pipelines/nodes/search_engine/web.py

Lines changed: 1 addition & 0 deletions
@@ -28,6 +28,7 @@ class WebSearch(BaseComponent):
 
     WebSerach currently supports the following search engines providers (bridges):
     - SerperDev (default)
+    - SearchApi
     - SerpAPI
     - BingAPI
 
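A sketch of selecting the new bridge through WebSearch follows. The constructor and run() signatures shown here mirror the upstream haystack WebSearch node and are an assumption about this port, not something confirmed by the diff:

```python
# Sketch only: route web search through the new SearchApi bridge.
# Assumed (not shown in the diff): WebSearch(api_key=..., search_engine_provider=...) and
# a run(query=...) that returns ({"documents": [...]}, <edge name>), as in upstream haystack.
from pipelines.nodes.search_engine.web import WebSearch

web_search = WebSearch(api_key="<your SearchApi.io key>", search_engine_provider="SearchApi")
output, _ = web_search.run(query="What is the capital of Australia?")
for doc in output["documents"][:3]:
    print(doc)
```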
