Skip to content

Commit f7e41c3

Browse files
authored
Merge pull request #7 from Kode-Rex/search
Search added in
2 parents eabc68f + 8945d09 commit f7e41c3

File tree

8 files changed

+422
-27
lines changed

8 files changed

+422
-27
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ Using the `@Web Cat` GPT enhances ideation by seamlessly integrating web content
99
## Features
1010
- **Content Extraction**: Utilizes the readability library for clean text extraction.
1111
- **Text Processing**: Further processes extracted content for improved usability.
12+
- **Search Functionality**: Integrates with Serper.dev to provide web search capabilities.
1213

1314
## Getting Started
1415

@@ -40,4 +41,6 @@ cd src
4041
func start
4142
curl -X POST http://localhost:7071/api/scrape -H "Content-Type: application/json" -d "{\"url\":\"https://example.com\"}" # text only
4243
curl -X POST http://localhost:7071/api/scrape_with_images -H "Content-Type: application/json" -d "{\"url\":\"https://bigmedium.com/speaking/sentient-design-josh-clark-talk.html\"}" #text and images
44+
curl -X POST http://localhost:7071/api/set_api_key -H "Content-Type: application/json" -d "{\"api_key\":\"your_serper_api_key\"}" # set Serper API key
45+
curl -X POST http://localhost:7071/api/search -H "Content-Type: application/json" -d "{\"query\":\"your search query\"}" # search and get content
4346
```

TODO.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# WebCat Project TODOs
2+
3+
## Completed
4+
- [x] Set up proper virtual environment for Azure Functions
5+
- [x] Install required dependencies in Python 3.11 (compatible with Azure Functions runtime)
6+
- [x] Fix readability module import issue
7+
- [x] Fix text encoding/decoding in the content scraping
8+
- [x] Add clean_text function to normalize and format scraped text
9+
- [x] Remove 12ft.io fallback from scraping functions
10+
- [x] Create search endpoint using Serper.dev API
11+
- [x] Add API key management functionality
12+
- [x] Add result count in search response
13+
- [x] Integrate with Serper.dev search API to provide search functionality
14+
- [x] Develop a search endpoint that fetches and processes search results
15+
16+
## Todo
17+
- [ ] Add error handling for different HTTP status codes from source websites
18+
- [ ] Implement caching mechanism for frequently accessed content
19+
- [ ] Add rate limiting for API endpoints
20+
- [ ] Create documentation for all API endpoints
21+
- [ ] Add tests for each function
22+
- [ ] Implement logging improvements for better debugging
23+
- [ ] Add configuration options for search parameters (country, language)
24+
- [ ] Create a frontend interface for the API
25+
- [ ] Create appropriate rate limiting and caching strategies for search API
26+
- [ ] Add authentication for search functionality
27+
- [ ] Update documentation with search API usage examples

src/function_app.py

Lines changed: 150 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,20 @@
11
import json
22
import azure.functions as func
33
import logging
4-
from readability.readability import Document
4+
from readability import Document
55
import requests
66
from bs4 import BeautifulSoup
77
import random
88
import time
99
from urllib.parse import urlparse
10+
import os
11+
from dotenv import load_dotenv
12+
13+
# Load environment variables from .env file
14+
load_dotenv()
15+
16+
# In-memory API key (fallback if environment variable is not set)
17+
SERPER_API_KEY = os.environ.get("SERPER_API_KEY", "")
1018

1119
app = func.FunctionApp()
1220

@@ -19,11 +27,22 @@
1927

2028
def fetch_content(url, headers):
    """Fetch *url* and return its readability-extracted main content as a BeautifulSoup tree.

    Args:
        url: The page URL to fetch.
        headers: HTTP headers dict (typically carries a randomized User-Agent).

    Returns:
        BeautifulSoup: parsed tree of the article summary HTML.

    Raises:
        requests.RequestException: on network failure (the caller retries via backoff).
    """
    response = requests.get(url, headers=headers)
    # Decode explicitly so readability sees str, not bytes. errors='replace'
    # prevents a UnicodeDecodeError crash when the server advertises a wrong
    # charset — response.encoding comes from headers and can be inaccurate.
    html_content = response.content.decode(response.encoding or 'utf-8', errors='replace')
    doc = Document(html_content)
    summary_html = doc.summary(html_partial=True)
    soup = BeautifulSoup(summary_html, 'html.parser')
    return soup
2635

36+
def clean_text(text):
    """Collapse all whitespace in scraped text into single spaces.

    Args:
        text: Raw text extracted from a page (may be None or empty).

    Returns:
        str: the text with every run of whitespace (spaces, tabs, newlines)
        reduced to a single space, or "" for falsy input.
    """
    if not text:
        return ""
    # str.split() with no argument splits on any whitespace run, so one join
    # normalizes everything in a single pass. The previous per-line
    # strip/newline-join step was dead work: this final space-join collapsed
    # the newlines it produced anyway, so the output is unchanged.
    return ' '.join(text.split())
45+
2746
def try_fetch_with_backoff(url, headers, attempts=3, backoff_factor=2):
2847
for attempt in range(attempts):
2948
try:
@@ -52,16 +71,11 @@ def scrape(req: func.HttpRequest) -> func.HttpResponse:
5271
try:
5372
soup = try_fetch_with_backoff(url, headers)
5473
except Exception as e:
55-
logging.error(f"Initial requests failed: {str(e)}")
56-
proxy_url = f"https://12ft.io/{url}"
57-
logging.info(f"Retrying with proxy: {proxy_url}")
58-
try:
59-
soup = try_fetch_with_backoff(proxy_url, headers)
60-
except Exception as e:
61-
logging.error(f"Proxy requests failed: {str(e)}")
62-
return func.HttpResponse(f"Error: Failed to scrape the URL - {str(e)}", status_code=500)
74+
logging.error(f"Requests failed: {str(e)}")
75+
return func.HttpResponse(f"Error: Failed to scrape the URL - {str(e)}", status_code=500)
6376

64-
content = soup.get_text(separator='\n').strip()
77+
raw_content = soup.get_text(separator=' ').strip()
78+
content = clean_text(raw_content)
6579

6680
return func.HttpResponse(content, mimetype="text/plain")
6781
except Exception as e:
@@ -83,31 +97,141 @@ def scrape_with_images(req: func.HttpRequest) -> func.HttpResponse:
8397
try:
8498
soup = try_fetch_with_backoff(url, headers)
8599
except Exception as e:
86-
logging.error(f"Initial requests failed: {str(e)}")
87-
proxy_url = f"https://12ft.io/{url}"
88-
logging.info(f"Retrying with proxy: {proxy_url}")
89-
try:
90-
soup = try_fetch_with_backoff(proxy_url, headers)
91-
except Exception as e:
92-
logging.error(f"Proxy requests failed: {str(e)}")
93-
return func.HttpResponse(f"Error: Failed to scrape the URL - {str(e)}", status_code=500)
100+
logging.error(f"Requests failed: {str(e)}")
101+
return func.HttpResponse(f"Error: Failed to scrape the URL - {str(e)}", status_code=500)
94102

95-
content = ''
103+
text_parts = []
104+
images = []
105+
106+
# Extract text and images separately
96107
for element in soup.descendants:
97-
if isinstance(element, str):
98-
content += element.strip() + '\n'
108+
if isinstance(element, str) and element.strip():
109+
text_parts.append(element.strip())
99110
elif element.name == 'img':
100111
img_url = element.get('src')
101112
if img_url and img_url.startswith(('http://', 'https://')):
102-
content += f'\n{img_url}\n'
103-
104-
content = content.strip()
113+
images.append(img_url)
114+
115+
# Clean and join text
116+
text_content = clean_text(' '.join(text_parts))
117+
118+
# Add images after the text
119+
content = text_content
120+
for img_url in images:
121+
content += f'\n\n{img_url}'
105122

106123
response_data = {
107124
"content": content
108125
}
109126

110-
return func.HttpResponse(json.dumps(response_data), mimetype="application/json")
127+
return func.HttpResponse(
128+
json.dumps(response_data, ensure_ascii=False),
129+
mimetype="application/json"
130+
)
111131
except Exception as e:
112132
logging.error(f"Error: {str(e)}")
113133
return func.HttpResponse(f"Error: Failed to scrape the URL - {str(e)}", status_code=500)
134+
135+
@app.route(route="set_api_key", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS)
def set_api_key(req: func.HttpRequest) -> func.HttpResponse:
    """Store a Serper API key in the module-level fallback variable.

    Expects a JSON body of the form ``{"api_key": "..."}``.

    NOTE(review): this endpoint is anonymous and mutates process-global
    state, so any caller can overwrite the key for every user of this
    function instance — confirm that is acceptable before exposing publicly.
    """
    global SERPER_API_KEY
    try:
        body = req.get_json()
        new_key = body.get('api_key')

        # Reject empty/missing keys before touching the global.
        if not new_key:
            return func.HttpResponse("Error: Missing API key", status_code=400)

        SERPER_API_KEY = new_key
        return func.HttpResponse("API key set successfully", status_code=200)
    except Exception as e:
        logging.error(f"Error setting API key: {str(e)}")
        return func.HttpResponse(f"Error: {str(e)}", status_code=500)
152+
153+
@app.route(route="search", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS)
def search(req: func.HttpRequest) -> func.HttpResponse:
    """Search the web via Serper.dev and return the top results with scraped page content.

    Expects JSON body ``{"query": "...", "api_key": optional per-request override}``.

    Returns:
        200: JSON {"query", "result_count", "results": [{title, url, snippet, content}]}
        400: missing query or no API key configured
        404: Serper returned no organic results
        500: Serper API error or any unexpected failure
    """
    try:
        data = req.get_json()
        query = data.get('query')
        # Per-request key overrides the module-level fallback.
        api_key = data.get('api_key') or SERPER_API_KEY

        if not query:
            return func.HttpResponse("Error: Missing search query", status_code=400)

        if not api_key:
            return func.HttpResponse("Error: Serper API key not configured. Please use the /api/set_api_key endpoint first.", status_code=400)

        logging.info(f'search [{query}]')

        # Call Serper.dev API to get search results.
        serper_response = requests.post(
            "https://google.serper.dev/search",
            headers={
                'X-API-KEY': api_key,
                'Content-Type': 'application/json'
            },
            json={
                'q': query,
                'gl': 'us',
                'hl': 'en'
            },
        )
        # Fail loudly on an API error (bad key, quota exhausted) instead of
        # parsing the error body and reporting a misleading "no results" 404;
        # the except below converts this into a 500 with the real cause.
        serper_response.raise_for_status()
        search_results = serper_response.json()

        if 'organic' not in search_results or not search_results['organic']:
            return func.HttpResponse("No search results found", status_code=404)

        # One randomized User-Agent reused for all result scrapes.
        scrape_headers = {'User-Agent': random.choice(USER_AGENTS)}

        # Scrape the top 3 organic results; entries without a link are skipped.
        results_with_content = [
            _fetch_result_content(result, scrape_headers)
            for result in search_results['organic'][:3]
            if result.get('link')
        ]

        # Include the query and result count in the response.
        return func.HttpResponse(
            json.dumps({
                "query": query,
                "result_count": len(results_with_content),
                "results": results_with_content
            }, ensure_ascii=False),
            mimetype="application/json"
        )

    except Exception as e:
        logging.error(f"Search error: {str(e)}")
        return func.HttpResponse(f"Error: {str(e)}", status_code=500)


def _fetch_result_content(result, headers):
    """Scrape one Serper organic result's page; on failure embed the error text in 'content'."""
    url = result.get('link')
    entry = {
        'title': result.get('title'),
        'url': url,
        'snippet': result.get('snippet'),
    }
    try:
        # Reuse the existing scrape pipeline.
        soup = try_fetch_with_backoff(url, headers)
        # Space separator + clean_text mirrors the /scrape endpoint's output format.
        entry['content'] = clean_text(soup.get_text(separator=' ').strip())
    except Exception as e:
        logging.error(f"Failed to scrape {url}: {str(e)}")
        entry['content'] = f"Error: Failed to scrape content - {str(e)}"
    return entry

src/local.settings.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"Values": {
44
"FUNCTIONS_WORKER_RUNTIME": "python",
55
"AzureWebJobsFeatureFlags": "EnableWorkerIndexing",
6-
"AzureWebJobsStorage": ""
6+
"AzureWebJobsStorage": "",
7+
"PYTHON_ISOLATE_WORKER_DEPENDENCIES": "true"
78
}
89
}

src/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,6 @@ requests
77
beautifulsoup4
88
readability-lxml
99
lxml_html_clean
10+
python-dotenv
11+
pytest
1012

src/run_tests.sh

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/bin/bash
# Run the WebCat test suite from inside the src virtual environment.

# Work from the directory containing this script (src/).
cd "$(dirname "$0")"

# The tests require the project's virtual environment under src/.venv.
if [ ! -d ".venv" ]; then
    echo "Error: Virtual environment not found in src/.venv"
    echo "Please create a virtual environment and install dependencies first."
    exit 1
fi

echo "Activating virtual environment..."
source .venv/bin/activate

# Remember src's absolute path so function_app imports resolve from tests/.
SRC_PATH=$(pwd)

echo "Running WebCat tests..."
cd ..
PYTHONPATH="$SRC_PATH:$PYTHONPATH" python -m pytest -v tests/

# Propagate pytest's exit status to the caller.
exit $?

tests/run_all_tests.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/usr/bin/env python
"""
Script to run all tests for the WebCat project.

Inserts the src directory onto sys.path so tests can import function_app,
then delegates to pytest and exits with its status code.
"""
import os
import sys
import pytest

# Absolute path of this tests/ directory, computed once and reused below.
_TESTS_DIR = os.path.dirname(os.path.abspath(__file__))

# Make src/ importable (tests import function_app directly).
sys.path.insert(0, os.path.join(os.path.dirname(_TESTS_DIR), "src"))


def main():
    """Run every test in the tests directory and return pytest's exit code."""
    print("Running WebCat tests...")
    exit_code = pytest.main(["-v", _TESTS_DIR])

    if exit_code == 0:
        print("All tests passed successfully!")
    else:
        print(f"Tests completed with exit code: {exit_code}")

    return exit_code


if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)