Skip to content

Commit 792b66e

Browse files
committed
feat(custom-rag): add URL RAG functionality
1 parent 74ca3bd commit 792b66e

File tree

8 files changed

+247
-8
lines changed

8 files changed

+247
-8
lines changed

src/interface/app/chat/page.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1017,4 +1017,4 @@ const Chat = () => {
10171017
)
10181018
}
10191019

1020-
export default Chat
1020+
export default Chat

src/model/app/app.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1265,7 +1265,26 @@ async def custom_rag_endpoint(request: CustomRAGRequest) -> JSONResponse:
12651265
"MEMORY_SERVER_PORT", "/custom-rag", payload
12661266
) # Fetch and return streaming response
12671267

1268+
@app.post("/url-rag")   # path chat/externals.get_url_context actually posts to
@app.post("/temp-rag")  # original path kept for backward compatibility
async def url_rag_endpoint(request: dict) -> JSONResponse:
    """
    Endpoint to proxy URL-RAG requests to the Memory Service.

    Accepts the raw JSON body (query plus website/YouTube URL lists) and
    forwards it unchanged to the Memory Service's /url-rag endpoint.

    Fixes vs. the original handler:
      * renamed from custom_rag_endpoint, which shadowed the /custom-rag
        handler of the same name defined just above;
      * forwards to /url-rag — the path the Memory Service registers —
        instead of the nonexistent /temp-rag;
      * accepts a plain dict body: the Memory Service's TempRAGRequest
        requires website_urls/youtube_urls, which CustomRAGRequest
        (query only) stripped out via model_dump().

    Args:
        request (dict): JSON body containing 'query', 'website_urls' and
            'youtube_urls' (see chat/externals.get_url_context).

    Returns:
        The Memory Service's response (annotation kept as JSONResponse to
        match the sibling endpoints; call_service_endpoint may stream —
        TODO confirm).
    """
    payload = dict(request)  # defensive copy before forwarding
    return await call_service_endpoint(
        "MEMORY_SERVER_PORT", "/url-rag", payload
    )  # Fetch and return the Memory Service response
1287+
12691288
@app.post("/chat", status_code=200)
12701289
async def chat(message: Message) -> StreamingResponse:
12711290
"""

src/model/chat/chat.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ async def response_generator() -> AsyncGenerator[str, None]:
153153
user_context = None
154154
internet_context = None
155155
rag_context = None
156+
url_context = None
156157
pro_used = False
157158
note = ""
158159

@@ -221,6 +222,8 @@ async def response_generator() -> AsyncGenerator[str, None]:
221222
}) + "\n")
222223
await asyncio.sleep(0.05)
223224
rag_context = await get_rag_context()
225+
226+
url_context = await get_url_context(message.original_input)
224227

225228
personality_description = db["userData"].get("personality", "None")
226229

@@ -232,6 +235,7 @@ async def response_generator() -> AsyncGenerator[str, None]:
232235
"user_context": user_context,
233236
"internet_context": internet_context,
234237
"rag_context": rag_context,
238+
"url_context": url_context,
235239
"name": username,
236240
"personality": personality_description,
237241
},

src/model/chat/externals.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,4 +155,35 @@ async def get_rag_context(query:str) -> Dict[str, Any]:
155155
print(f"Error fetching RAG context: {e}")
156156
return {
157157
"error": f"Error calling customrag: {str(e)}"
158-
}
158+
}
159+
160+
async def get_url_context(query: str) -> str:
    """
    Fetch extracted content for any URLs found in the user's query.

    Extracts URLs from ``query``, classifies them as website or YouTube
    links, and posts them to the local app server's ``/url-rag`` proxy
    endpoint, which returns formatted context text.

    Args:
        query (str): The user's query, which may contain URLs.

    Returns:
        str: Extracted data in the format 'Source: <url>\nInformation: <content>',
            or an error message string if the request fails.
    """
    # Extract and classify URLs from the query
    url_data = extract_and_classify_urls(query)
    website_urls = url_data['website_urls']
    youtube_urls = url_data['youtube_urls']

    try:
        port = os.environ.get("APP_SERVER_PORT", "5000")
        # timeout=None: URL scraping and YouTube transcription can be slow;
        # this coroutine is awaited from an async generator, so the event
        # loop is not blocked while waiting.
        async with httpx.AsyncClient(timeout=None) as client:
            response = await client.post(
                f"http://localhost:{port}/url-rag",
                json={
                    "query": query,
                    "website_urls": website_urls,
                    "youtube_urls": youtube_urls,
                },
            )
        if response.status_code == 200:
            return response.json()["context"]
        # Fix: the original error strings said "RAG context"/"custom-rag",
        # copy-pasted from get_rag_context, which misattributed url-rag
        # failures during debugging.
        return f"Error fetching URL context: {response.text}"
    except Exception as e:  # broad by design: degrade to an error string, never crash the chat
        print(f"Error fetching URL context: {e}")
        return f"Error calling url-rag: {str(e)}"

src/model/chat/helpers.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,4 +251,31 @@ def check_uploaded_files() -> bool:
251251
upload_dir = "../../interface/uploads" # Replace with actual path
252252
if not os.path.exists(upload_dir):
253253
return False
254-
return len(os.listdir(upload_dir)) > 0
254+
return len(os.listdir(upload_dir)) > 0
255+
256+
def extract_and_classify_urls(message: str) -> dict:
    """
    Pull every URL out of a message and split them into YouTube vs. website links.

    Args:
        message (str): The user's message.

    Returns:
        dict: {'website_urls': [...], 'youtube_urls': [...]}, each list
            preserving the order in which the URLs appear in the message.
    """
    found_urls = re.findall(r'(https?://[^\s]+)', message)

    def _is_youtube(candidate: str) -> bool:
        # Same substring test the rest of the pipeline relies on.
        return 'youtube.com' in candidate or 'youtu.be' in candidate

    return {
        'website_urls': [u for u in found_urls if not _is_youtube(u)],
        'youtube_urls': [u for u in found_urls if _is_youtube(u)],
    }

src/model/chat/prompts.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,12 +93,14 @@
9393
9494
Internet Search Results (USE THIS AS ADDITIONAL CONTEXT TO RESPOND TO THE QUERY, ONLY IF PROVIDED.): {internet_context}
9595
96-
RAG Context from Files (USE THIS AS *ADDITIONAL* CONTEXT TO RESPOND TO THE QUERY, ONLY IF PROVIDED.): {rag_context}
96+
RAG Context from Files (USE THIS AS ADDITIONAL CONTEXT TO RESPOND TO THE QUERY, ONLY IF PROVIDED.): {rag_context}
97+
98+
URL Context (USE THIS AS *ADDITIONAL* CONTEXT TO RESPOND TO THE QUERY, ONLY IF PROVIDED.): {url_context}
9799
98100
Username (ONLY CALL THE USER BY THEIR NAME WHEN REQUIRED. YOU DO NOT NEED TO CALL THE USER BY THEIR NAME IN EACH MESSAGE.): {name}
99101
100102
Personality (DO NOT REPEAT THE USER'S PERSONALITY TO THEM, ONLY USE IT TO GENERATE YOUR RESPONSES OR CHANGE YOUR STYLE OF TALKING.): {personality}
101103
102104
CHAT LIKE A HUMAN WOULD. USE LOWERCASE LETTERS AND A CASUAL TONE. AVOID USING PUNCTUATION MARKS LIKE FULL STOPS OR COMMAS. KEEP YOUR RESPONSES SHORT AND ENGAGING.
103105
YOU NEED TO BE THE USER'S FRIEND - NOT AN ASSISTANT.
104-
"""
106+
"""

src/model/memory/functions.py

Lines changed: 134 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from chromadb.utils import embedding_functions
1818
import docx
1919
import PyPDF2
20-
import os
2120
import chromadb
2221
from chromadb.utils import embedding_functions
2322
from typing import List, Tuple, Dict
@@ -34,6 +33,16 @@
3433
import tempfile
3534
from pydub import AudioSegment
3635
from pydub.silence import split_on_silence
36+
import html2text
37+
import yt_dlp
38+
import speech_recognition as sr
39+
import tempfile
40+
import subprocess
41+
from pathlib import Path
42+
import re
43+
import unicodedata
44+
from pydub import AudioSegment
45+
from bs4 import BeautifulSoup
3746

3847
load_dotenv("../.env") # Load environment variables from .env file
3948

@@ -1852,4 +1861,127 @@ def format_context_with_sources(results: dict) -> str:
18521861
formatted_part = f"Source: {source_name}\nInformation: {doc}"
18531862
formatted_parts.append(formatted_part)
18541863

1855-
return "\n\n".join(formatted_parts)
1864+
return "\n\n".join(formatted_parts)
1865+
1866+
1867+
def extract_text_from_url(url: str) -> str:
    """
    Download a web page and convert its readable content to markdown text.

    Args:
        url (str): The website URL to fetch.

    Returns:
        str: The page content as markdown, or an error message string when
            the page cannot be fetched.
    """
    request_headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        page = requests.get(url, headers=request_headers, timeout=10)
        page.raise_for_status()
    except requests.RequestException as exc:
        return f"Error fetching the webpage: {exc}"

    parsed = BeautifulSoup(page.text, 'html.parser')
    # Strip elements that carry no readable content before conversion.
    for tag in parsed(['script', 'style', 'meta', 'noscript']):
        tag.extract()

    converter = html2text.HTML2Text()
    converter.ignore_links = False
    converter.ignore_images = True
    converter.ignore_tables = False

    return converter.handle(str(parsed)).strip()
1895+
1896+
# --- YouTube Transcript Extraction ---
def sanitize_filename(filename: str) -> str:
    """
    Make a video title safe to use as a filename.

    Drops characters that are illegal on common filesystems, replaces
    spaces with underscores, strips non-ASCII via NFKD normalization,
    and truncates to 200 characters.
    """
    cleaned = filename.translate(str.maketrans('', '', '<>:"/\\|?*'))
    cleaned = cleaned.replace(' ', '_')
    cleaned = unicodedata.normalize('NFKD', cleaned).encode('ASCII', 'ignore').decode()
    return cleaned[:200]
1902+
1903+
def check_ffmpeg() -> str:
    """
    Locate a usable ffmpeg executable.

    Tries the `ffmpeg` command on PATH first; if it is missing or does not
    run successfully, probes a set of common install locations.

    Returns:
        str: The command name 'ffmpeg' (when usable on PATH) or an absolute
            path to an ffmpeg executable.

    Raises:
        Exception: If ffmpeg cannot be found anywhere.
    """
    try:
        probe = subprocess.run(['ffmpeg', '-version'], capture_output=True, text=True)
        if probe.returncode == 0:
            return 'ffmpeg'
        # Fix: a present-but-broken ffmpeg (non-zero exit) previously made
        # this function fall through and implicitly return None despite the
        # `-> str` annotation; now it falls back to the path search below.
    except FileNotFoundError:
        pass
    possible_paths = [
        r'C:\ffmpeg\bin\ffmpeg.exe', r'C:\Program Files\ffmpeg\bin\ffmpeg.exe',
        r'C:\Program Files (x86)\ffmpeg\bin\ffmpeg.exe', str(Path.home() / 'ffmpeg' / 'bin' / 'ffmpeg.exe'),
        '/usr/bin/ffmpeg', '/usr/local/bin/ffmpeg', '/opt/homebrew/bin/ffmpeg',
        str(Path.home() / 'ffmpeg' / 'ffmpeg')
    ]
    for path in possible_paths:
        if os.path.isfile(path):
            return path
    raise Exception("FFmpeg not found. Please install it and add it to PATH.")
1919+
1920+
def download_audio(url: str, output_path: str) -> Optional[str]:
    """
    Download the audio track of a video as a WAV file via yt-dlp.

    Args:
        url (str): Video URL to download from.
        output_path (str): Directory to write the WAV file into.

    Returns:
        Optional[str]: Path to the downloaded WAV file, or None on failure.
    """
    try:
        ffmpeg_binary = check_ffmpeg()
        ffmpeg_dir = os.path.dirname(ffmpeg_binary)
        options = {
            'format': 'bestaudio/best',
            'outtmpl': os.path.join(output_path, '%(title).100s.%(ext)s'),
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
                'preferredquality': '192',
            }],
            # yt-dlp wants the directory that contains ffmpeg; None lets it
            # fall back to PATH when check_ffmpeg returned the bare command.
            'ffmpeg_location': ffmpeg_dir if ffmpeg_dir else None,
            'quiet': True,
            'no_warnings': True,
            'restrictfilenames': True
        }
        with yt_dlp.YoutubeDL(options) as downloader:
            info = downloader.extract_info(url, download=True)
        expected = os.path.join(output_path, f"{sanitize_filename(info['title'])}.wav")
        if not os.path.exists(expected):
            # The title-based guess can miss (template truncation, restricted
            # filenames); fall back to the first WAV actually produced.
            produced = [name for name in os.listdir(output_path) if name.endswith('.wav')]
            if produced:
                return os.path.join(output_path, produced[0])
        return expected
    except Exception as e:
        print(f"Error downloading audio: {str(e)}")
        return None
1949+
1950+
def transcribe_audio(audio_path: str, language: str = "en-US", chunk_size: int = 60000) -> Optional[str]:
    """
    Transcribe a WAV file with Google speech recognition, chunk by chunk.

    Args:
        audio_path (str): Path to the WAV file.
        language (str): Language tag passed to the recognizer.
        chunk_size (int): Chunk length in milliseconds.

    Returns:
        Optional[str]: The joined transcript (possibly empty when nothing was
            recognized), or None when the audio file does not exist.
    """
    if not os.path.exists(audio_path):
        print("Audio file not found")
        return None

    recognizer = sr.Recognizer()
    recording = AudioSegment.from_wav(audio_path)

    # Slice the recording into fixed-length pieces so each recognizer call
    # stays small.
    pieces = []
    offset = 0
    while offset < len(recording):
        pieces.append(recording[offset:offset + chunk_size])
        offset += chunk_size

    transcript_parts = []
    for index, piece in enumerate(pieces):
        fd, scratch_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        try:
            piece.export(scratch_path, format="wav")
            with sr.AudioFile(scratch_path) as source:
                captured = recognizer.listen(source)
            try:
                transcript_parts.append(
                    recognizer.recognize_google(captured, language=language)
                )
            except sr.UnknownValueError:
                print(f"Could not understand chunk {index}")
            except sr.RequestError as e:
                print(f"Speech recognition error in chunk {index}: {e}")
        finally:
            os.remove(scratch_path)

    return " ".join(transcript_parts).strip()
1975+
1976+
def generate_transcript(url: str) -> Optional[str]:
    """
    Produce a text transcript for a video URL.

    Downloads the audio into a temporary directory (cleaned up on exit) and
    runs speech recognition over it.

    Args:
        url (str): The video URL.

    Returns:
        Optional[str]: The transcript text, or None when download or
            transcription fails or yields nothing.
    """
    try:
        with tempfile.TemporaryDirectory() as workdir:
            wav_path = download_audio(url, workdir)
            if not wav_path or not os.path.exists(wav_path):
                print("Failed to download audio file")
                return None
            text = transcribe_audio(wav_path)
            return text if text else None
    except Exception as e:
        print(f"Error generating transcript: {str(e)}")
        return None

src/model/memory/memory.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,10 @@ class GraphRAGRequest(BaseModel):
9696

9797
class CustomRAGRequest(BaseModel):
9898
query: str
99-
99+
100+
class TempRAGRequest(BaseModel):
    """Request body for the /url-rag endpoint: URLs pre-extracted from a
    user query, split by type so websites are scraped and YouTube links
    are transcribed."""
    # Non-YouTube page URLs to scrape. NOTE(review): elements are presumably
    # strings — bare `list` leaves element types unvalidated; confirm against
    # the payload built in chat/externals.get_url_context.
    website_urls: list
    # YouTube video URLs to transcribe.
    youtube_urls: list
100103

101104
# --- Global Variables for Application State ---
102105
# These global variables store initialized models, runnables, database connections, and chat history.
@@ -904,6 +907,27 @@ async def custom_rag(request: CustomRAGRequest):
904907
except Exception as e:
905908
raise HTTPException(status_code=500, detail=str(e))
906909

910+
@app.post("/url-rag")
async def url_rag(request: TempRAGRequest):
    """
    Endpoint to fetch and format content from URLs.

    Scrapes each website URL and transcribes each YouTube URL, then joins
    the results into one context string of 'Source: <url>\nInformation:
    <content>' entries separated by blank lines.

    Args:
        request (TempRAGRequest): Request body containing the website and
            YouTube URL lists.

    Returns:
        dict: A dictionary with a 'context' key containing the formatted content.
    """
    context = ""
    for url in request.website_urls:
        # NOTE(review): extract_text_from_url returns an error *string* on
        # fetch failure, which is appended here unguarded — confirm that
        # surfacing fetch errors in the context is intended.
        text = extract_text_from_url(url)
        context += f"Source: {url}\nInformation: {text}\n\n"
    for url in request.youtube_urls:
        transcript = generate_transcript(url)
        # generate_transcript returns None on failure; skip those entries.
        if transcript:
            context += f"Source: {url}\nInformation: {transcript}\n\n"
    return {"context": context.strip()}
930+
907931
# --- Main execution block ---
908932
if __name__ == "__main__":
909933
"""

0 commit comments

Comments
 (0)