Skip to content

Commit 792b66e

Browse files
committed
feat(custom-rag): add URL RAG functionality
1 parent 74ca3bd commit 792b66e

File tree

8 files changed

+247
-8
lines changed

8 files changed

+247
-8
lines changed

src/interface/app/chat/page.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1017,4 +1017,4 @@ const Chat = () => {
10171017
)
10181018
}
10191019

1020-
export default Chat
1020+
export default Chat

src/model/app/app.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1265,7 +1265,26 @@ async def custom_rag_endpoint(request: CustomRAGRequest) -> JSONResponse:
12651265
"MEMORY_SERVER_PORT", "/custom-rag", payload
12661266
) # Fetch and return streaming response
12671267

1268+
@app.post("/url-rag")   # path chat/externals.get_url_context actually posts to
@app.post("/temp-rag")  # original path kept for backward compatibility
async def url_rag_endpoint(request: dict) -> JSONResponse:
    """
    Endpoint to proxy URL-RAG requests to the Memory Service.

    Accepts the raw JSON body (query plus website/YouTube URL lists) and
    forwards it unchanged to the Memory Service's /url-rag endpoint.

    Fixes vs. the original handler:
      * renamed from custom_rag_endpoint, which shadowed the /custom-rag
        handler of the same name defined just above;
      * forwards to /url-rag — the path the Memory Service registers —
        instead of the nonexistent /temp-rag;
      * accepts a plain dict body: the Memory Service's TempRAGRequest
        requires website_urls/youtube_urls, which CustomRAGRequest
        (query only) stripped out via model_dump().

    Args:
        request (dict): JSON body containing 'query', 'website_urls' and
            'youtube_urls' (see chat/externals.get_url_context).

    Returns:
        The Memory Service's response (annotation kept as JSONResponse to
        match the sibling endpoints; call_service_endpoint may stream —
        TODO confirm).
    """
    payload = dict(request)  # defensive copy before forwarding
    return await call_service_endpoint(
        "MEMORY_SERVER_PORT", "/url-rag", payload
    )  # Fetch and return the Memory Service response
1287+
12691288
@app.post("/chat", status_code=200)
12701289
async def chat(message: Message) -> StreamingResponse:
12711290
"""

src/model/chat/chat.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ async def response_generator() -> AsyncGenerator[str, None]:
153153
user_context = None
154154
internet_context = None
155155
rag_context = None
156+
url_context = None
156157
pro_used = False
157158
note = ""
158159

@@ -221,6 +222,8 @@ async def response_generator() -> AsyncGenerator[str, None]:
221222
}) + "\n")
222223
await asyncio.sleep(0.05)
223224
rag_context = await get_rag_context()
225+
226+
url_context = await get_url_context(message.original_input)
224227

225228
personality_description = db["userData"].get("personality", "None")
226229

@@ -232,6 +235,7 @@ async def response_generator() -> AsyncGenerator[str, None]:
232235
"user_context": user_context,
233236
"internet_context": internet_context,
234237
"rag_context": rag_context,
238+
"url_context": url_context,
235239
"name": username,
236240
"personality": personality_description,
237241
},

src/model/chat/externals.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,4 +155,35 @@ async def get_rag_context(query:str) -> Dict[str, Any]:
155155
print(f"Error fetching RAG context: {e}")
156156
return {
157157
"error": f"Error calling customrag: {str(e)}"
158-
}
158+
}
159+
160+
async def get_url_context(query: str) -> str:
    """
    Fetch extracted content for any URLs found in the user's query.

    Extracts URLs from ``query``, classifies them as website or YouTube
    links, and posts them to the local app server's ``/url-rag`` proxy
    endpoint, which returns formatted context text.

    Args:
        query (str): The user's query, which may contain URLs.

    Returns:
        str: Extracted data in the format 'Source: <url>\nInformation: <content>',
            or an error message string if the request fails.
    """
    # Extract and classify URLs from the query
    url_data = extract_and_classify_urls(query)
    website_urls = url_data['website_urls']
    youtube_urls = url_data['youtube_urls']

    try:
        port = os.environ.get("APP_SERVER_PORT", "5000")
        # timeout=None: URL scraping and YouTube transcription can be slow;
        # this coroutine is awaited from an async generator, so the event
        # loop is not blocked while waiting.
        async with httpx.AsyncClient(timeout=None) as client:
            response = await client.post(
                f"http://localhost:{port}/url-rag",
                json={
                    "query": query,
                    "website_urls": website_urls,
                    "youtube_urls": youtube_urls,
                },
            )
        if response.status_code == 200:
            return response.json()["context"]
        # Fix: the original error strings said "RAG context"/"custom-rag",
        # copy-pasted from get_rag_context, which misattributed url-rag
        # failures during debugging.
        return f"Error fetching URL context: {response.text}"
    except Exception as e:  # broad by design: degrade to an error string, never crash the chat
        print(f"Error fetching URL context: {e}")
        return f"Error calling url-rag: {str(e)}"

src/model/chat/helpers.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,4 +251,31 @@ def check_uploaded_files() -> bool:
251251
upload_dir = "../../interface/uploads" # Replace with actual path
252252
if not os.path.exists(upload_dir):
253253
return False
254-
return len(os.listdir(upload_dir)) > 0
254+
return len(os.listdir(upload_dir)) > 0
255+
256+
def extract_and_classify_urls(message: str) -> dict:
    """
    Pull every URL out of a message and split them into YouTube vs. website links.

    Args:
        message (str): The user's message.

    Returns:
        dict: {'website_urls': [...], 'youtube_urls': [...]}, each list
            preserving the order in which the URLs appear in the message.
    """
    found_urls = re.findall(r'(https?://[^\s]+)', message)

    def _is_youtube(candidate: str) -> bool:
        # Same substring test the rest of the pipeline relies on.
        return 'youtube.com' in candidate or 'youtu.be' in candidate

    return {
        'website_urls': [u for u in found_urls if not _is_youtube(u)],
        'youtube_urls': [u for u in found_urls if _is_youtube(u)],
    }

src/model/chat/prompts.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,12 +93,14 @@
9393
9494
Internet Search Results (USE THIS AS ADDITIONAL CONTEXT TO RESPOND TO THE QUERY, ONLY IF PROVIDED.): {internet_context}
9595
96-
RAG Context from Files (USE THIS AS *ADDITIONAL* CONTEXT TO RESPOND TO THE QUERY, ONLY IF PROVIDED.): {rag_context}
96+
RAG Context from Files (USE THIS AS ADDITIONAL CONTEXT TO RESPOND TO THE QUERY, ONLY IF PROVIDED.): {rag_context}
97+
98+
URL Context (USE THIS AS *ADDITIONAL* CONTEXT TO RESPOND TO THE QUERY, ONLY IF PROVIDED.): {url_context}
9799
98100
Username (ONLY CALL THE USER BY THEIR NAME WHEN REQUIRED. YOU DO NOT NEED TO CALL THE USER BY THEIR NAME IN EACH MESSAGE.): {name}
99101
100102
Personality (DO NOT REPEAT THE USER'S PERSONALITY TO THEM, ONLY USE IT TO GENERATE YOUR RESPONSES OR CHANGE YOUR STYLE OF TALKING.): {personality}
101103
102104
CHAT LIKE A HUMAN WOULD. USE LOWERCASE LETTERS AND A CASUAL TONE. AVOID USING PUNCTUATION MARKS LIKE FULL STOPS OR COMMAS. KEEP YOUR RESPONSES SHORT AND ENGAGING.
103105
YOU NEED TO BE THE USER'S FRIEND - NOT AN ASSISTANT.
104-
"""
106+
"""

src/model/memory/functions.py

Lines changed: 134 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from chromadb.utils import embedding_functions
1818
import docx
1919
import PyPDF2
20-
import os
2120
import chromadb
2221
from chromadb.utils import embedding_functions
2322
from typing import List, Tuple, Dict
@@ -34,6 +33,16 @@
3433
import tempfile
3534
from pydub import AudioSegment
3635
from pydub.silence import split_on_silence
36+
import html2text
37+
import yt_dlp
38+
import speech_recognition as sr
39+
import tempfile
40+
import subprocess
41+
from pathlib import Path
42+
import re
43+
import unicodedata
44+
from pydub import AudioSegment
45+
from bs4 import BeautifulSoup
3746

3847
load_dotenv("../.env") # Load environment variables from .env file
3948

@@ -1852,4 +1861,127 @@ def format_context_with_sources(results: dict) -> str:
18521861
formatted_part = f"Source: {source_name}\nInformation: {doc}"
18531862
formatted_parts.append(formatted_part)
18541863

1855-
return "\n\n".join(formatted_parts)
1864+
return "\n\n".join(formatted_parts)
1865+
1866+
1867+
def extract_text_from_url(url: str) -> str:
    """
    Download a web page and convert its readable content to markdown text.

    Args:
        url (str): The website URL to fetch.

    Returns:
        str: The page content as markdown, or an error message string when
            the page cannot be fetched.
    """
    request_headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        page = requests.get(url, headers=request_headers, timeout=10)
        page.raise_for_status()
    except requests.RequestException as exc:
        return f"Error fetching the webpage: {exc}"

    parsed = BeautifulSoup(page.text, 'html.parser')
    # Strip elements that carry no readable content before conversion.
    for tag in parsed(['script', 'style', 'meta', 'noscript']):
        tag.extract()

    converter = html2text.HTML2Text()
    converter.ignore_links = False
    converter.ignore_images = True
    converter.ignore_tables = False

    return converter.handle(str(parsed)).strip()
1895+
1896+
# --- YouTube Transcript Extraction ---
def sanitize_filename(filename: str) -> str:
    """
    Make a video title safe to use as a filename.

    Drops characters that are illegal on common filesystems, replaces
    spaces with underscores, strips non-ASCII via NFKD normalization,
    and truncates to 200 characters.
    """
    cleaned = filename.translate(str.maketrans('', '', '<>:"/\\|?*'))
    cleaned = cleaned.replace(' ', '_')
    cleaned = unicodedata.normalize('NFKD', cleaned).encode('ASCII', 'ignore').decode()
    return cleaned[:200]
1902+
1903+
def check_ffmpeg() -> str:
    """
    Locate a usable ffmpeg executable.

    Tries the `ffmpeg` command on PATH first; if it is missing or does not
    run successfully, probes a set of common install locations.

    Returns:
        str: The command name 'ffmpeg' (when usable on PATH) or an absolute
            path to an ffmpeg executable.

    Raises:
        Exception: If ffmpeg cannot be found anywhere.
    """
    try:
        probe = subprocess.run(['ffmpeg', '-version'], capture_output=True, text=True)
        if probe.returncode == 0:
            return 'ffmpeg'
        # Fix: a present-but-broken ffmpeg (non-zero exit) previously made
        # this function fall through and implicitly return None despite the
        # `-> str` annotation; now it falls back to the path search below.
    except FileNotFoundError:
        pass
    possible_paths = [
        r'C:\ffmpeg\bin\ffmpeg.exe', r'C:\Program Files\ffmpeg\bin\ffmpeg.exe',
        r'C:\Program Files (x86)\ffmpeg\bin\ffmpeg.exe', str(Path.home() / 'ffmpeg' / 'bin' / 'ffmpeg.exe'),
        '/usr/bin/ffmpeg', '/usr/local/bin/ffmpeg', '/opt/homebrew/bin/ffmpeg',
        str(Path.home() / 'ffmpeg' / 'ffmpeg')
    ]
    for path in possible_paths:
        if os.path.isfile(path):
            return path
    raise Exception("FFmpeg not found. Please install it and add it to PATH.")
1919+
1920+
def download_audio(url: str, output_path: str) -> Optional[str]:
    """
    Download the audio track of a video as a WAV file via yt-dlp.

    Args:
        url (str): Video URL to download from.
        output_path (str): Directory to write the WAV file into.

    Returns:
        Optional[str]: Path to the downloaded WAV file, or None on failure.
    """
    try:
        ffmpeg_binary = check_ffmpeg()
        ffmpeg_dir = os.path.dirname(ffmpeg_binary)
        options = {
            'format': 'bestaudio/best',
            'outtmpl': os.path.join(output_path, '%(title).100s.%(ext)s'),
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
                'preferredquality': '192',
            }],
            # yt-dlp wants the directory that contains ffmpeg; None lets it
            # fall back to PATH when check_ffmpeg returned the bare command.
            'ffmpeg_location': ffmpeg_dir if ffmpeg_dir else None,
            'quiet': True,
            'no_warnings': True,
            'restrictfilenames': True
        }
        with yt_dlp.YoutubeDL(options) as downloader:
            info = downloader.extract_info(url, download=True)
        expected = os.path.join(output_path, f"{sanitize_filename(info['title'])}.wav")
        if not os.path.exists(expected):
            # The title-based guess can miss (template truncation, restricted
            # filenames); fall back to the first WAV actually produced.
            produced = [name for name in os.listdir(output_path) if name.endswith('.wav')]
            if produced:
                return os.path.join(output_path, produced[0])
        return expected
    except Exception as e:
        print(f"Error downloading audio: {str(e)}")
        return None
1949+
1950+
def transcribe_audio(audio_path: str, language: str = "en-US", chunk_size: int = 60000) -> Optional[str]:
    """
    Transcribe a WAV file with Google speech recognition, chunk by chunk.

    Args:
        audio_path (str): Path to the WAV file.
        language (str): Language tag passed to the recognizer.
        chunk_size (int): Chunk length in milliseconds.

    Returns:
        Optional[str]: The joined transcript (possibly empty when nothing was
            recognized), or None when the audio file does not exist.
    """
    if not os.path.exists(audio_path):
        print("Audio file not found")
        return None

    recognizer = sr.Recognizer()
    recording = AudioSegment.from_wav(audio_path)

    # Slice the recording into fixed-length pieces so each recognizer call
    # stays small.
    pieces = []
    offset = 0
    while offset < len(recording):
        pieces.append(recording[offset:offset + chunk_size])
        offset += chunk_size

    transcript_parts = []
    for index, piece in enumerate(pieces):
        fd, scratch_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        try:
            piece.export(scratch_path, format="wav")
            with sr.AudioFile(scratch_path) as source:
                captured = recognizer.listen(source)
            try:
                transcript_parts.append(
                    recognizer.recognize_google(captured, language=language)
                )
            except sr.UnknownValueError:
                print(f"Could not understand chunk {index}")
            except sr.RequestError as e:
                print(f"Speech recognition error in chunk {index}: {e}")
        finally:
            os.remove(scratch_path)

    return " ".join(transcript_parts).strip()
1975+
1976+
def generate_transcript(url: str) -> Optional[str]:
    """
    Produce a text transcript for a video URL.

    Downloads the audio into a temporary directory (cleaned up on exit) and
    runs speech recognition over it.

    Args:
        url (str): The video URL.

    Returns:
        Optional[str]: The transcript text, or None when download or
            transcription fails or yields nothing.
    """
    try:
        with tempfile.TemporaryDirectory() as workdir:
            wav_path = download_audio(url, workdir)
            if not wav_path or not os.path.exists(wav_path):
                print("Failed to download audio file")
                return None
            text = transcribe_audio(wav_path)
            return text if text else None
    except Exception as e:
        print(f"Error generating transcript: {str(e)}")
        return None

src/model/memory/memory.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,10 @@ class GraphRAGRequest(BaseModel):
9696

9797
class CustomRAGRequest(BaseModel):
9898
query: str
99-
99+
100+
class TempRAGRequest(BaseModel):
    """Request body for the /url-rag endpoint: URLs pre-extracted from a
    user query, split by type so websites are scraped and YouTube links
    are transcribed."""
    # Non-YouTube page URLs to scrape. NOTE(review): elements are presumably
    # strings — bare `list` leaves element types unvalidated; confirm against
    # the payload built in chat/externals.get_url_context.
    website_urls: list
    # YouTube video URLs to transcribe.
    youtube_urls: list
100103

101104
# --- Global Variables for Application State ---
102105
# These global variables store initialized models, runnables, database connections, and chat history.
@@ -904,6 +907,27 @@ async def custom_rag(request: CustomRAGRequest):
904907
except Exception as e:
905908
raise HTTPException(status_code=500, detail=str(e))
906909

910+
@app.post("/url-rag")
async def url_rag(request: TempRAGRequest):
    """
    Endpoint to fetch and format content from URLs.

    Scrapes each website URL and transcribes each YouTube URL, then joins
    the results into one context string of 'Source: <url>\nInformation:
    <content>' entries separated by blank lines.

    Args:
        request (TempRAGRequest): Request body containing the website and
            YouTube URL lists.

    Returns:
        dict: A dictionary with a 'context' key containing the formatted content.
    """
    context = ""
    for url in request.website_urls:
        # NOTE(review): extract_text_from_url returns an error *string* on
        # fetch failure, which is appended here unguarded — confirm that
        # surfacing fetch errors in the context is intended.
        text = extract_text_from_url(url)
        context += f"Source: {url}\nInformation: {text}\n\n"
    for url in request.youtube_urls:
        transcript = generate_transcript(url)
        # generate_transcript returns None on failure; skip those entries.
        if transcript:
            context += f"Source: {url}\nInformation: {transcript}\n\n"
    return {"context": context.strip()}
930+
907931
# --- Main execution block ---
908932
if __name__ == "__main__":
909933
"""

0 commit comments

Comments
 (0)