136 changes: 136 additions & 0 deletions examples/discord_bot/collect_qa_data.py
@@ -0,0 +1,136 @@
#!/usr/bin/env python3
"""
Script to collect top-level questions from a Discord channel for pgai bot evaluation.
Extracts only the initial questions from threads for benchmarking.
"""

import asyncio
import json
import os
from datetime import datetime

import discord
from dotenv import load_dotenv

load_dotenv()


class QuestionCollector(discord.Client):
def __init__(self, target_channel_id: int):
self.target_channel_id = target_channel_id
self.questions: list[str] = []
self.collection_done = False

        # message_content is a privileged intent: it must also be enabled for
        # the bot in the Discord Developer Portal.
        intents = discord.Intents.default()
        intents.message_content = True
        intents.guilds = True
super().__init__(intents=intents)

async def on_ready(self):
"""Called when the bot is ready."""
print(f"Logged in as {self.user}")
await self.collect_questions()
self.collection_done = True
await self.close()

async def collect_questions(self) -> list[str]:
"""Collect top-level questions from the Discord channel."""
print("Collecting questions...")

# Get the channel
channel = None
for guild in self.guilds:
channel = guild.get_channel(self.target_channel_id)
if channel:
break

if not channel:
print(f"Channel {self.target_channel_id} not found")
return []

print(f"Collecting from channel: {channel.name}")

if not isinstance(channel, discord.TextChannel):
print("Channel is not a text channel")
return []

# Collect only messages that create threads (top-level questions)
try:
async for message in channel.history(limit=None):
# Only collect messages that:
# 1. Are not from bots
# 2. Have content
# 3. Have a thread (meaning they started a conversation)
if (
not message.author.bot
and message.content.strip()
and message.thread is not None
):
self.questions.append(message.content.strip())
print(f"Found question: {message.content[:50]}...")
except Exception as e:
print(f"Error fetching channel messages: {e}")

print(f"Collected {len(self.questions)} questions")
return self.questions


async def main():
"""Main function to run the question collection."""
bot_token = os.environ.get("DISCORD_BOT_TOKEN")
if not bot_token:
print("Error: DISCORD_BOT_TOKEN not found in environment")
return

    # Hard-coded ID of the target channel (the same channel referenced by
    # <#1331981876319223879> mentions in the collected questions).
    channel_id = 1331981876319223879

collector = QuestionCollector(channel_id)

# Start the bot in a task
bot_task = asyncio.create_task(collector.start(bot_token))

# Wait for collection to complete with timeout
    start_time = asyncio.get_running_loop().time()
timeout = 30 # 30 seconds timeout

print("Waiting for collection to complete...")
while not collector.collection_done:
        elapsed = asyncio.get_running_loop().time() - start_time
if elapsed > timeout:
print(f"Timeout after {timeout} seconds")
collector.collection_done = True
break
await asyncio.sleep(1)

# Cancel the bot task if it's still running
if not bot_task.done():
bot_task.cancel()
try:
await bot_task
except asyncio.CancelledError:
pass

questions = collector.questions

if questions:
# Save to JSON file
output_file = (
f"discord_questions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
)
with open(output_file, "w", encoding="utf-8") as f:
json.dump(questions, f, indent=2, ensure_ascii=False)

print(f"Questions saved to {output_file}")
print(f"Total questions collected: {len(questions)}")

# Show first few questions
print("\nFirst few questions:")
for i, question in enumerate(questions[:5]):
preview = question[:100] + "..." if len(question) > 100 else question
print(f"{i+1}. {preview}")
else:
print("No questions collected")


if __name__ == "__main__":
asyncio.run(main())
35 changes: 35 additions & 0 deletions examples/discord_bot/discord_questions_20250619_151250.json
@@ -0,0 +1,35 @@
[
"Can you run pgai without docker on windows?",
"I was wondering for the following information. \n\nDoes the PDF parser also include images? \n\nDoes the embedding take these images into account? \n\nHow to enable this?",
"how do i add a vectorizer to an existing table",
"<#1331981876319223879> how to optimize pgvectorscale on a postgres17 timescaledb with a database of 50M+ rows with embeddings of 1024. I already use matryoshka of 512",
"<#1331981876319223879> how do i delete a vectorizer",
"<#1331981876319223879> When using the parsing options like docling do I have the posibility to select the output format after the parsing? Or is the markdown output format by default for everything?",
"<#1331981876319223879> When using Docling, are the models used to parse documents stored locally in a self-hosted Postgres database?",
"In pg_ai, is it possible to explicitly provide embedding (eg in unit tests), so it's not automatically generated?",
"fix: ```2025-06-05 12:48:28 [debug ] | docling.exceptions.ConversionError: File format not allowed: uploads/1/AmazonQDev.pdf ``` in worker",
"ERROR: Could not find a version that satisfies the requirement pgai (from versions: none) How can I solve this error when I install pgai by this command? pip install pgai",
"IN my cloudberry environment, postgresql version is 14. How can I install pgvectorscale when requirement is postgresql version 16?",
"what is your experience in chunking strategies? What are some good working strategies and what strategies work less good?",
"generate_rag_response\n\nHow to implement this function",
"I have created a storage bucket for files right now, how to create a well function vectorizer for this",
"im getting this error: \n\n[42883] ERROR: function ai. openai_embed(text, text, api_key_name => text, dimensions => integer, openai_user => text) does not exist Hint: No function matches the given name and argument types. You might need to add explicit type casts. Where: PL/ pgSQL function ai. vectorizer_embed(jsonb,text,text) line 7 at assignment SQL function \"vectorizer_embed\" statement 1\n\n\nSELECT\n chunk,\n embedding <=> ai.vectorizer_embed(2, 'fraude', 'string') as distance\nFROM risk_embeddings\nORDER BY distance;\n\nFor this",
"Hi",
"How does it work with different types of data that I should parse. Lets say I have one huge document table, which can either raw text, a file document (docx, pdf etc), or both. How does the embedding work for that?",
"How does the pgai s3 integration work?",
"```\nn8n-db=# SELECT ai.create_vectorizer(\n 'blog'::regclass,\n loading => ai.loading_column('contents'),\n embedding => ai.embedding_ollama('nomic-embed-text', 768),\n destination => ai.destination_table('blog_contents_embeddings')\n);\nERROR: function ai.loading_column(unknown) does not exist\nLINE 3: loading => ai.loading_column('contents'),\n ^\nHINT: No function matches the given name and argument types. You might need to add explicit type casts.\n```\nI am getting this error even if the function is there, when i run this query\n`SELECT routine_name, routine_type FROM information_schema.routines WHERE routine_schema = 'ai'`",
"which version of pgai has the text-to-sql feature included ?",
"can i use any hugging face model in pgai?",
"How to calculate size of index in pgvector scale",
"hello please share the link to download the library pgai vectorizer",
"Is there a way to use PGAI with Gemini?",
"Is it normal for pgvectorscale streamingdisk ann to take 3 hours to build the index for 12 million rows with 75 dimensions vector?",
"Is there a way to vectorize multiple columns of a table? for exampe, concating some columns into a single string.",
"Hello, is it possible to store chunks in the source table instead of the full documents and disable chunking in the vectorizer configuration?",
"<@1326487316499529760> Installed pgai, am I able to make responses on endpoint?\n",
"Installed pgai, am I able to make responses on endpoint?",
"Tell me about yourself <@&1249549447000490075>",
"How do I create a vectorizer with voyage ai embeddings?",
"How can I use pgai with sqlalchemy?",
"Hello World?"
]
193 changes: 193 additions & 0 deletions examples/discord_bot/eval.py
@@ -0,0 +1,193 @@
#!/usr/bin/env python3
"""
Evaluation script for pgai Discord bot.
Tests the bot's RAG pipeline against a list of questions and uses an LLM judge to evaluate responses.
"""

import asyncio
import json
import logging
import sys

from dotenv import load_dotenv

# Import the refactored functions from main.py
from pgai_discord_bot.main import (
generate_rag_response,
openai_client,
retrieve_relevant_documents,
)

load_dotenv()

# Set up logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


async def evaluate_response_with_llm(
question: str, bot_response: str, relevant_docs: str
) -> dict:
"""Use an LLM judge to evaluate if the bot response adequately answers the question."""
judge_prompt = f"""
You are an expert evaluator for a technical documentation chatbot. Your task is to evaluate whether a bot response adequately answers a user's question about pgai (a PostgreSQL AI extension).

USER QUESTION: {question}

BOT RESPONSE: {bot_response}

RELEVANT DOCUMENTATION USED: {relevant_docs}

Please evaluate the response on these criteria:
1. ACCURACY: Is the information provided factually correct based on the documentation?
2. COMPLETENESS: Does the response adequately address the user's question?
3. CLARITY: Is the response clear and easy to understand?
4. RELEVANCE: Is the response relevant to the question asked?
5. HELPFULNESS: Would this response help the user solve their problem?

Provide your evaluation in the following JSON format:
{{
"overall_score": <score from 1-10>,
"accuracy": <score from 1-10>,
"completeness": <score from 1-10>,
"clarity": <score from 1-10>,
"relevance": <score from 1-10>,
"helpfulness": <score from 1-10>,
"reasoning": "<brief explanation of your evaluation>",
"issues": "<any specific issues you identified>",
"strengths": "<any specific strengths you identified>"
}}
"""

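    # temperature=0.1 keeps the judge's scoring close to deterministic across runs.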
chat_completion = await openai_client.chat.completions.create(
messages=[{"content": judge_prompt, "role": "user"}],
model="gpt-4o",
temperature=0.1,
)

try:
# Extract JSON from the response
response_content = chat_completion.choices[0].message.content or ""
# Find JSON in the response (in case there's extra text)
start_idx = response_content.find("{")
end_idx = response_content.rfind("}") + 1
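        # If no JSON object is present, the slice below yields an invalid
        # string and json.loads() raises, which the except clause handles.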
json_str = response_content[start_idx:end_idx]
return json.loads(json_str)
except (json.JSONDecodeError, ValueError) as e:
logger.error(f"Failed to parse LLM judge response: {e}")
return {
"overall_score": 0,
"accuracy": 0,
"completeness": 0,
"clarity": 0,
"relevance": 0,
"helpfulness": 0,
"reasoning": "Failed to parse evaluation",
"issues": f"JSON parsing error: {e}",
"strengths": "N/A",
}


async def run_evaluation(questions_file: str, max_questions: int | None = None) -> None:
"""Run the complete evaluation pipeline."""
logger.info("Starting evaluation...")

# Load questions
with open(questions_file, encoding="utf-8") as f:
questions = json.load(f)

if max_questions:
questions = questions[:max_questions]
logger.info(f"Running evaluation on first {len(questions)} questions")
else:
logger.info(f"Loaded {len(questions)} questions")

results = []

for i, question in enumerate(questions, 1):
logger.info(f"Processing question {i}/{len(questions)}: {question[:100]}...")

try:
# Get relevant docs
relevant_docs = await retrieve_relevant_documents(question)

# Generate bot response using the refactored function
bot_response = await generate_rag_response(question)

# Evaluate with LLM judge
evaluation = await evaluate_response_with_llm(
question, bot_response, relevant_docs
)

result = {
"question_id": i,
"question": question,
"bot_response": bot_response,
"relevant_docs": relevant_docs,
"evaluation": evaluation,
}

results.append(result)

logger.info(
f"Question {i} - Overall Score: {evaluation.get('overall_score', 'N/A')}"
)

except Exception as e:
logger.error(f"Error processing question {i}: {e}")
results.append(
{
"question_id": i,
"question": question,
"bot_response": f"ERROR: {str(e)}",
"relevant_docs": "",
"evaluation": {
"overall_score": 0,
"reasoning": f"Processing error: {e}",
"issues": str(e),
"accuracy": 0,
"completeness": 0,
"clarity": 0,
"relevance": 0,
"helpfulness": 0,
"strengths": "N/A",
},
}
)

# Save results
output_file = "eval_results.json"
with open(output_file, "w", encoding="utf-8") as f:
json.dump(results, f, indent=2, ensure_ascii=False)

# Print summary
scores = [r["evaluation"].get("overall_score", 0) for r in results]
avg_score = sum(scores) / len(scores) if scores else 0

print("\n=== EVALUATION SUMMARY ===")
print(f"Total questions: {len(questions)}")
print(f"Average overall score: {avg_score:.2f}/10")
print("Score distribution:")
print(f" - Excellent (9-10): {sum(1 for s in scores if s >= 9)}")
print(f" - Good (7-8): {sum(1 for s in scores if 7 <= s < 9)}")
print(f" - Fair (5-6): {sum(1 for s in scores if 5 <= s < 7)}")
print(f" - Poor (1-4): {sum(1 for s in scores if 1 <= s < 5)}")
print(f" - Failed (0): {sum(1 for s in scores if s == 0)}")
print(f"\nDetailed results saved to: {output_file}")


if __name__ == "__main__":
    questions_file = (
        sys.argv[1] if len(sys.argv) > 1 else "discord_questions_20250619_151250.json"
    )
    max_questions = None
    if len(sys.argv) > 2:
        try:
            max_questions = int(sys.argv[2])
        except ValueError:
            print("Usage: python eval.py <questions_file> [max_questions]")
            sys.exit(1)
asyncio.run(run_evaluation(questions_file, max_questions))
@@ -23,7 +23,7 @@


def upgrade() -> None:
-    op.execute("CREATE EXTENSION IF NOT EXISTS ai CASCADE;")
+    op.execute("CREATE EXTENSION IF NOT EXISTS ai VERSION '0.8.0' CASCADE;")
op.create_vectorizer(
source="documents",
embedding=EmbeddingOpenaiConfig(model="text-embedding-3-small", dimensions=768),