context benchmark script and data

ds-jakub-cierocki · ds-jakub-cierocki · commit dd8b339f2a34 · 2024-07-22T11:18:12.000+02:00
diff --git a/benchmark/dbally_benchmark/context_benchmark.py b/benchmark/dbally_benchmark/context_benchmark.py
@@ -0,0 +1,235 @@
+# pylint: disable=missing-return-doc, missing-param-doc, missing-function-docstring
+import dbally
+import asyncio
+import typing
+import json
+import traceback
+import os
+
+import tqdm.asyncio
+import sqlalchemy
+import pydantic
+from typing_extensions import TypeAlias
+from copy import deepcopy
+from sqlalchemy import create_engine
+from sqlalchemy.ext.automap import automap_base, AutomapBase
+from dataclasses import dataclass, field
+
+from dbally import decorators, SqlAlchemyBaseView
+from dbally.audit.event_handlers.cli_event_handler import CLIEventHandler
+from dbally.llms.litellm import LiteLLM
+from dbally.context import BaseCallerContext
+
+
+# Location of the example recruiting SQLite database. It is turned into an absolute path
+# with os.path.abspath below, so it is resolved against the current working directory.
+SQLITE_DB_FILE_REL_PATH = "../../examples/recruiting/data/candidates.db"
+engine = create_engine(f"sqlite:///{os.path.abspath(SQLITE_DB_FILE_REL_PATH)}")
+
+# Reflect the schema of the existing database into automapped ORM classes.
+Base: AutomapBase = automap_base()
+Base.prepare(autoload_with=engine)
+
+# ORM class automapped from the `candidates` table.
+Candidate = Base.classes.candidates
+
+
+class MyData(BaseCallerContext, pydantic.BaseModel):
+    """
+    Caller context carrying the asker's own profile data.
+
+    `CandidateView` filters that accept `MyData` read the matching attribute
+    from it (e.g. `country`, `university`, `first_name`, `years_of_experience`).
+    """
+
+    first_name: str
+    surname: str
+    position: str
+    years_of_experience: int
+    university: str
+    skills: typing.List[str]
+    country: str
+
+
+class OpenPosition(BaseCallerContext, pydantic.BaseModel):
+    """
+    Caller context describing an open position.
+
+    `CandidateView` filters that accept `OpenPosition` read `position`
+    or `min_years_of_experience` from it.
+    """
+
+    position: str
+    min_years_of_experience: int
+    graduated_from_university: str
+    required_skills: typing.List[str]
+
+
+class CandidateView(SqlAlchemyBaseView):
+    """
+    A view for retrieving candidates from the database.
+    """
+
+    def get_select(self) -> sqlalchemy.Select:
+        """
+        Creates the initial SqlAlchemy select object, which will be used to build the query.
+        """
+        return sqlalchemy.select(Candidate)
+
+    @decorators.view_filter()
+    def at_least_experience(self, years: typing.Union[int, OpenPosition]) -> sqlalchemy.ColumnElement:
+        """
+        Filters candidates with at least `years` of experience.
+        """
+        if isinstance(years, OpenPosition):
+            years = years.min_years_of_experience
+
+        return Candidate.years_of_experience >= years
+
+    @decorators.view_filter()
+    def at_most_experience(self, years: typing.Union[int, MyData]) -> sqlalchemy.ColumnElement:
+        if isinstance(years, MyData):
+            years = years.years_of_experience
+
+        return Candidate.years_of_experience <= years
+
+    @decorators.view_filter()
+    def has_position(self, position: typing.Union[str, OpenPosition]) -> sqlalchemy.ColumnElement:
+        if isinstance(position, OpenPosition):
+            position = position.position
+
+        return Candidate.position == position
+
+    @decorators.view_filter()
+    def senior_data_scientist_position(self) -> sqlalchemy.ColumnElement:
+        """
+        Filters candidates that can be considered for a senior data scientist position.
+        """
+        return sqlalchemy.and_(
+            Candidate.position.in_(["Data Scientist", "Machine Learning Engineer", "Data Engineer"]),
+            Candidate.years_of_experience >= 3,
+        )
+
+    @decorators.view_filter()
+    def from_country(self, country: typing.Union[str, MyData]) -> sqlalchemy.ColumnElement:
+        """
+        Filters candidates from a specific country.
+        """
+        if isinstance(country, MyData):
+            return Candidate.country == country.country
+
+        return Candidate.country == country
+
+    @decorators.view_filter()
+    def graduated_from_university(self, university: typing.Union[str, MyData]) -> sqlalchemy.ColumnElement:
+        if isinstance(university, MyData):
+            university = university.university
+
+        return Candidate.university == university
+
+    @decorators.view_filter()
+    def has_skill(self, skill: str) -> sqlalchemy.ColumnElement:
+        return Candidate.skills.like(f"%{skill}%")
+
+    @decorators.view_filter()
+    def knows_data_analysis(self) -> sqlalchemy.ColumnElement:
+        return Candidate.tags.like("%Data Analysis%")
+
+    @decorators.view_filter()
+    def knows_python(self) -> sqlalchemy.ColumnElement:
+        return Candidate.skills.like("%Python%")
+
+    @decorators.view_filter()
+    def first_name_is(self, first_name: typing.Union[str, MyData]) -> sqlalchemy.ColumnElement:
+        if isinstance(first_name, MyData):
+            first_name = first_name.first_name
+
+        return Candidate.name.startswith(first_name)
+
+
+# Names of the OpenAI models the benchmark can be run with.
+OpenAILLMName: TypeAlias = typing.Literal['gpt-3.5-turbo', 'gpt-4-turbo', 'gpt-4o']
+
+
+def setup_collection(model_name: OpenAILLMName) -> dbally.Collection:
+    llm = LiteLLM(model_name=model_name)
+
+    collection = dbally.create_collection("recruitment", llm)
+    collection.add(CandidateView, lambda: CandidateView(engine))
+
+    return collection
+
+
+async def generate_iql_from_question(
+    collection: dbally.Collection,
+    model_name: OpenAILLMName,
+    question: str,
+    contexts: typing.Optional[typing.List[BaseCallerContext]]
+) -> typing.Tuple[str, OpenAILLMName, typing.Optional[str]]:
+
+    try:
+        result = await collection.ask(
+            question,
+            contexts=contexts,
+            dry_run=True
+        )
+    except Exception as e:
+        exc_pretty = traceback.format_exception_only(e.__class__, e)[0]
+        return question, model_name, f"FAILED: {exc_pretty}"
+
+    out = result.metadata.get("iql")
+    if out is None:
+        return question, model_name, None
+
+    return question, model_name, out.replace('"', '\'')
+
+
+@dataclass
+class BenchmarkConfig:
+    """
+    Settings of a single benchmark run, consumed by `main`.
+    """
+
+    # Path of the JSON file with the test cases to load.
+    dataset_path: str
+    # Path of the JSON file the results are written to.
+    out_path: str
+    # How many times each question is asked per model.
+    n_repeats: int = 5
+    # Names of the LLMs to benchmark; a fresh list is created for every instance.
+    llms: typing.List[OpenAILLMName] = field(default_factory=lambda: ['gpt-3.5-turbo', 'gpt-4-turbo', 'gpt-4o'])
+
+
+async def main(config: BenchmarkConfig):
+    test_set = None
+    with open(config.dataset_path, 'r') as file:
+        test_set = json.load(file)
+
+    contexts = [
+        MyData(
+            first_name="John",
+            surname="Smith",
+            years_of_experience=4,
+            position="Data Engineer",
+            university="University of Toronto",
+            skills=["Python"],
+            country="United Kingdom"
+        ),
+        OpenPosition(
+            position="Machine Learning Engineer",
+            graduated_from_university="Stanford Univeristy",
+            min_years_of_experience=1,
+            required_skills=["Python", "SQL"]
+        )
+    ]
+
+    tasks: typing.List[asyncio.Task] = []
+    for model_name in config.llms:
+        collection = setup_collection(model_name)
+        for test_case in test_set:
+            answers = []
+            for _ in range(config.n_repeats):
+                task = asyncio.create_task(generate_iql_from_question(collection, model_name,
+                                                                      test_case["question"], contexts=contexts))
+                tasks.append(task)
+
+    output_data = {
+        test_case["question"]:test_case
+        for test_case in test_set
+    }
+    empty_answers = {str(llm_name): [] for llm_name in config.llms}
+
+    total_iter = len(config.llms) * len(test_set) * config.n_repeats
+    for task in tqdm.asyncio.tqdm.as_completed(tasks, total=total_iter):
+        question, llm_name, answer = await task
+        if "answers" not in output_data[question]:
+            output_data[question]["answers"] = deepcopy(empty_answers)
+
+        output_data[question]["answers"][llm_name].append(answer)
+
+    output_data_list = list(output_data.values())
+
+    with open(config.out_path, 'w') as file:
+        file.write(json.dumps(test_set, indent=2))
+
+
+if __name__ == "__main__":
+    config = BenchmarkConfig(
+        dataset_path="dataset/context_dataset.json",
+        out_path="../../context_benchmark_output.json"
+    )
+
+    asyncio.run(main(config))
diff --git a/benchmark/dbally_benchmark/dataset/context_dataset.json b/benchmark/dbally_benchmark/dataset/context_dataset.json
@@ -0,0 +1,62 @@
+[
+  {
+    "question": "Find me French candidates suitable for my position with at least 1 year of experience.",
+    "correct_answer": "from_country('France') AND has_position(AskerContext()) AND at_least_experience(1)",
+    "context": true
+  },
+  {
+    "question": "Please find me candidates from my country who have at most 4 years of experience.",
+    "correct_answer": "from_country(AskerContext()) AND at_most_experience(4)",
+    "context": true
+  },
+  {
+    "question": "Find me candidates who graduated from Stanford University and work as Software Engineers.",
+    "correct_answer": "graduated_from_university('Stanford University') AND has_position('Software Engineer')",
+    "context": false
+  },
+  {
+    "question": "Find me candidates who graduated from my university",
+    "correct_answer": "graduated_from_university(AskerContext())",
+    "context": true
+  },
+  {
+    "question": "Could you find me candidates with at most as much experience as me who also know Python?",
+    "correct_answer": "at_most_experience(AskerContext()) AND knows_python()",
+    "context": true
+  },
+  {
+    "question": "Please find me candidates who know Data Analysis and Python",
+    "correct_answer": "knows_python() AND knows_data_analysis()",
+    "context": false
+  },
+  {
+    "question": "Find me candidates with at least minimal required experience for the currently open position.",
+    "correct_answer": "at_least_experience(AskerContext())",
+    "context": true
+  },
+  {
+    "question": "List candidates with between 2 and 6 years of experience.",
+    "correct_answer": "at_least_experience(2) AND at_most_experience(6)",
+    "context": false
+  },
+  {
+    "question": "Find me candidates who currently have the same position as we look for in our company?",
+    "correct_answer": "has_position(AskerContext())",
+    "context": true
+  },
+  {
+    "question": "Please find me senior data scientist candidates who know Data Analysis and come from my country",
+    "correct_answer": "senior_data_scientist_position() AND has_skill('Data Analysis') AND from_country(AskerContext())",
+    "context": true
+  },
+  {
+    "question": "Find me candidates that have the same first name as me",
+    "correct_answer": "first_name_is(AskerContext())",
+    "context": true
+  },
+  {
+    "question": "List candidates named Mohammed from India",
+    "correct_answer": "first_name_is('Mohammed') AND from_country('India')",
+    "context": false
+  }
+]