
Commit caea0b1

marginal progress toward talking to LangSmith
1 parent 0de1540 commit caea0b1

File tree

6 files changed: +1032 -97 lines changed


backend/pyproject.toml

Lines changed: 6 additions & 3 deletions

@@ -19,11 +19,13 @@ dependencies = [
     "python-dotenv",
     "pandas>=2.3.0",
     "vertexai>=1.43.0",
-    "langchain>=1.1.0",
-    "langchain-google-vertexai>=3.1.0",
-    "langsmith>=0.4.47",
+    "langchain>=1.1.0,<2.0.0",
+    "langchain-google-vertexai>=3.1.0,<4.0.0",
+    "langsmith>=0.4.47,<0.5.0",
     "langchain-core>=1.1.0",
     "openevals>=0.1.2",
+    "langchain-google-community>=3.0.1",
+    "polars>=1.35.2",
 ]

 [tool.setuptools.packages.find]
@@ -49,6 +51,7 @@ dev = [
     "types-Flask>=1.1.6",
     "types-simplejson>=3.20.0.20250326",
     "httpx>=0.27.0",
+    "google-cloud-discoveryengine>=0.15.0",
 ]

 gen_convo = [

backend/scripts/create_langsmith_dataset.py

Lines changed: 93 additions & 16 deletions

@@ -4,39 +4,83 @@
 for automated evaluation.
 """

+import argparse
 import ast
+import os
 from pathlib import Path
+from typing import List, Dict

-import pandas as pd
+import polars as pd
 from langsmith import Client

+if Path("../.env").exists():
+    from dotenv import load_dotenv

-def create_langsmith_dataset():
+    load_dotenv(override=True)
+
+
+def create_langsmith_dataset(
+    input_csv: Path, limit_examples: int, dataset_name: str, overwrite_dataset=False
+):
     """Upload test scenarios to LangSmith for automated evaluation."""
-    client = Client()
+    client = Client(api_key=os.getenv("LANGSMITH_API_KEY"))

-    # Read existing test scenarios.
-    csv_path = (
-        Path(__file__).parent
-        / "generate_conversation"
-        / "tenant_questions_facts_full.csv"
-    )
-    df = pd.read_csv(csv_path, encoding="cp1252")
+    # print(client.info)
+
+    dataset_exists = client.has_dataset(dataset_name=dataset_name)
+    if dataset_exists:
+        if overwrite_dataset:
+            print(f"-INFO- Dataset '{dataset_name}' already exists. Deleting for overwrite.")
+            client.delete_dataset(dataset_name=dataset_name)
+        else:
+            raise RuntimeError(
+                f"-ERROR- Dataset '{dataset_name}' already exists. Aborting to avoid duplicates."
+            )

     # Create dataset in LangSmith.
     dataset = client.create_dataset(
-        dataset_name="tenant-legal-qa-scenarios",
+        dataset_name=dataset_name,
         description="Test scenarios for Oregon tenant legal advice chatbot",
     )

+    # Read existing test scenarios.
+    csv_path = input_csv
+
+    # Try UTF-8 first, fallback to cp1252 if needed
+    try:
+        df = pd.read_csv(csv_path, encoding="utf-8", n_rows=limit_examples)
+    except UnicodeDecodeError:
+        df = pd.read_csv(csv_path, encoding="cp1252", n_rows=limit_examples)
+
+    # replace all empty "city" values with "null" string
+    df["city"].fill_null("null")
+
     # Convert each row to LangSmith example.
-    for idx, row in df.iterrows():
+    for idx, row in enumerate(df.rows(named=True)):
+
         facts = (
             ast.literal_eval(row["facts"])
             if isinstance(row["facts"], str)
             else row["facts"]
         )
-        city = row["city"] if not pd.isna(row["city"]) else "null"
+        city = row["city"]  # if not pd.is_null(row["city"]) else "null"
+
+        reference_conversation: List[Dict[str, str]] = []
+        if row.get("Original conversation") is not None:
+            for line in row.get("Original conversation").splitlines():
+                if line.startswith("You:"):
+                    reference_conversation.append(
+                        {"role": "user", "content": line.replace("You:", "").strip()}
+                    )
+                elif line.startswith("Bot:"):
+                    reference_conversation.append(
+                        {"role": "assistant", "content": line.replace("Bot:", "").strip()}
+                    )
+                else:
+                    if line.strip() == "":
+                        continue
+                    reference_conversation[-1]["content"] += "\n" + line.strip()
+

         # Each example has inputs and expected metadata.
         client.create_example(
@@ -46,21 +90,54 @@ def create_langsmith_dataset():
                 "city": city,
                 "state": row["state"],
                 "facts": facts,
+                # "message": reference_conversation
             },
             metadata={
                 "scenario_id": idx,
                 "city": city,
                 "state": row["state"],
                 # Tag scenarios for filtering.
-                "tags": ["tenant-rights", f"city-{city}", f"state-{row['state']}"],
+                "tags": [f"city-{city}", f"state-{row['state']}"],
             },
             # Optionally include reference conversation for comparison.
-            outputs={"reference_conversation": row.get("Original conversation", None)},
+            outputs={"reference_conversation": reference_conversation},
         )

     print(f"Created dataset '{dataset.name}' with {len(df)} scenarios")
     return dataset


 if __name__ == "__main__":
-    create_langsmith_dataset()
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument(
+        "--input-csv",
+        type=Path,
+        default=Path(__file__).parent
+        / "generate_conversation/tenant_questions_facts_full.csv",
+        help="Path to input CSV file",
+    )
+    parser.add_argument(
+        "--limit-examples",
+        type=int,
+        default=None,
+        help="Limit number of examples to upload",
+    )
+    parser.add_argument(
+        "--dataset-name",
+        type=str,
+        default="tenant-legal-qa-scenarios",
+        help="LangSmith dataset name",
+    )
+    parser.add_argument(
+        "--overwrite", action="store_true", help="Overwrite existing dataset"
+    )
+    args = parser.parse_args()
+
+    create_langsmith_dataset(
+        input_csv=args.input_csv,
+        limit_examples=args.limit_examples,
+        dataset_name=args.dataset_name,
+        overwrite_dataset=args.overwrite,
+    )
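
For context on the new transcript handling: the "You:"/"Bot:" parsing turns each CSV "Original conversation" cell into a list of role/content messages, folding continuation lines into the previous turn. A minimal sketch of the expected shape, using a made-up transcript (the transcript text and the `messages` name are illustrative, not from this commit):

    # Hypothetical transcript in the CSV's "Original conversation" format.
    raw = "You: My landlord kept my deposit.\nCan they do that?\nBot: Usually not without an itemized statement."

    messages = []
    for line in raw.splitlines():
        if line.startswith("You:"):
            messages.append({"role": "user", "content": line.replace("You:", "").strip()})
        elif line.startswith("Bot:"):
            messages.append({"role": "assistant", "content": line.replace("Bot:", "").strip()})
        elif line.strip():
            # Continuation lines are appended to the previous turn, as in the script.
            messages[-1]["content"] += "\n" + line.strip()

    # messages ==
    # [{"role": "user", "content": "My landlord kept my deposit.\nCan they do that?"},
    #  {"role": "assistant", "content": "Usually not without an itemized statement."}]

Per the argparse defaults above, the script itself would presumably be run along the lines of `python backend/scripts/create_langsmith_dataset.py --limit-examples 5 --overwrite`.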

backend/scripts/langsmith_evaluators.py

Lines changed: 19 additions & 11 deletions

@@ -5,9 +5,11 @@
 """

 import re
+from typing import List, Dict

-from langsmith.evaluation import Evaluator
+# from langsmith import SimpleEvaluator as Evaluator
 from openevals import create_llm_as_judge
+from openevals.types import SimpleEvaluator, EvaluatorResult
 from openevals.prompts import CORRECTNESS_PROMPT

 EVALUATOR_MODEL_NAME = "gemini-2.5-pro"
@@ -52,7 +54,7 @@
 </Reminders>
 """

-citation_accuracy_evaluator: Evaluator = create_llm_as_judge(
+citation_accuracy_evaluator: SimpleEvaluator = create_llm_as_judge(
     model=EVALUATOR_MODEL_NAME,
     prompt=CITATION_PROMPT,
 )
@@ -97,17 +99,23 @@
 </Reminders>
 """

-legal_correctness_evaluator: Evaluator = create_llm_as_judge(
+legal_correctness_evaluator: SimpleEvaluator = create_llm_as_judge(
     model=EVALUATOR_MODEL_NAME,
     prompt=LEGAL_CORRECTNESS_PROMPT,
 )


 # Evaluator 3: Response Completeness (LLM-as-Judge).
-completeness_evaluator: Evaluator = create_llm_as_judge(
-    model=EVALUATOR_MODEL_NAME,
-    prompt=CORRECTNESS_PROMPT,
-)
+def completeness_evaluator(
+    inputs: dict, outputs: dict, reference_outputs: List[Dict[str, str]]
+) -> EvaluatorResult | List[EvaluatorResult]:
+    tmp = create_llm_as_judge(
+        model=EVALUATOR_MODEL_NAME,
+        prompt=CORRECTNESS_PROMPT,
+        feedback_key="completeness",
+    )
+    return tmp(inputs=inputs, outputs=outputs, reference_outputs=reference_outputs)
+

 # Evaluator 4: Tone & Professionalism (LLM-as-Judge).
 TONE_PROMPT = """
@@ -141,14 +149,14 @@
 </Reminders>
 """

-tone_evaluator: Evaluator = create_llm_as_judge(
+tone_evaluator: SimpleEvaluator = create_llm_as_judge(
     model=EVALUATOR_MODEL_NAME,
     prompt=TONE_PROMPT,
 )


 # Evaluator 5: Citation Format (Heuristic).
-def citation_format_evaluator(run, example) -> Evaluator:
+def citation_format_evaluator(run, example) -> SimpleEvaluator:
     """Check if citations use proper HTML anchor tag format.

     Args:
@@ -188,7 +196,7 @@ def citation_format_evaluator(run, example) -> Evaluator:


 # Evaluator 6: Tool Usage (Heuristic).
-def tool_usage_evaluator(run, example) -> Evaluator:
+def tool_usage_evaluator(run, example) -> SimpleEvaluator:
     """Check if agent used RAG tools appropriately.

     Args:
@@ -219,7 +227,7 @@ def tool_usage_evaluator(run, example) -> Evaluator:


 # Evaluator 7: Performance Metrics (Heuristic).
-def performance_evaluator(run, example) -> Evaluator:
+def performance_evaluator(run, example) -> SimpleEvaluator:
     """Track latency and token usage.

     Args:
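
These evaluators are presumably meant to be plugged into a LangSmith experiment against the dataset uploaded by the script above. A minimal sketch of how that wiring might look, assuming LangSmith's Client.evaluate entry point; the target function, import path, and experiment prefix here are illustrative, not part of this commit:

    from langsmith import Client

    from langsmith_evaluators import (
        citation_format_evaluator,
        completeness_evaluator,
        tool_usage_evaluator,
    )

    client = Client()

    def answer_question(inputs: dict) -> dict:
        # Placeholder: call the tenant-rights chatbot under test here.
        return {"answer": "..."}

    results = client.evaluate(
        answer_question,
        data="tenant-legal-qa-scenarios",
        evaluators=[completeness_evaluator, citation_format_evaluator, tool_usage_evaluator],
        experiment_prefix="tenant-legal-qa",
    )

The heuristic evaluators take (run, example) while the LLM-as-judge ones take inputs/outputs/reference_outputs; LangSmith generally dispatches on evaluator signatures, so mixing both styles in one evaluators list should work.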
