diff --git a/backend/scripts/create_vector_store.py b/backend/scripts/create_vector_store.py index c90daf3..3d7d542 100644 --- a/backend/scripts/create_vector_store.py +++ b/backend/scripts/create_vector_store.py @@ -1,16 +1,8 @@ -import os from pathlib import Path from openai import OpenAI +from tenantfirstaid.shared import CONFIG - -if Path(".env").exists(): - from dotenv import load_dotenv - - load_dotenv(override=True) - -API_KEY = os.getenv("OPENAI_API_KEY", os.getenv("GITHUB_API_KEY")) - -client = OpenAI(api_key=API_KEY) +client = OpenAI(api_key=CONFIG.openai_api_key or CONFIG.github_api_key) # Note: we exit if the vector store already exists because # OpenAI does not return the filenames of files in a vector store, @@ -38,11 +30,11 @@ vector_store = client.vector_stores.create(name="Oregon Housing Law") # Get all the files in ./documents - documents_path = Path("./scripts/documents") + documents_path = Path(__file__).parent / "documents" file_paths = [ f - for f in os.listdir(documents_path) - if os.path.isfile(os.path.join(documents_path, f)) + for f in documents_path.iterdir() + if f.is_file() and f.suffix.lower() in [".txt"] ] if not file_paths: @@ -50,9 +42,7 @@ exit(1) print("Uploading files to vector store...") - file_streams = [ - open(os.path.join(documents_path, path), "rb") for path in file_paths - ] + file_streams = [path.open("rb") for path in file_paths] # Add the files to the vector store file_batch = client.vector_stores.file_batches.upload_and_poll( vector_store_id=vector_store.id, files=file_streams diff --git a/backend/scripts/generate_conversation/chat.py b/backend/scripts/generate_conversation/chat.py old mode 100644 new mode 100755 index 789606b..cd0b7e3 --- a/backend/scripts/generate_conversation/chat.py +++ b/backend/scripts/generate_conversation/chat.py @@ -1,6 +1,8 @@ +#!/usr/bin/env -S uv run --script # /// script -# requires-python = "~=3.11" +# requires-python = "~=3.12" # dependencies = [ +# "dotenv", # "openai", # 
"pandas", # ] @@ -12,6 +14,7 @@ from pathlib import Path import pandas as pd from typing import Self +# import shared BOT_INSTRUCTIONS = """Pretend you're a legal expert who giving advice about eviction notices in Oregon. diff --git a/backend/scripts/simple_eval.py b/backend/scripts/simple_eval.py index 6911e19..f3b82b9 100644 --- a/backend/scripts/simple_eval.py +++ b/backend/scripts/simple_eval.py @@ -4,24 +4,19 @@ import os from openai import OpenAI -from tenantfirstaid.shared import DEFAULT_INSTRUCTIONS - -API_KEY = os.getenv("OPENAI_API_KEY", os.getenv("GITHUB_API_KEY")) -BASE_URL = os.getenv("MODEL_ENDPOINT", "https://api.openai.com/v1") -MODEL = os.getenv("MODEL_NAME", "o3") -MODEL_REASONING_EFFORT = os.getenv("MODEL_REASONING_EFFORT", "medium") +from tenantfirstaid.shared import CONFIG, DEFAULT_INSTRUCTIONS client = OpenAI( - api_key=API_KEY, - base_url=BASE_URL, + api_key=CONFIG.openai_api_key or CONFIG.github_api_key, + base_url=CONFIG.model_endpoint, ) - -VECTOR_STORE_ID = os.getenv("VECTOR_STORE_ID") openai_tools = [] -if VECTOR_STORE_ID: - openai_tools.append({"type": "file_search", "vector_store_ids": [VECTOR_STORE_ID]}) +if CONFIG.vector_store_id is not None: + openai_tools.append( + {"type": "file_search", "vector_store_ids": [CONFIG.vector_store_id]} + ) # 1. Load the dataset - updated to use path relative to this script script_dir = os.path.dirname(os.path.abspath(__file__)) @@ -47,10 +42,10 @@ # Use the Responses API with streaming response = client.responses.create( - model=MODEL, + model=CONFIG.model_name, input=input_messages, instructions=DEFAULT_INSTRUCTIONS, - reasoning={"effort": MODEL_REASONING_EFFORT}, + reasoning={"effort": CONFIG.model_reasoning_effort}, tools=openai_tools, ) @@ -115,7 +110,7 @@ # 4. 
Print summary print("\n===== EVALUATION SUMMARY =====") -print(f"Model evaluated: {MODEL}") +print(f"Model evaluated: {CONFIG.model_name}") print(f"Number of samples: {len(samples)}") print(f"Average score: {average_score:.2f}/10") print(f"Average response time: {average_time:.2f} seconds") @@ -129,8 +124,8 @@ with open(results_path, "w") as f: json.dump( { - "model": MODEL, - "reasoning_effort": MODEL_REASONING_EFFORT, + "model": CONFIG.model_name, + "reasoning_effort": CONFIG.model_reasoning_effort, "average_score": average_score, "samples": results, }, diff --git a/backend/tenantfirstaid/app.py b/backend/tenantfirstaid/app.py index a6c1e24..004ec54 100644 --- a/backend/tenantfirstaid/app.py +++ b/backend/tenantfirstaid/app.py @@ -1,14 +1,7 @@ -from pathlib import Path -from flask import Flask, jsonify, session +from flask import Flask, jsonify, session as flask_session import os import secrets - -if Path(".env").exists(): - from dotenv import load_dotenv - - load_dotenv(override=True) - from .chat import ChatView from .session import TenantSession @@ -27,7 +20,7 @@ @app.get("/api/history") def history(): - session_id = session.get("session_id") + session_id = flask_session.get("session_id") if not session_id: return jsonify([]) return jsonify(tenant_session.get(session_id)) @@ -35,7 +28,7 @@ def history(): @app.post("/api/clear-session") def clear_session(): - session.clear() + flask_session.clear() return jsonify({"success": True}) diff --git a/backend/tenantfirstaid/chat.py b/backend/tenantfirstaid/chat.py index e89412f..414ba68 100644 --- a/backend/tenantfirstaid/chat.py +++ b/backend/tenantfirstaid/chat.py @@ -7,28 +7,21 @@ from flask.views import View import os -from .shared import DEFAULT_INSTRUCTIONS, DATA_DIR - -DATA_FILE = DATA_DIR / "chatlog.jsonl" - -API_KEY = os.getenv("OPENAI_API_KEY", os.getenv("GITHUB_API_KEY")) -BASE_URL = os.getenv("MODEL_ENDPOINT", "https://api.openai.com/v1") -MODEL = os.getenv("MODEL_NAME", "o3") -MODEL_REASONING_EFFORT = 
os.getenv("MODEL_REASONING_EFFORT", "medium") +from .shared import DEFAULT_INSTRUCTIONS, DATA_DIR, CONFIG class ChatView(View): DATA_FILE = DATA_DIR / "chatlog.jsonl" client = OpenAI( - api_key=API_KEY, - base_url=BASE_URL, + api_key=CONFIG.openai_api_key or CONFIG.github_api_key, + base_url=CONFIG.model_endpoint, ) def __init__(self, session): self.session = session - VECTOR_STORE_ID = os.getenv("VECTOR_STORE_ID") + VECTOR_STORE_ID = CONFIG.vector_store_id NUM_FILE_SEARCH_RESULTS = os.getenv("NUM_FILE_SEARCH_RESULTS", 10) self.openai_tools = [] @@ -79,10 +72,10 @@ def generate(): try: # Use the new Responses API with streaming response_stream = self.client.responses.create( - model=MODEL, + model=CONFIG.model_name, input=input_messages, instructions=DEFAULT_INSTRUCTIONS, - reasoning={"effort": MODEL_REASONING_EFFORT}, + reasoning={"effort": CONFIG.model_reasoning_effort}, stream=True, tools=self.openai_tools, ) diff --git a/backend/tenantfirstaid/session.py b/backend/tenantfirstaid/session.py index 3f0a262..e2b33f2 --- a/backend/tenantfirstaid/session.py +++ b/backend/tenantfirstaid/session.py @@ -1,25 +1,23 @@ -import os from valkey import Valkey import simplejson as json +from .shared import CONFIG +from ipaddress import IPv4Address class TenantSession: def __init__(self): + _valkey_args = { + "host": CONFIG.db_host, + "port": CONFIG.db_port, + "password": CONFIG.db_password, + "ssl": CONFIG.db_use_ssl, + } + print( - "Connecting to Valkey:", - { - "host": os.getenv("DB_HOST"), - "port": os.getenv("DB_PORT"), - "ssl": os.getenv("DB_USE_SSL"), - }, + f"Connecting to Valkey: {_valkey_args}", ) try: - self.db_con = Valkey( - host=os.getenv("DB_HOST", "127.0.0.1"), - port=os.getenv("DB_PORT", 6379), - password=os.getenv("DB_PASSWORD"), - ssl=False if os.getenv("DB_USE_SSL") == "false" else True, - ) + self.db_con = Valkey(**_valkey_args) self.db_con.ping() except Exception as e: diff --git a/backend/tenantfirstaid/shared.py 
b/backend/tenantfirstaid/shared.py index de2594f..35c4c4d 100644 --- a/backend/tenantfirstaid/shared.py +++ b/backend/tenantfirstaid/shared.py @@ -1,16 +1,77 @@ from collections import defaultdict import os from pathlib import Path +from warnings import warn +from dataclasses import dataclass, field +from typing import Optional CACHE = defaultdict(list) -# Create a dedicated directory for persistent data in root's home directory -if Path(".env").exists(): + +# configuration and secrets are layered in a dataclass. From lowest to highest priority: +# 1. Dataclass defaults +# 2. Environment variables -- typically used by the Docker container +# 3. .env file in the backend directory (if it exists) -- typically used in local development +# TODO: generate/update .env.example from this dataclass +@dataclass(frozen=True) +class Config: + """Configuration for the Oregon Tenant First Aid application.""" + + model_name: str = field(default="o3") + model_reasoning_effort: str = field(default="medium") + vector_store_id: Optional[str] = field(default=None) + feedback_password: Optional[str] = field(default=None) + github_api_key: Optional[str] = field(default=None) + openai_api_key: Optional[str] = field(default=None) + model_endpoint: str = field(default="https://api.openai.com/v1") + use_short_prompts: bool = field(default=True) + db_host: str = field(default="127.0.0.1") + db_port: int = field(default=6379) + db_use_ssl: bool = field(default=True) + db_username: Optional[str] = field(default=None) + db_password: Optional[str] = field(default=None) + + def __post_init__(self): + """Post-initialization to ensure one of the API KEYs is not None.""" + if self.github_api_key is None and self.openai_api_key is None: + raise ValueError( + "Either GITHUB_API_KEY or OPENAI_API_KEY must be set in the environment variables." 
+ ) + + +# For development purposes, we expect the .env file to be in the backend directory +__shared_py_path = Path(__file__).resolve() +__backend_path = __shared_py_path.parent.parent +__dotenv_path = __backend_path / ".env" + +if Path(__dotenv_path).exists(): from dotenv import load_dotenv - load_dotenv(override=True) + print(f"Loading environment variables from {__dotenv_path}") + load_dotenv(dotenv_path=__dotenv_path, override=True) +else: + warn( + f"No .env file found at {__dotenv_path.parent}. Using environment variables from the system." + ) + +# Load environment variables into the Config dataclass +CONFIG = Config( + **{ + field.lower(): val + for field, val in os.environ.items() + if field.lower() in Config.__dataclass_fields__ + } +) +# Create a dedicated directory for persistent data relative to the backend +# directory with a fallback to `/root/tenantfirstaid_data` DATA_DIR = Path(os.getenv("PERSISTENT_STORAGE_DIR", "/root/tenantfirstaid_data")) +if not DATA_DIR.is_absolute(): + new_data_dir = (__backend_path / DATA_DIR).resolve() + warn( + f"DATA_DIR {DATA_DIR} is not an absolute path. It will be relative to the backend directory ({new_data_dir})." + ) + DATA_DIR = new_data_dir DATA_DIR.mkdir(exist_ok=True)