
Commit 2142380

version 2.2.0 ready to fix the database problem

1 parent a40fcb9 commit 2142380

6 files changed: +416 additions, -81 deletions


AGENTS.md

Lines changed: 32 additions & 42 deletions
````diff
@@ -1,57 +1,47 @@
 # Repository Guidelines
 
 ## Project Structure & Module Organization
-- `backend/`: FastAPI service and data-to-spec pipeline (`main.py`, `analyzer.py`, `spec_generator.py`, `llm.py`).
-- `frontend/`: React + TypeScript + Vite UI (`src/components`, `src/lib/catalog.ts`, `src/App.tsx`).
-- `sample_data/`: local datasets for manual validation (CSV/TSV/JSON/XLSX).
-- Root docs: `json-render-docs.md`, `quickstart.md`.
-
-Keep backend logic data-focused (analysis/spec generation) and frontend logic presentation-focused (renderer and components).
+- `backend/`: FastAPI service and analysis pipeline (`main.py`, `analyzer.py`, `llm.py`, `db.py`, `nl2sql.py`).
+- `frontend/`: React 19 + TypeScript app (Vite). UI components live in `frontend/src/components/`, JSON catalog/registry in `frontend/src/lib/` and `frontend/src/components/registry.tsx`.
+- `sample_data/`: local datasets for smoke testing uploads.
+- `resource/`: demo media assets.
+- `.github/workflows/pylint.yml`: Python lint CI job.
 
 ## Build, Test, and Development Commands
-- Backend setup/run:
-  ```powershell
-  cd backend
-  python -m venv .venv
-  .\.venv\Scripts\Activate.ps1
-  pip install -r requirements.txt
-  uvicorn main:app --reload --port 8000
-  ```
-- Frontend setup/run:
-  ```powershell
-  cd frontend
-  npm install
-  npm run dev
-  ```
+- Backend setup:
+  - `cd backend && pip install -r requirements.txt`
+  - `uvicorn main:app --reload --port 8000` (runs the API at `http://localhost:8000`)
+- Frontend setup:
+  - `cd frontend && npm install`
+  - `npm run dev` (local app at `http://localhost:5173`)
 - Frontend quality/build:
-  ```powershell
-  npm run lint     # ESLint on TS/TSX
-  npm run build    # Type-check + production build
-  npm run preview  # Preview built app
-  ```
+  - `npm run lint` (ESLint for `ts/tsx`)
+  - `npm run build` (TypeScript compile + Vite production build)
+  - `npm run preview` (serve the built app)
 
 ## Coding Style & Naming Conventions
-- Python: follow PEP 8, 4-space indentation, `snake_case` for functions/variables, small focused helpers.
-- TypeScript/React: 2-space indentation, `PascalCase` for components (`StatCard.tsx`), `camelCase` for functions/props.
-- Keep component schemas and registry aligned with `frontend/src/lib/catalog.ts` and `frontend/src/components/registry.tsx`.
-- Run `npm run lint` before opening a PR.
+- Python: 4-space indentation, `snake_case` for functions/variables, small focused functions.
+- TypeScript/React: `PascalCase` for components (`StatCard.tsx`), `camelCase` for helpers/hooks.
+- Keep component contracts aligned with the JSON render catalog (`BarChart`, `LineChart`, `PieChart`, etc.).
+- Run `npm run lint` before opening a PR; keep imports and unused variables clean.
 
 ## Testing Guidelines
-- No automated test suite is currently committed.
-- Minimum expectation: manual smoke test with both servers running and at least one file from `sample_data/` uploaded.
-- New tests are encouraged:
-  - Backend: `backend/tests/test_*.py` with `pytest`.
-  - Frontend: `*.test.tsx` near components or under `frontend/src/__tests__/`.
+- There is no committed unit-test suite yet for backend or frontend.
+- Minimum checks before a PR:
+  - `npm run lint`
+  - `npm run build`
+  - Manual smoke test: upload at least one file from `sample_data/` and verify dashboard rendering.
+- For backend logic changes, add targeted tests when introducing non-trivial parsing/query behavior.
 
 ## Commit & Pull Request Guidelines
-- Existing history uses short, release-style messages (example: `version 1.0.4 supportAllKindOfData`).
-- Prefer concise commits with clear scope, e.g. `frontend: improve chart legend layout`.
+- Current history is release-oriented (`version <semver> <note>`, e.g. `version 2.1.0 support personal database analyze`).
+- Prefer concise, imperative commit subjects; include a scope when useful (e.g. `backend: tighten SQL guard`).
 - PRs should include:
-  - What changed and why.
-  - Manual verification steps.
-  - Screenshots/GIFs for UI changes.
-  - Any config/env updates.
+  - What changed and why
+  - Manual verification steps
+  - UI screenshots/GIFs for frontend changes
+  - A linked issue/task reference
 
 ## Security & Configuration Tips
-- Configure secrets in `backend/.env` (`LLM_API_KEY`, `LLM_BASE_URL`, optional `LLM_MODEL`).
-- Never commit API keys or sensitive datasets.
+- Use `backend/.env` for `LLM_API_KEY`, `LLM_BASE_URL`, `LLM_MODEL`, and DB credentials (`DB_HOST`, `DB_PORT`, `DB_USER`, `DB_PASSWORD`, `DB_NAME`).
+- Never commit secrets. Keep generated metadata (`backend/db_meta.json`) out of version control.
````

backend/db.py

Lines changed: 42 additions & 6 deletions
```diff
@@ -1,12 +1,17 @@
 import os
 import json
+import re
 import pymysql
 import pandas as pd
 from dotenv import load_dotenv
 
 load_dotenv()
 
 META_PATH = os.path.join(os.path.dirname(__file__), "db_meta.json")
+FORBIDDEN_SQL_PATTERN = re.compile(
+    r"\b(INSERT|UPDATE|DELETE|DROP|ALTER|TRUNCATE|CREATE|REPLACE|GRANT|REVOKE)\b",
+    flags=re.IGNORECASE,
+)
 
 
 def get_connection():
@@ -43,7 +48,6 @@ def scan_schema():
         # sample values
         cur.execute(f"SELECT * FROM `{table}` LIMIT 3")
         sample_rows = cur.fetchall()
-        col_names = [c["name"] for c in columns]
         for i, c in enumerate(columns):
             c["sample"] = [row[i] for row in sample_rows if row[i] is not None]
 
@@ -75,14 +79,46 @@ def get_meta():
     return scan_schema()
 
 
-def execute_query(sql: str) -> pd.DataFrame:
-    stripped = sql.strip().rstrip(";").strip()
-    if not stripped.upper().startswith("SELECT"):
+def normalize_sql(sql: str) -> str:
+    stripped = (sql or "").strip().rstrip(";").strip()
+    # Remove leading SQL comments before validation.
+    cleaned = re.sub(r"^(--[^\n]*\n|/\*.*?\*/\s*)*", "", stripped, flags=re.DOTALL).strip()
+    return cleaned
+
+
+def validate_select_sql(sql: str, require_from: bool = False) -> str:
+    cleaned = normalize_sql(sql)
+    if not cleaned:
+        raise ValueError("SQL is empty")
+    if ";" in cleaned:
+        raise ValueError("Multiple SQL statements are not allowed")
+
+    first_word = cleaned.split()[0].upper() if cleaned.split() else ""
+    if first_word not in ("SELECT", "WITH"):
         raise ValueError("Only SELECT queries are allowed")
 
+    if FORBIDDEN_SQL_PATTERN.search(cleaned):
+        raise ValueError("Only read-only SELECT queries are allowed")
+
+    # Reject placeholder outputs often produced by LLMs.
+    if re.search(r"\bSELECT\s+statement\.?\b", cleaned, flags=re.IGNORECASE):
+        raise ValueError("LLM returned a placeholder SQL instead of a real query")
+
+    if require_from and first_word == "SELECT" and "FROM" not in cleaned.upper():
+        raise ValueError("Generated SQL must include a FROM clause")
+
+    return cleaned
+
+
+def execute_query(sql: str) -> pd.DataFrame:
+    cleaned = validate_select_sql(sql)
+
     conn = get_connection()
     try:
-        df = pd.read_sql(sql, conn)
-        return df.head(5000)
+        with conn.cursor() as cur:
+            cur.execute(cleaned)
+            rows = cur.fetchmany(5000)
+            columns = [desc[0] for desc in (cur.description or [])]
+            return pd.DataFrame(rows, columns=columns)
     finally:
         conn.close()
```
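
The new SQL guard can be exercised in isolation. Below is a minimal, dependency-free sketch: it re-declares the same keyword blocklist and a trimmed `validate_select_sql` (omitting the placeholder and `require_from` checks from the diff) so the pass/fail behavior can be seen without a database:

```python
import re

# Same keyword blocklist as the FORBIDDEN_SQL_PATTERN added in this commit.
FORBIDDEN = re.compile(
    r"\b(INSERT|UPDATE|DELETE|DROP|ALTER|TRUNCATE|CREATE|REPLACE|GRANT|REVOKE)\b",
    re.IGNORECASE,
)

def validate_select_sql(sql: str) -> str:
    # Trim whitespace, a trailing semicolon, and leading SQL comments.
    stripped = (sql or "").strip().rstrip(";").strip()
    cleaned = re.sub(r"^(--[^\n]*\n|/\*.*?\*/\s*)*", "", stripped, flags=re.DOTALL).strip()
    if not cleaned:
        raise ValueError("SQL is empty")
    if ";" in cleaned:
        raise ValueError("Multiple SQL statements are not allowed")
    first_word = cleaned.split()[0].upper()
    if first_word not in ("SELECT", "WITH"):
        raise ValueError("Only SELECT queries are allowed")
    if FORBIDDEN.search(cleaned):
        raise ValueError("Only read-only SELECT queries are allowed")
    return cleaned

# A leading comment and trailing semicolon are stripped, then the query passes.
print(validate_select_sql("-- top-n\nSELECT name FROM users LIMIT 5;"))
# A write statement is rejected before it ever reaches the database.
try:
    validate_select_sql("DROP TABLE users")
except ValueError as e:
    print("rejected:", e)
```

Note that a regex blocklist is deliberately conservative: a query whose text merely contains a forbidden keyword inside a string literal (e.g. `WHERE action = 'delete'`) is also rejected. That is a known tradeoff of this style of guard.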

backend/e2b_runner.py

Lines changed: 123 additions & 0 deletions
This commit adds `backend/e2b_runner.py` as a new file:

````python
import os
import json
import base64
import httpx
from openai import OpenAI
from dotenv import load_dotenv
from e2b_code_interpreter import Sandbox

load_dotenv()

client = OpenAI(
    api_key=os.getenv("LLM_API_KEY"),
    base_url=os.getenv("LLM_BASE_URL"),
    http_client=httpx.Client(trust_env=False),
)

MAX_STEPS = 6

AGENT_PROMPT = """You are a Deep Research Agent for data analysis. You work iteratively: plan, execute code, observe results, then decide next steps.

You have a dataset at '/tmp/data.csv'. You are inside a Python sandbox with pandas, matplotlib, scipy, sklearn available.

At each step, respond with a JSON object:
{
  "thought": "What I learned and what I want to do next",
  "code": "python code to execute (or empty string if done)",
  "done": false
}

When your analysis is complete, set done=true and include a final summary:
{
  "thought": "Final comprehensive summary of all findings",
  "code": "",
  "done": true
}

Rules:
1. Start by exploring the data (shape, columns, dtypes, basic stats).
2. Each step should build on previous results. Don't repeat work.
3. Save charts to '/tmp/chart_N.png' (increment N). Use plt.savefig() then plt.close().
4. Print results so you can observe them in the next step.
5. If code errors, fix it in the next step.
6. Use the SAME LANGUAGE as the user question for all text output.
7. Output ONLY valid JSON, no other text."""


def _parse_response(text: str) -> dict:
    text = text.strip()
    if "```" in text:
        text = text.split("```")[1]
        if text.startswith("json"):
            text = text[4:]
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        return {"thought": text, "code": "", "done": True}


def _run_code(sbx: Sandbox, code: str, chart_idx: int) -> dict:
    execution = sbx.run_code(code)
    stdout = "\n".join(execution.logs.stdout) if execution.logs.stdout else ""
    result_text = execution.text or ""
    output = (stdout + "\n" + result_text).strip()
    error = execution.error.value if execution.error else ""

    charts = []
    for i in range(chart_idx, chart_idx + 5):
        try:
            content = sbx.files.read(f"/tmp/chart_{i}.png", format="bytes")
            charts.append(base64.b64encode(content).decode())
        except Exception:
            break

    return {"output": output, "error": error, "charts": charts}


def deep_analyze(data_csv: str, question: str) -> dict:
    sbx = Sandbox.create()
    steps = []
    chart_idx = 0

    try:
        sbx.files.write("/tmp/data.csv", data_csv)
        messages = [
            {"role": "system", "content": AGENT_PROMPT},
            {"role": "user", "content": f"Dataset preview:\n{data_csv[:5000]}\n\nQuestion: {question}"},
        ]

        for step_num in range(MAX_STEPS):
            resp = client.chat.completions.create(
                model=os.getenv("LLM_MODEL", "claude-opus-4-6-thinking"),
                messages=messages,
                max_tokens=2048,
            )
            reply = resp.choices[0].message.content
            parsed = _parse_response(reply)

            step = {"step": step_num + 1, "thought": parsed.get("thought", "")}

            if parsed.get("done") or not parsed.get("code", "").strip():
                steps.append(step)
                break

            result = _run_code(sbx, parsed["code"], chart_idx)
            chart_idx += len(result["charts"])

            step["code"] = parsed["code"]
            step["output"] = result["output"]
            step["error"] = result["error"]
            step["charts"] = result["charts"]
            steps.append(step)

            # Feed results back to the LLM
            observation = f"Output:\n{result['output']}"
            if result["error"]:
                observation += f"\nError:\n{result['error']}"
            messages.append({"role": "assistant", "content": reply})
            messages.append({"role": "user", "content": observation})

    finally:
        sbx.kill()

    return {"steps": steps}
````
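
The `_parse_response` helper tolerates model replies wrapped in a markdown code fence, and falls back to treating any unparseable reply as a final free-text summary. A standalone sketch of the same idea (the function name here is illustrative, not part of the module):

```python
import json

def parse_agent_reply(text: str) -> dict:
    # Unwrap an optional markdown fence, then parse JSON.
    text = text.strip()
    if "```" in text:
        # Take the fenced body and drop an optional "json" language tag.
        text = text.split("```")[1]
        if text.startswith("json"):
            text = text[4:]
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Non-JSON replies are treated as a final free-text summary.
        return {"thought": text, "code": "", "done": True}

fenced = '```json\n{"thought": "explore first", "code": "print(df.shape)", "done": false}\n```'
print(parse_agent_reply(fenced)["code"])        # print(df.shape)
print(parse_agent_reply("plain text")["done"])  # True
```

The fallback matters for the agent loop: a malformed reply ends the run gracefully (as a final summary step) instead of crashing mid-analysis.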

backend/main.py

Lines changed: 55 additions & 5 deletions
```diff
@@ -1,11 +1,13 @@
-from fastapi import FastAPI, UploadFile, Query
+from fastapi import FastAPI, Query, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
+
 from analyzer import analyze
-from spec_generator import generate_spec
+from db import execute_query, scan_schema
+from e2b_runner import deep_analyze
 from llm import generate_spec_with_llm
-from db import scan_schema, execute_query
 from nl2sql import generate_recommendations, nl_to_sql
+from spec_generator import generate_spec
 
 app = FastAPI()
 app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
@@ -15,6 +17,15 @@ class QueryRequest(BaseModel):
     question: str
 
 
+class SqlRequest(BaseModel):
+    sql: str
+
+
+class DeepRequest(BaseModel):
+    data_csv: str
+    question: str
+
+
 @app.post("/analyze")
 async def analyze_file(file: UploadFile, mode: str = Query("ai")):
     data = await file.read()
@@ -44,8 +55,24 @@ async def db_recommend():
 
 @app.post("/db/query")
 async def db_query(req: QueryRequest):
-    sql = nl_to_sql(req.question)
-    df = execute_query(sql)
+    try:
+        sql = nl_to_sql(req.question)
+    except ValueError as e:
+        return {
+            "error": (
+                "Cannot convert this question to executable SQL. "
+                "Ask a more concrete data query.\n"
+                f"Reason: {e}"
+            )
+        }
+
+    print(f"[NL2SQL] question: {req.question}")
+    print(f"[NL2SQL] generated sql: {repr(sql)}")
+    try:
+        df = execute_query(sql)
+    except Exception as e:
+        return {"error": f"SQL execution failed: {e}"}
+
     data = df.to_csv(index=False).encode("utf-8")
     analysis = analyze(data, "query_result.csv")
     try:
@@ -54,3 +81,26 @@ async def db_query(req: QueryRequest):
     except Exception as e:
         print(f"LLM failed, falling back to template: {e}")
         spec = generate_spec(analysis)
     return {"spec": spec, "analysis": analysis, "sql": sql}
+
+
+@app.post("/db/query-sql")
+async def db_query_sql(req: SqlRequest):
+    try:
+        df = execute_query(req.sql)
+    except Exception as e:
+        return {"error": f"SQL execution failed: {e}"}
+
+    data = df.to_csv(index=False).encode("utf-8")
+    analysis = analyze(data, "query_result.csv")
+    try:
+        spec = generate_spec_with_llm(analysis)
+    except Exception as e:
+        print(f"LLM failed, falling back to template: {e}")
+        spec = generate_spec(analysis)
+    return {"spec": spec, "analysis": analysis, "sql": req.sql}
+
+
+@app.post("/analyze/deep")
+async def analyze_deep(req: DeepRequest):
+    result = deep_analyze(req.data_csv, req.question)
+    return result
```
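
The reworked `/db/query` handler converts failures into an `{"error": ...}` payload rather than letting exceptions surface as HTTP 500s. That two-stage control flow (translation error vs. execution error) can be sketched without FastAPI; `nl_to_sql` and `execute_query` are injected stand-ins here, not the real modules:

```python
from typing import Any, Callable

def run_nl_query(
    question: str,
    nl_to_sql: Callable[[str], str],
    execute_query: Callable[[str], Any],
) -> dict:
    # Stage 1: NL-to-SQL translation failures become a user-facing error payload.
    try:
        sql = nl_to_sql(question)
    except ValueError as e:
        return {"error": f"Cannot convert this question to executable SQL. Reason: {e}"}
    # Stage 2: execution failures are reported the same way instead of raising.
    try:
        result = execute_query(sql)
    except Exception as e:
        return {"error": f"SQL execution failed: {e}"}
    return {"result": result, "sql": sql}

ok = run_nl_query("top users", lambda q: "SELECT 1", lambda s: [(1,)])
print(ok["sql"])  # SELECT 1

def refuse(q: str) -> str:
    raise ValueError("question too vague")

print(run_nl_query("hmm", refuse, lambda s: [])["error"])
```

Returning the error as part of a 200 response is a design choice: the frontend can render it inline next to the query box instead of handling a generic server error.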
