Neel #17

3 changes: 3 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,3 @@
{
"cmake.sourceDirectory": "/Users/neelb/Documents/TokenSmith/llama.cpp"
}
82 changes: 82 additions & 0 deletions generator.py
@@ -0,0 +1,82 @@
import os, subprocess, textwrap

LLAMA_CPP_BINARY = os.getenv("LLAMA_CPP_BIN", "/Users/aj/git/llama.cpp/build/bin/llama-cli")

ANSWER_START = "<<<ANSWER>>>"
ANSWER_END = "<<<END>>>"

def format_prompt(chunks, query, max_chunk_chars=400):
    # smaller chunks = less repetition fuel
    trimmed = [(c or "")[:max_chunk_chars] for c in chunks]
    context = "\n\n".join(trimmed)
    # ChatML-style prompt (the format Qwen instruct models expect); the model is
    # told to finish with the END sentinel so generation can be stopped cleanly.
    return textwrap.dedent(f"""\
<|im_start|>system
You are a concise tutor. Use the textbook excerpts to answer in 2–3 sentences.
If the excerpts are insufficient, say so briefly.
End your reply with {ANSWER_END}.
<|im_end|>
<|im_start|>user
Textbook Excerpts:
{context}

Question: {query}
<|im_end|>
<|im_start|>assistant
{ANSWER_START}
""")

def _extract_answer(raw: str) -> str:
    # take everything after the last START marker, then cut at END
    text = raw.split(ANSWER_START)[-1]
    return text.split(ANSWER_END)[0].strip()

def run_llama_cpp(prompt: str, model_path: str, max_tokens: int = 300,
                  threads: int = 8, n_gpu_layers: int = 32, temperature: float = 0.3):
    cmd = [
        LLAMA_CPP_BINARY,
        "-m", model_path,
        "-p", prompt,
        "-n", str(max_tokens),
        "-t", str(threads),
        "--n-gpu-layers", str(n_gpu_layers),
        "--temp", str(temperature),
        "--top-k", "40",
        "--top-p", "0.9",
        "--min-p", "0.05",
        "--typical", "1.0",
        "--repeat-penalty", "1.15",
        "--repeat-last-n", "256",
        "--mirostat", "2",  # stabilizes length/quality
        "--mirostat-ent", "5",
        "--mirostat-lr", "0.1",
        "--no-mmap",
        "-no-cnv",
        "-r", ANSWER_END,  # hard stop at <<<END>>>
    ]
    # capture BOTH streams (some builds stream tokens on stderr)
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        env={**os.environ, "GGML_LOG_LEVEL": "ERROR", "LLAMA_LOG_LEVEL": "ERROR"},
    )
    out, _ = proc.communicate()
    return _extract_answer(out or "")

def _dedupe_sentences(text: str) -> str:
    # simple consecutive-sentence de-dupe
    import re
    sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    cleaned = []
    for s in sents:
        if not cleaned or s.lower() != cleaned[-1].lower():
            cleaned.append(s)
    return " ".join(cleaned)

def answer(query: str, chunks, model_path: str, max_tokens: int = 300, **kw):
    prompt = format_prompt(chunks, query)
    approx_tokens = max(1, len(prompt) // 4)  # rough heuristic: ~4 characters per token
    print(f"\n⚙️ Prompt length ≈ {approx_tokens} tokens\n")
    raw = run_llama_cpp(prompt, model_path, max_tokens=max_tokens, **kw)
    return _dedupe_sentences(raw)
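For reviewers, a minimal usage sketch of the new module, assuming the repo root is on PYTHONPATH; the chunk strings and the 0.5B model path are illustrative placeholders, not files shipped in this PR:

from generator import answer

chunks = [
    "A B-tree index keeps keys sorted and balanced, giving O(log n) lookups.",
    "Secondary indexes speed up reads at the cost of extra work on writes.",
]
# placeholder model path; point this at any local GGUF instruct model
reply = answer("What is a B-tree index?", chunks,
               model_path="models/qwen2.5-0.5b-instruct-q5_k_m.gguf")
print(reply)
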
1 change: 1 addition & 0 deletions llama.cpp
Submodule llama.cpp added at fbef0f
29 changes: 29 additions & 0 deletions src/generator.py
@@ -151,3 +151,32 @@ def answer(query: str, chunks, model_path: str, max_tokens: int = 300, **kw):
    print(f"\n⚙️ Prompt length ≈ {approx_tokens} tokens\n")
    raw = run_llama_cpp(prompt, model_path, max_tokens=max_tokens, **kw)
    return _dedupe_sentences(raw)

def extract_answer_number(text: str) -> int | None:
    print(f"Raw model output for difficulty rating: {text!r}")
    m = re.search(r'Answer:\s*([1-5])', text)
    if m:
        return int(m.group(1))
    return None

def is_question_hard_with_model(query: str, model_path: str, **kw) -> bool:
    prompt = (
        """You are an expert in databases.
Rate the following question’s difficulty on a scale from 1 (easy) to 5 (hard), where:

1 = very easy (straightforward definition, fact recall, or yes/no question)
2 = easy (basic concept explanation or simple example)
3 = medium (requires some reasoning or combining multiple concepts)
4 = hard (multi-step reasoning, trade-offs, or applied problem-solving)
5 = very hard (open-ended design, analysis under constraints, or advanced research-level reasoning)"""
        "\n\nRespond in the format Answer: <rating>\n\n"
        f"Question: {query}\n"
        "Answer:"
    )
    rating = extract_answer_number(
        run_llama_cpp(prompt, model_path, max_tokens=3, temperature=0.1, **kw)
    )
    print(f"Parsed difficulty rating: {rating!r}")

    if rating is None:
        return False
    return rating >= 4
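A quick sanity check of the rating parser and the ≥4 threshold; the strings below are hypothetical model outputs, not captured from a real run:

from src.generator import extract_answer_number

assert extract_answer_number("Answer: 4") == 4
assert extract_answer_number("Preamble text\nAnswer: 2, since it is simple recall") == 2
assert extract_answer_number("no rating emitted") is None
# Ratings of 4–5 route the query to the larger model; anything else, or a
# parse failure, falls back to the default model.
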
11 changes: 10 additions & 1 deletion src/main.py
@@ -8,7 +8,7 @@
from src.ranking.rankers import FaissSimilarityRanker, BM25Ranker, TfIDFRanker
from src.retriever import get_candidates, apply_seg_filter
from src.ranker import rerank
from src.generator import answer
from src.generator import answer, is_question_hard_with_model

def parse_args():
    p = argparse.ArgumentParser()
@@ -57,6 +57,15 @@ def main():
        q = input("\nAsk > ").strip()
        if q.lower() in {"exit","quit"}:
            break

        if is_question_hard_with_model(q, args.model_path):
            model_path = "models/qwen2.5-1.5b-instruct-q5_k_m.gguf"
            print("Question is hard. Using 1.5b model.")
        else:
            model_path = args.model_path
            print("Question is easy. Using 0.5b model.")

        logger.log_query_start(q)
        cfg = planner.plan(q)
        index, chunks, sources, vectorizer, chunk_tags = \
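To exercise the routing decision in isolation, a hedged sketch that stubs out the llama.cpp call; the monkeypatched output and the query text are hypothetical:

from src import generator

# pretend the judge model always emits a top rating
generator.run_llama_cpp = lambda prompt, model_path, **kw: "Answer: 5"
assert generator.is_question_hard_with_model(
    "Design a sharding scheme for a write-heavy workload", "unused.gguf"
) is True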