Neel #17

3 changes: 3 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,3 @@
{
"cmake.sourceDirectory": "/Users/neelb/Documents/TokenSmith/llama.cpp"
}
82 changes: 82 additions & 0 deletions generator.py
@@ -0,0 +1,82 @@
import os, subprocess, textwrap

LLAMA_CPP_BINARY = os.getenv("LLAMA_CPP_BIN", "/Users/aj/git/llama.cpp/build/bin/llama-cli")

ANSWER_START = "<<<ANSWER>>>"
ANSWER_END = "<<<END>>>"

def format_prompt(chunks, query, max_chunk_chars=400):
    # smaller chunks = less repetition fuel
    trimmed = [(c or "")[:max_chunk_chars] for c in chunks]
    context = "\n\n".join(trimmed)
    # ChatML-style prompt (the format Qwen instruct models expect); the model is
    # told to finish with the END sentinel so generation can be stopped cleanly.
    return textwrap.dedent(f"""\
<|im_start|>system
You are a concise tutor. Use the textbook excerpts to answer in 2–3 sentences.
If the excerpts are insufficient, say so briefly.
End your reply with {ANSWER_END}.
<|im_end|>
<|im_start|>user
Textbook Excerpts:
{context}

Question: {query}
<|im_end|>
<|im_start|>assistant
{ANSWER_START}
""")

def _extract_answer(raw: str) -> str:
    # take everything after the last START marker, then cut at END
    text = raw.split(ANSWER_START)[-1]
    return text.split(ANSWER_END)[0].strip()

def run_llama_cpp(prompt: str, model_path: str, max_tokens: int = 300,
                  threads: int = 8, n_gpu_layers: int = 32, temperature: float = 0.3):
    cmd = [
        LLAMA_CPP_BINARY,
        "-m", model_path,
        "-p", prompt,
        "-n", str(max_tokens),
        "-t", str(threads),
        "--n-gpu-layers", str(n_gpu_layers),
        "--temp", str(temperature),
        "--top-k", "40",
        "--top-p", "0.9",
        "--min-p", "0.05",
        "--typical", "1.0",
        "--repeat-penalty", "1.15",
        "--repeat-last-n", "256",
        "--mirostat", "2",  # stabilizes length/quality
        "--mirostat-ent", "5",
        "--mirostat-lr", "0.1",
        "--no-mmap",
        "-no-cnv",
        "-r", ANSWER_END,  # hard stop at <<<END>>>
    ]
    # capture BOTH streams (some builds stream tokens on stderr)
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        env={**os.environ, "GGML_LOG_LEVEL": "ERROR", "LLAMA_LOG_LEVEL": "ERROR"},
    )
    out, _ = proc.communicate()
    return _extract_answer(out or "")

def _dedupe_sentences(text: str) -> str:
    # simple consecutive-sentence de-dupe
    import re
    sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    cleaned = []
    for s in sents:
        if not cleaned or s.lower() != cleaned[-1].lower():
            cleaned.append(s)
    return " ".join(cleaned)

def answer(query: str, chunks, model_path: str, max_tokens: int = 300, **kw):
    prompt = format_prompt(chunks, query)
    approx_tokens = max(1, len(prompt) // 4)  # rough heuristic: ~4 characters per token
    print(f"\n⚙️ Prompt length ≈ {approx_tokens} tokens\n")
    raw = run_llama_cpp(prompt, model_path, max_tokens=max_tokens, **kw)
    return _dedupe_sentences(raw)
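For reviewers, a minimal usage sketch of the new module, assuming the repo root is on PYTHONPATH; the chunk strings and the 0.5B model path are illustrative placeholders, not files shipped in this PR:

from generator import answer

chunks = [
    "A B-tree index keeps keys sorted and balanced, giving O(log n) lookups.",
    "Secondary indexes speed up reads at the cost of extra work on writes.",
]
# placeholder model path; point this at any local GGUF instruct model
reply = answer("What is a B-tree index?", chunks,
               model_path="models/qwen2.5-0.5b-instruct-q5_k_m.gguf")
print(reply)
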
1 change: 1 addition & 0 deletions llama.cpp
Submodule llama.cpp added at fbef0f
29 changes: 29 additions & 0 deletions src/generator.py
@@ -151,3 +151,32 @@ def answer(query: str, chunks, model_path: str, max_tokens: int = 300, **kw):
    print(f"\n⚙️ Prompt length ≈ {approx_tokens} tokens\n")
    raw = run_llama_cpp(prompt, model_path, max_tokens=max_tokens, **kw)
    return _dedupe_sentences(raw)

def extract_answer_number(text: str) -> int | None:
    print(f"Raw model output for difficulty rating: {text!r}")
    m = re.search(r'Answer:\s*([1-5])', text)
    if m:
        return int(m.group(1))
    return None

def is_question_hard_with_model(query: str, model_path: str, **kw) -> bool:
    prompt = (
        """You are an expert in databases.
Rate the following question’s difficulty on a scale from 1 (easy) to 5 (hard), where:

1 = very easy (straightforward definition, fact recall, or yes/no question)
2 = easy (basic concept explanation or simple example)
3 = medium (requires some reasoning or combining multiple concepts)
4 = hard (multi-step reasoning, trade-offs, or applied problem-solving)
5 = very hard (open-ended design, analysis under constraints, or advanced research-level reasoning)"""
        "\n\nRespond in the format Answer: <rating>\n\n"
        f"Question: {query}\n"
        "Answer:"
    )
    rating = extract_answer_number(
        run_llama_cpp(prompt, model_path, max_tokens=3, temperature=0.1, **kw)
    )
    print(f"Parsed difficulty rating: {rating!r}")

    if rating is None:
        return False
    return rating >= 4
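A quick sanity check of the rating parser and the ≥4 threshold; the strings below are hypothetical model outputs, not captured from a real run:

from src.generator import extract_answer_number

assert extract_answer_number("Answer: 4") == 4
assert extract_answer_number("Preamble text\nAnswer: 2, since it is simple recall") == 2
assert extract_answer_number("no rating emitted") is None
# Ratings of 4–5 route the query to the larger model; anything else, or a
# parse failure, falls back to the default model.
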
11 changes: 10 additions & 1 deletion src/main.py
@@ -8,7 +8,7 @@
from src.ranking.rankers import FaissSimilarityRanker, BM25Ranker, TfIDFRanker
from src.retriever import get_candidates, apply_seg_filter
from src.ranker import rerank
from src.generator import answer
from src.generator import answer, is_question_hard_with_model

def parse_args():
    p = argparse.ArgumentParser()
@@ -57,6 +57,15 @@ def main():
        q = input("\nAsk > ").strip()
        if q.lower() in {"exit","quit"}:
            break

        if is_question_hard_with_model(q, args.model_path):
            model_path = "models/qwen2.5-1.5b-instruct-q5_k_m.gguf"
            print("Question is hard. Using 1.5b model.")
        else:
            model_path = args.model_path
            print("Question is easy. Using 0.5b model.")

        logger.log_query_start(q)
        cfg = planner.plan(q)
        index, chunks, sources, vectorizer, chunk_tags = \
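To exercise the routing decision in isolation, a hedged sketch that stubs out the llama.cpp call; the monkeypatched output and the query text are hypothetical:

from src import generator

# pretend the judge model always emits a top rating
generator.run_llama_cpp = lambda prompt, model_path, **kw: "Answer: 5"
assert generator.is_question_hard_with_model(
    "Design a sharding scheme for a write-heavy workload", "unused.gguf"
) is True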