import os
import time
import tomllib
from pathlib import Path

from llama_cpp import Llama
from qwen_agent.agents import Assistant

from agent.tools import build_tools

# Config path can be overridden via AGENT_CONFIG; defaults to a .toml file
# with the same basename next to this script.
CFG_PATH = os.environ.get("AGENT_CONFIG", Path(__file__).with_suffix(".toml"))
CFG = tomllib.loads(Path(CFG_PATH).read_text())

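# The config shape implied by the lookups below (values illustrative; the
# [tools] and [paths] tables depend on what agent.tools.build_tools expects):
#
#   [model]
#   gguf_path = "/models/qwen3-8b-q4_k_m.gguf"
#   context_length = 8192
#   temperature = 0.7
#   top_p = 0.8
#   top_k = 20
#   enable_thinking = true
#
#   [performance]
#   threads = 8
#   gpu_layers = 99
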
# Cap OpenMP threads for any BLAS backend; llama.cpp's own worker pool is
# sized separately via n_threads below.
os.environ["OMP_NUM_THREADS"] = str(CFG["performance"]["threads"])

llm = Llama(
    model_path=CFG["model"]["gguf_path"],
    n_ctx=CFG["model"]["context_length"],
    n_threads=CFG["performance"]["threads"],
    n_gpu_layers=CFG["performance"]["gpu_layers"],
    logits_all=False,  # only the final token's logits are needed for generation
)

# llama-cpp-python takes sampling settings per generation call, not as Llama()
# constructor kwargs, so keep them together for the call sites.
SAMPLING = {
    "temperature": CFG["model"]["temperature"],
    "top_p": CFG["model"]["top_p"],
    "top_k": CFG["model"]["top_k"],
}
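# Illustrative direct (non-agent) call showing where SAMPLING applies;
# create_chat_completion is llama-cpp-python's chat API. Not executed here:
#
#   out = llm.create_chat_completion(
#       messages=[{"role": "user", "content": "ping"}],
#       **SAMPLING,
#   )
#   print(out["choices"][0]["message"]["content"])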

# NOTE: stock qwen_agent expects `llm` to be a model-config dict (thinking
# flags live under its generate_cfg); passing the in-process Llama handle and
# an `enable_thinking=` kwarg assumes local glue code around Assistant.
assistant = Assistant(
    llm=llm,
    function_list=build_tools(CFG["tools"], CFG.get("paths", {})),
    enable_thinking=CFG["model"]["enable_thinking"],
)
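# agent.tools.build_tools is project-local. One way it could be written with
# qwen_agent's documented tool API (tool name, parameters, and the returned
# list are hypothetical; function_list accepts names registered this way):
#
#   from qwen_agent.tools.base import BaseTool, register_tool
#
#   @register_tool("read_file")
#   class ReadFile(BaseTool):
#       description = "Read a UTF-8 text file and return its contents."
#       parameters = [{"name": "path", "type": "string",
#                      "description": "Path to the file", "required": True}]
#
#       def call(self, params: str, **kwargs) -> str:
#           import json
#           return Path(json.loads(params)["path"]).read_text()
#
#   def build_tools(tool_cfg, paths):
#       return ["read_file"]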

def chat(prompt: str) -> str:
    """Send one prompt and return the reply, printing wall-clock latency."""
    start = time.perf_counter()
    # Stock qwen_agent streams responses via Assistant.run(messages=[...]);
    # a blocking .chat() returning one message dict is assumed local glue.
    rsp = assistant.chat(prompt)["content"]
    print(f"[{time.perf_counter() - start:.2f}s]")
    return rsp

if __name__ == "__main__":
    while True:
        try:
            print(chat(input(">>> ")))
        except (EOFError, KeyboardInterrupt):
            break
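# Example session (script name, config path, and timing all illustrative):
#   $ AGENT_CONFIG=agent.toml python agent_main.py
#   >>> summarize ./README.md
#   [2.31s]
#   ...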