
Commit c84a768

🎉 add readme and fix default engine
1 parent ad205f2 commit c84a768

File tree

4 files changed: +152 −3 lines


README.md

Lines changed: 84 additions & 1 deletion

@@ -1 +1,84 @@
-# vllm-judge
+# vLLM Judge
+
+A lightweight library for LLM-as-a-Judge evaluations using vLLM-hosted models.
+
+## Features
+
+- 🚀 **Simple Interface**: Single `evaluate()` method that adapts to any use case
+- 🎯 **Pre-built Metrics**: 20+ ready-to-use evaluation metrics
+- 🔧 **Template Support**: Dynamic evaluations with template variables
+- ⚡ **High Performance**: Optimized for vLLM with automatic batching
+- 🌐 **API Mode**: Run as a REST API service
+- 🔄 **Async Native**: Built for high-throughput evaluations
+
+## Installation
+
+```bash
+# Basic installation
+pip install vllm_judge
+
+# With API support
+pip install vllm_judge[api]
+
+# With Jinja2 template support
+pip install vllm_judge[jinja2]
+
+# Everything
+pip install vllm_judge[api,jinja2]
+```
+
+## Quick Start
+
+```python
+from vllm_judge import Judge
+
+# Initialize with the vLLM server URL
+judge = await Judge.from_url("http://localhost:8000")
+
+# Simple evaluation
+result = await judge.evaluate(
+    response="The Earth orbits around the Sun.",
+    criteria="scientific accuracy"
+)
+print(f"Decision: {result.decision}")
+print(f"Reasoning: {result.reasoning}")
+
+# Using pre-built metrics
+from vllm_judge import CODE_QUALITY
+
+result = await judge.evaluate(
+    response="def add(a, b): return a + b",
+    metric=CODE_QUALITY
+)
+
+# With template variables
+result = await judge.evaluate(
+    response="Essay content here...",
+    criteria="Evaluate this {doc_type} for {audience}",
+    template_vars={
+        "doc_type": "essay",
+        "audience": "high school students"
+    }
+)
+```
+
+## API Server
+
+Run Judge as a REST API:
+
+```bash
+vllm-judge serve --base-url http://localhost:8000 --port 9090 --host localhost
+```
+
+Then use the HTTP API:
+
+```python
+from vllm_judge.api import JudgeClient
+
+client = JudgeClient("http://localhost:9090")
+result = await client.evaluate(
+    response="Python is great!",
+    criteria="technical accuracy"
+)
+```
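A note on the Quick Start snippets: they use bare `await`, which works in notebooks and other async contexts; in a plain script the calls must run inside an event loop. A minimal sketch of that pattern with `asyncio`, using a stand-in coroutine (`fake_evaluate` is hypothetical, not part of vllm-judge) so it runs anywhere:

```python
import asyncio

# Stand-in coroutine: in real use this would be judge.evaluate(...) from the
# Quick Start; faked here so the asyncio pattern itself is runnable.
async def fake_evaluate(response: str, criteria: str) -> dict:
    return {"decision": True, "reasoning": f"evaluated {criteria!r}"}

async def main() -> dict:
    # In scripts, `await` must appear inside a coroutine;
    # notebooks allow it at the top level.
    return await fake_evaluate(
        "The Earth orbits around the Sun.", "scientific accuracy"
    )

result = asyncio.run(main())
print(result["decision"])  # → True
```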

examples/basic_test.ipynb

Lines changed: 66 additions & 0 deletions

@@ -196,6 +196,72 @@
     "res.model_dump()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from vllm_judge.api import JudgeClient\n",
+    "\n",
+    "client = JudgeClient(\"http://localhost:9090\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'status': 'healthy',\n",
+       " 'version': '0.1.0',\n",
+       " 'model': 'qwen2',\n",
+       " 'base_url': 'http://localhost:8080',\n",
+       " 'uptime_seconds': 62.64390587806702,\n",
+       " 'total_evaluations': 1,\n",
+       " 'active_connections': 0,\n",
+       " 'metrics_available': 24}"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "await client.health_check()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'decision': False,\n",
+       " 'reasoning': 'The response lacks technical detail and does not provide a substantive explanation of why Python is great.',\n",
+       " 'score': None,\n",
+       " 'metadata': {'model': 'qwen2',\n",
+       "  'raw_response': '{\\n \"decision\": false,\\n \"reasoning\": \"The response lacks technical detail and does not provide a substantive explanation of why Python is great.\",\\n \"score\": null\\n}'}}"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "result = await client.evaluate(\n",
+    "    response=\"Python is great!\",\n",
+    "    criteria=\"technical accuracy\"\n",
+    ")\n",
+    "result.model_dump()"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
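The notebook output above shows that the judge's verdict also arrives as raw JSON in `metadata['raw_response']`. A small sketch of recovering the individual fields from that raw string with the stdlib `json` module (the payload below is copied from the notebook output; nothing else is assumed about the library):

```python
import json

# Raw judge output, as captured in metadata['raw_response'] above.
raw_response = """{
 "decision": false,
 "reasoning": "The response lacks technical detail and does not provide a substantive explanation of why Python is great.",
 "score": null
}"""

# Parse the model's JSON verdict; JSON false/null map to Python False/None,
# matching the decision/score fields shown in the notebook.
verdict = json.loads(raw_response)
print(verdict["decision"])  # → False
print(verdict["score"])     # → None
```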

src/vllm_judge/api/client.py

Lines changed: 1 addition & 1 deletion

@@ -74,7 +74,7 @@ async def evaluate(
     system_prompt: str = None,
     examples: List[Dict[str, Any]] = None,
     template_vars: Dict[str, Any] = None,
-    template_engine: str = None,
+    template_engine: str = "format",
     **kwargs
 ) -> EvaluationResult:
     """
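This is the "fix default engine" part of the commit: `template_engine` now defaults to `"format"`, so `template_vars` are applied even when the caller never picks an engine. A hedged sketch of what dispatching on such an engine string typically looks like (the helper is illustrative only; the engine names are inferred from the default and the optional `jinja2` extra, not from the library's internals):

```python
# Hypothetical dispatcher on a template_engine string.
# "format" uses stdlib str.format; "jinja2" is imported on demand since it
# is an optional extra (pip install vllm_judge[jinja2]).
def apply_template(text: str, template_vars: dict,
                   template_engine: str = "format") -> str:
    if template_engine == "format":
        return text.format(**template_vars)
    if template_engine == "jinja2":
        from jinja2 import Template
        return Template(text).render(**template_vars)
    raise ValueError(f"unknown template engine: {template_engine!r}")

print(apply_template("Evaluate this {doc_type}", {"doc_type": "essay"}))
# → Evaluate this essay
```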

src/vllm_judge/api/server.py

Lines changed: 1 addition & 1 deletion

@@ -69,7 +69,7 @@ async def vllm_judge_exception_handler(request, exc: VLLMJudgeError):
             error=exc.__class__.__name__,
             detail=str(exc),
             code="VLLM_JUDGE_ERROR"
-        ).dict()
+        ).model_dump()
     )
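The `.dict()` → `.model_dump()` change tracks Pydantic's v1 → v2 rename: both produce a plain dict suitable for a JSON response body. A stdlib-only sketch of the payload this handler builds, using a dataclass as a stand-in for the Pydantic `ErrorResponse` model (field names are taken from the diff above; the example values are hypothetical):

```python
from dataclasses import dataclass, asdict

# Stand-in for the project's Pydantic ErrorResponse model; asdict() plays the
# role of model_dump() (v2) / dict() (v1), turning the model into a plain dict.
@dataclass
class ErrorResponse:
    error: str
    detail: str
    code: str

payload = asdict(ErrorResponse(
    error="VLLMJudgeError",          # exc.__class__.__name__
    detail="judge model unreachable",  # hypothetical str(exc)
    code="VLLM_JUDGE_ERROR",
))
print(payload["code"])  # → VLLM_JUDGE_ERROR
```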
