Commit 2f749f5

merge branch main
2 parents b5db0d3 + 221c1e4 commit 2f749f5

File tree

7 files changed: +307 −146 lines changed


examples/inference_transformers.ipynb

Lines changed: 20 additions & 1 deletion
```diff
@@ -62,7 +62,26 @@
 ],
 "source": [
 "from llmsql import inference_transformers\n",
-"results = inference_transformers(model_or_model_name_or_path=\"EleutherAI/pythia-14m\", output_file=\"test_output.jsonl\", batch_size=5000, do_sample=False)"
+"\n",
+"# Example 1: Basic usage (same as before)\n",
+"results = inference_transformers(\n",
+"    model_or_model_name_or_path=\"EleutherAI/pythia-14m\",\n",
+"    output_file=\"test_output.jsonl\",\n",
+"    batch_size=5000,\n",
+"    do_sample=False,\n",
+")\n",
+"\n",
+"# # Example 2: Using the new kwargs for advanced options\n",
+"# results = inference_transformers(\n",
+"#     model_or_model_name_or_path=\"EleutherAI/pythia-14m\",\n",
+"#     output_file=\"test_output.jsonl\",\n",
+"#     batch_size=5000,\n",
+"#     do_sample=False,\n",
+"#     # Advanced model loading options\n",
+"#     model_kwargs={\"low_cpu_mem_usage\": True, \"attn_implementation\": \"flash_attention_2\"},\n",
+"#     # Advanced generation options\n",
+"#     generation_kwargs={\"repetition_penalty\": 1.1, \"length_penalty\": 1.0},\n",
+"# )"
 ]
 }
 ],
```

examples/inference_vllm.ipynb

Lines changed: 27 additions & 5 deletions
```diff
@@ -173,7 +173,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 1,
+"execution_count": null,
 "id": "edc910ac",
 "metadata": {},
 "outputs": [
@@ -333,12 +333,34 @@
 ],
 "source": [
 "from llmsql import inference_vllm\n",
+"\n",
+"# Basic usage (backward compatible)\n",
 "results = inference_vllm(\n",
-"    \"Qwen/Qwen2.5-1.5B-Instruct\",\n",
-"    \"test_results.jsonl\",\n",
+"    model_name=\"EleutherAI/pythia-14m\",\n",
+"    output_file=\"test_output.jsonl\",\n",
+"    batch_size=5000,\n",
 "    do_sample=False,\n",
-"    batch_size=20000\n",
-")"
+")\n",
+"\n",
+"# # Advanced usage with new kwargs\n",
+"# results = inference_vllm(\n",
+"#     model_name=\"EleutherAI/pythia-14m\",\n",
+"#     output_file=\"test_output.jsonl\",\n",
+"#     batch_size=5000,\n",
+"#     do_sample=False,\n",
+"#     # vLLM-specific options\n",
+"#     llm_kwargs={\n",
+"#         \"gpu_memory_utilization\": 0.9,\n",
+"#         \"max_model_len\": 4096,\n",
+"#         \"quantization\": \"awq\",\n",
+"#     },\n",
+"#     # Advanced sampling options\n",
+"#     sampling_kwargs={\n",
+"#         \"top_p\": 0.95,\n",
+"#         \"frequency_penalty\": 0.1,\n",
+"#         \"presence_penalty\": 0.1,\n",
+"#     },\n",
+"# )"
 ]
 },
 {
```

llmsql/__init__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -2,7 +2,7 @@
 LLMSQL — A Text2SQL benchmark for evaluation of Large Language Models
 """
 
-__version__ = "0.1.11"
+__version__ = "0.1.13"
 
 
 def __getattr__(name: str):  # type: ignore
```
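The `def __getattr__(name: str)` context line above is the PEP 562 module-level attribute hook, typically used in a package `__init__.py` to defer heavy backend imports (such as `transformers` or `vllm`) until first access. A minimal sketch of that pattern, with hypothetical submodule paths rather than llmsql's actual layout:

```python
# Sketch of a PEP 562 lazy-import hook like the one hinted at above.
# The submodule paths are illustrative assumptions, not llmsql's real internals.
from importlib import import_module

_LAZY_EXPORTS = {
    "inference_transformers": ".inference.transformers_backend",  # hypothetical path
    "inference_vllm": ".inference.vllm_backend",                  # hypothetical path
}


def __getattr__(name: str):
    # Called only when normal module attribute lookup fails (PEP 562).
    if name in _LAZY_EXPORTS:
        module = import_module(_LAZY_EXPORTS[name], __package__)
        return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```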

llmsql/inference/README.md

Lines changed: 80 additions & 38 deletions
````diff
@@ -58,19 +58,10 @@ results = inference_transformers(
 from llmsql import inference_vllm
 
 results = inference_vllm(
-    model_name="Qwen/Qwen2.5-1.5B-Instruct",
-    output_file="outputs/preds_vllm.jsonl",
-    questions_path="data/questions.jsonl",
-    tables_path="data/tables.jsonl",
-    num_fewshots=5,
-    batch_size=8,
-    max_new_tokens=256,
+    model_name="EleutherAI/pythia-14m",
+    output_file="test_output.jsonl",
+    batch_size=5000,
     do_sample=False,
-    llm_kwargs={
-        "tensor_parallel_size": 1,
-        "gpu_memory_utilization": 0.9,
-        "max_model_len": 4096,
-    },
 )
 ```
 
@@ -96,8 +87,7 @@ llmsql inference --method transformers \
     --model-or-model-name-or-path Qwen/Qwen2.5-1.5B-Instruct \
     --output-file outputs/preds.jsonl \
     --batch-size 8 \
-    --temperature 0.9 \
-    --generate-kwargs '{"do_sample": false, "top_p": 0.95}'
+    --temperature 0.0
 ```
 
 👉 Run `llmsql inference --help` for more detailed examples and parameter options.
````
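For reference, the CLI flags above map directly onto the Python API documented below; a minimal sketch of the equivalent call, with parameter names taken from the tables that follow:

```python
from llmsql import inference_transformers

# Sketch of the Python equivalent of the CLI invocation above.
# Greedy decoding, since temperature is 0.0 and do_sample defaults to False.
results = inference_transformers(
    model_or_model_name_or_path="Qwen/Qwen2.5-1.5B-Instruct",
    output_file="outputs/preds.jsonl",
    batch_size=8,
    temperature=0.0,
)
```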
````diff
@@ -112,18 +102,49 @@ Runs inference using the Hugging Face `transformers` backend.
 
 **Parameters:**
 
-| Argument                        | Type    | Description                                                    |
-| ------------------------------- | ------- | -------------------------------------------------------------- |
-| `model_or_model_name_or_path`   | `str`   | Model name or local path (any causal LM).                      |
-| `output_file`                   | `str`   | Path to write predictions as JSONL.                            |
-| `questions_path`, `tables_path` | `str`   | Benchmark files (auto-downloaded if missing).                  |
-| `num_fewshots`                  | `int`   | Number of few-shot examples (0, 1, 5).                         |
-| `batch_size`                    | `int`   | Batch size for inference.                                      |
-| `max_new_tokens`                | `int`   | Maximum length of generated SQL queries.                       |
-| `temperature`                   | `float` | Sampling temperature.                                          |
-| `do_sample`                     | `bool`  | Whether to use sampling.                                       |
-| `model_args`                    | `dict`  | Extra kwargs passed to `AutoModelForCausalLM.from_pretrained`. |
-| `generate_kwargs`               | `dict`  | Extra kwargs passed to `model.generate()`.                     |
+#### Model Loading
+
+| Argument                      | Type                          | Default         | Description                                                     |
+| ----------------------------- | ----------------------------- | --------------- | --------------------------------------------------------------- |
+| `model_or_model_name_or_path` | `str \| AutoModelForCausalLM` | *required*      | Model object, HuggingFace model name, or local path.            |
+| `tokenizer_or_name`           | `str \| Any \| None`          | `None`          | Tokenizer object, name, or None (infers from model).            |
+| `trust_remote_code`           | `bool`                        | `True`          | Whether to trust remote code when loading models.               |
+| `dtype`                       | `torch.dtype`                 | `torch.float16` | Model precision (e.g., `torch.float16`, `torch.bfloat16`).      |
+| `device_map`                  | `str \| dict \| None`         | `"auto"`        | Device placement strategy for multi-GPU.                        |
+| `hf_token`                    | `str \| None`                 | `None`          | Hugging Face authentication token.                              |
+| `model_kwargs`                | `dict \| None`                | `None`          | Additional kwargs for `AutoModelForCausalLM.from_pretrained()`. |
+| `tokenizer_kwargs`            | `dict \| None`                | `None`          | Additional kwargs for `AutoTokenizer.from_pretrained()`.        |
+
+#### Prompt & Chat
+
+| Argument        | Type          | Default | Description                              |
+| --------------- | ------------- | ------- | ---------------------------------------- |
+| `chat_template` | `str \| None` | `None`  | Optional chat template string to apply.  |
+
+#### Generation
+
+| Argument            | Type           | Default | Description                                 |
+| ------------------- | -------------- | ------- | -------------------------------------------- |
+| `max_new_tokens`    | `int`          | `256`   | Maximum tokens to generate per sequence.     |
+| `temperature`       | `float`        | `0.0`   | Sampling temperature (0.0 = greedy).         |
+| `do_sample`         | `bool`         | `False` | Whether to use sampling vs greedy decoding.  |
+| `top_p`             | `float`        | `1.0`   | Nucleus sampling parameter.                  |
+| `top_k`             | `int`          | `50`    | Top-k sampling parameter.                    |
+| `generation_kwargs` | `dict \| None` | `None`  | Additional kwargs for `model.generate()`.    |
+
+#### Benchmark
+
+| Argument         | Type          | Default                       | Description                                          |
+| ---------------- | ------------- | ----------------------------- | ---------------------------------------------------- |
+| `output_file`    | `str`         | `"outputs/predictions.jsonl"` | Path to write predictions as JSONL.                  |
+| `questions_path` | `str \| None` | `None`                        | Path to questions.jsonl (auto-downloads if missing). |
+| `tables_path`    | `str \| None` | `None`                        | Path to tables.jsonl (auto-downloads if missing).    |
+| `workdir_path`   | `str`         | `"llmsql_workdir"`            | Working directory for downloaded files.              |
+| `num_fewshots`   | `int`         | `5`                           | Number of few-shot examples (0, 1, or 5).            |
+| `batch_size`     | `int`         | `8`                           | Batch size for inference.                            |
+| `seed`           | `int`         | `42`                          | Random seed for reproducibility.                     |
+
+**Note:** Explicit parameters (e.g., `dtype`, `trust_remote_code`) override any values specified in `model_kwargs` or `tokenizer_kwargs`.
 
 ---
 
````
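Putting the four parameter groups together, a call might look like the sketch below; the kwarg values are illustrative, and per the note above, the explicit `dtype` takes precedence over anything passed in `model_kwargs`:

```python
import torch
from llmsql import inference_transformers

# Illustrative sketch combining the documented parameter groups; not a
# canonical invocation.
results = inference_transformers(
    model_or_model_name_or_path="Qwen/Qwen2.5-1.5B-Instruct",
    dtype=torch.bfloat16,                       # explicit, wins over model_kwargs
    device_map="auto",                          # spread across available GPUs
    model_kwargs={"low_cpu_mem_usage": True},   # extra from_pretrained() kwargs
    max_new_tokens=256,
    do_sample=False,                            # greedy decoding
    output_file="outputs/preds.jsonl",
    num_fewshots=5,
    batch_size=8,
    seed=42,
)
```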
````diff
@@ -133,18 +154,39 @@ Runs inference using the [vLLM](https://github.com/vllm-project/vllm) backend fo
 
 **Parameters:**
 
-| Argument                        | Type    | Description                                      |
-| ------------------------------- | ------- | ------------------------------------------------ |
-| `model_name`                    | `str`   | Hugging Face model name or path.                 |
-| `output_file`                   | `str`   | Path to write predictions as JSONL.              |
-| `questions_path`, `tables_path` | `str`   | Benchmark files (auto-downloaded if missing).    |
-| `num_fewshots`                  | `int`   | Number of few-shot examples (0, 1, 5).           |
-| `batch_size`                    | `int`   | Number of prompts per batch.                     |
-| `max_new_tokens`                | `int`   | Maximum tokens per generation.                   |
-| `temperature`                   | `float` | Sampling temperature.                            |
-| `do_sample`                     | `bool`  | Whether to sample or use greedy decoding.        |
-| `llm_kwargs`                    | `dict`  | Extra kwargs forwarded to `vllm.LLM`.            |
-| `sampling_kwargs`               | `dict`  | Extra kwargs forwarded to `vllm.SamplingParams`. |
+#### Model Loading
+
+| Argument               | Type           | Default    | Description                                     |
+| ---------------------- | -------------- | ---------- | ----------------------------------------------- |
+| `model_name`           | `str`          | *required* | Hugging Face model name or local path.          |
+| `trust_remote_code`    | `bool`         | `True`     | Whether to trust remote code when loading.      |
+| `tensor_parallel_size` | `int`          | `1`        | Number of GPUs for tensor parallelism.          |
+| `hf_token`             | `str \| None`  | `None`     | Hugging Face authentication token.              |
+| `llm_kwargs`           | `dict \| None` | `None`     | Additional kwargs for `vllm.LLM()`.             |
+| `use_chat_template`    | `bool`         | `True`     | Whether to apply the tokenizer's chat template. |
+
+#### Generation
+
+| Argument          | Type           | Default | Description                                    |
+| ----------------- | -------------- | ------- | ---------------------------------------------- |
+| `max_new_tokens`  | `int`          | `256`   | Maximum tokens to generate per sequence.       |
+| `temperature`     | `float`        | `1.0`   | Sampling temperature (0.0 = greedy).           |
+| `do_sample`       | `bool`         | `True`  | Whether to use sampling vs greedy decoding.    |
+| `sampling_kwargs` | `dict \| None` | `None`  | Additional kwargs for `vllm.SamplingParams()`. |
+
+#### Benchmark
+
+| Argument         | Type          | Default                       | Description                                          |
+| ---------------- | ------------- | ----------------------------- | ---------------------------------------------------- |
+| `output_file`    | `str`         | `"outputs/predictions.jsonl"` | Path to write predictions as JSONL.                  |
+| `questions_path` | `str \| None` | `None`                        | Path to questions.jsonl (auto-downloads if missing). |
+| `tables_path`    | `str \| None` | `None`                        | Path to tables.jsonl (auto-downloads if missing).    |
+| `workdir_path`   | `str`         | `"llmsql_workdir"`            | Working directory for downloaded files.              |
+| `num_fewshots`   | `int`         | `5`                           | Number of few-shot examples (0, 1, or 5).            |
+| `batch_size`     | `int`         | `8`                           | Number of prompts per batch.                         |
+| `seed`           | `int`         | `42`                          | Random seed for reproducibility.                     |
+
+**Note:** Explicit parameters (e.g., `tensor_parallel_size`, `trust_remote_code`) override any values specified in `llm_kwargs` or `sampling_kwargs`.
 
 ---
 
````
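The vLLM entry point composes the same way. A sketch built from the tables above, explicitly forcing greedy decoding since the sampling-oriented defaults (`temperature=1.0`, `do_sample=True`) differ from the transformers backend; the `stop` sequence is an illustrative `SamplingParams` kwarg, not a documented default:

```python
from llmsql import inference_vllm

# Illustrative sketch built from the documented vLLM parameters.
results = inference_vllm(
    model_name="Qwen/Qwen2.5-1.5B-Instruct",
    tensor_parallel_size=1,                      # explicit, wins over llm_kwargs
    llm_kwargs={"gpu_memory_utilization": 0.9},  # forwarded to vllm.LLM()
    do_sample=False,                             # greedy instead of sampling defaults
    temperature=0.0,
    max_new_tokens=256,
    sampling_kwargs={"stop": [";"]},             # forwarded to vllm.SamplingParams()
    output_file="outputs/preds.jsonl",
    batch_size=8,
)
```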