Skip to content

Commit 15fb30a

Browse files
committed
camera-ready
1 parent f62e08b commit 15fb30a

File tree

10 files changed

+3469
-2704
lines changed

10 files changed

+3469
-2704
lines changed

README.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,20 @@
1010

1111
[Paper](https://arxiv.org/abs/2412.21033) | [Website](https://gautierdag.github.io/plancraft/)
1212

13+
### Plancraft was accepted to COLM 2025!
14+
1315
Plancraft is a Minecraft crafting environment that benchmarks planning in LLM agents with an oracle RAG retriever.
1416

1517
You can install the package by running the following command:
1618

1719
```bash
18-
pip install plancraft
20+
uv add plancraft
1921
```
2022

21-
Or:
23+
Or
2224

2325
```bash
24-
uv add plancraft
26+
pip install plancraft
2527
```
2628

2729
![gif-example3](docs/images/train_images/TRAIN0010.gif)

configs/evals/gemma12b.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
plancraft:
2+
model: hosted_vllm/google/gemma-3-12b-it
3+
tokenizer: hosted_vllm/google/gemma-3-12b-it
4+
num_generations: 5
5+
mode: "act"
6+
max_steps: 30
7+
split: test
8+
max_message_window: 40
9+
resume: False
10+
output_dir: "outputs"
11+
wandb:
12+
project: "plancraft-new"
13+
entity: "itl"
14+
mode: "online"

configs/evals/gemma27b.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
plancraft:
2+
model: hosted_vllm/google/gemma-3-27b-it
3+
tokenizer: hosted_vllm/google/gemma-3-27b-it
4+
num_generations: 5
5+
mode: "act"
6+
max_steps: 30
7+
split: test
8+
max_message_window: 40
9+
resume: False
10+
output_dir: "outputs"
11+
wandb:
12+
project: "plancraft-new"
13+
entity: "itl"
14+
mode: "online"

configs/evals/llama70B.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
plancraft:
2-
model: meta-llama/Llama-3.3-70B-Instruct
3-
tokenizer: meta-llama/Llama-3.3-70B-Instruct
2+
model: hosted_vllm/meta-llama/Llama-3.3-70B-Instruct
3+
tokenizer: hosted_vllm/meta-llama/Llama-3.3-70B-Instruct
44
num_generations: 5
55
mode: "act"
66
max_steps: 30

configs/evals/qwen3.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
plancraft:
2+
model: hosted_vllm/Qwen/Qwen3-30B-A3B
3+
tokenizer: hosted_vllm/Qwen/Qwen3-30B-A3B
4+
num_generations: 3
5+
mode: "act"
6+
max_steps: 30
7+
split: test
8+
max_message_window: 40
9+
resume: False
10+
output_dir: "outputs"
11+
wandb:
12+
project: "plancraft-new"
13+
entity: "itl"
14+
mode: "online"

configs/evals/qwen72BVL.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
plancraft:
2+
model: hosted_vllm/Qwen/Qwen2.5-VL-72B-Instruct
3+
tokenizer: hosted_vllm/Qwen/Qwen2.5-VL-72B-Instruct
4+
num_generations: 1
5+
mode: "act"
6+
max_steps: 30
7+
split: test
8+
max_message_window: 40
9+
resume: False
10+
output_dir: "outputs"
11+
wandb:
12+
project: "plancraft-new"
13+
entity: "itl"
14+
mode: "online"

plancraft/models/act.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
OpenAIGenerator,
88
TransformersGenerator,
99
VLLMGenerator,
10+
LiteLLMGenerator,
1011
)
1112
from plancraft.utils import History
1213

@@ -34,7 +35,12 @@ def __init__(self, cfg: EvalConfig):
3435
self.bbox_model.cuda()
3536

3637
# underlying language model
37-
if "gpt-4o" in cfg.plancraft.model:
38+
if "hosted_vllm" in cfg.plancraft.model:
39+
self.llm = LiteLLMGenerator(
40+
cfg.plancraft.model,
41+
use_images=self.use_images,
42+
)
43+
elif "gpt-4o" in cfg.plancraft.model:
3844
self.use_multimodal_content_format = True
3945
self.llm = OpenAIGenerator(
4046
use_images=self.use_images,

plancraft/models/generators.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,21 @@
1010
AutoTokenizer,
1111
BitsAndBytesConfig,
1212
)
13+
import litellm
14+
import logging
15+
from litellm import completion
16+
17+
litellm._logging._disable_debugging()
18+
loggers = [
19+
"LiteLLM Proxy",
20+
"LiteLLM Router",
21+
"LiteLLM",
22+
"httpx"
23+
]
24+
for logger_name in loggers:
25+
logger = logging.getLogger(logger_name)
26+
logger.setLevel(logging.CRITICAL + 1)
27+
1328

1429
try:
1530
from vllm import LLM, SamplingParams
@@ -401,3 +416,96 @@ def generate_unconstrained(
401416
tokens_used += response.usage.total_tokens
402417
contents.append(content)
403418
return contents, tokens_used
419+
420+
421+
class LiteLLMGenerator:
    """Generator that routes chat completions through LiteLLM.

    Used for models served behind an OpenAI-compatible endpoint — in
    particular a local vLLM server when the model name contains
    "hosted_vllm" (see `api_base` below). Mirrors the interface of the
    other generator classes in this module (`reset`, `prepare_messages`,
    `generate_unconstrained`).
    """

    def __init__(self, model_name, use_images=False):
        # Whether image content blocks should be inlined into messages
        # as base64 `image_url` entries.
        self.use_images = use_images
        self.model_name = model_name
        # "hosted_vllm/..." models are served by a local vLLM instance on
        # port 8000; any other model name falls back to LiteLLM's own
        # provider routing (api_base=None).
        self.api_base = (
            "http://0.0.0.0:8000/v1" if "hosted_vllm" in self.model_name else None
        )

    def reset(self):
        # Stateless generator: nothing to reset between episodes.
        pass

    def prepare_messages(
        self,
        history: "History",
        max_messages_window: int,
    ) -> tuple[list[dict], list]:
        """
        Prepare the (optionally multimodal) message window for the model.

        Takes the last `max_messages_window` messages from the dialogue
        history, drops a leading assistant message, and prepends the
        system prompt if missing. When `use_images` is set, "image"
        content blocks are replaced with base64 `image_url` blocks drawn
        from `history.images` (most recent image last).

        Returns the message list and an always-empty list, kept for
        interface parity with the other generators.
        """
        message_window = history.dialogue_history[-max_messages_window:]
        # remove the first assistant message if it is present
        if len(message_window) > 0 and message_window[0]["role"] == "assistant":
            message_window = message_window[1:]
        # add the system prompt if the first message is not a system message
        if message_window[0]["role"] != "system":
            message_window = [history.system_prompt_dialogue] + message_window

        if self.use_images:
            # deep copy so the inlined base64 payloads do not mutate the
            # shared history object
            message_window = copy.deepcopy(message_window)
            # copy the images to the history
            img_idx = -1
            seen_images = 0
            # iterate through the messages in reverse order to assign images
            for i in range(len(message_window) - 1, -1, -1):
                new_content_list = []
                for content in message_window[i]["content"]:
                    if content["type"] == "text":
                        new_content_list.append(content)
                    elif content["type"] == "image":
                        base64_image = numpy_to_base64(history.images[img_idx])
                        img_idx -= 1
                        # BUGFIX: was `seen_images + 1` — a no-op expression
                        # that never incremented the counter, so the sanity
                        # assert below could never fire.
                        seen_images += 1
                        new_content = {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            },
                        }
                        new_content_list.append(new_content)
                message_window[i]["content"] = new_content_list
            assert seen_images <= len(history.images), "Too many images"

        return message_window, []

    def generate_unconstrained(
        self,
        batch_messages: list[list[dict]],
        max_tokens=256,
        temperature=0.6,
        **kwargs,
    ) -> tuple[list[str], int]:
        """
        Generate one completion per message list in `batch_messages`.

        Requests are issued sequentially through `litellm.completion`
        against `self.api_base`. Any `<think>...</think>` prefix is
        stripped from each completion. Returns the list of completion
        strings and the total number of tokens used across the batch.
        """
        contents = []
        tokens_used = 0
        for messages in batch_messages:
            response = completion(
                model=self.model_name,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
                stop=["\n", "\n\n"],
                api_base=self.api_base,
            )
            content = response.choices[0].message.content
            content = self.clear_thinking_tokens(content)
            tokens_used += response.usage.total_tokens
            contents.append(content)

        return contents, tokens_used

    @staticmethod
    def clear_thinking_tokens(content: str) -> str:
        """
        Remove the thinking <think>...</think> text emitted by reasoning
        models, returning only the stripped final answer.
        """
        if "</think>" in content:
            content = content.split("</think>")[-1]
        return content.strip()

pyproject.toml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ dependencies = [
2424
]
2525

2626
[tool.uv]
27+
no-build-isolation-package = ['flash-attn']
2728
dev-dependencies = [
2829
"ipykernel>=6.29.5",
2930
"ipython>=7.5.0",
@@ -39,14 +40,17 @@ full = [
3940
"hf-transfer",
4041
"matplotlib",
4142
"seaborn",
43+
"einops",
44+
"huggingface-hub",
4245
"torch>=2.5.0",
4346
"torchvision>=0.20.0",
44-
"transformers>=4.43.3",
45-
"vllm>=0.7.3",
46-
"accelerate",
47-
"peft",
4847
"einops",
4948
"huggingface-hub",
49+
"transformers==4.52.3",
50+
"flash-attn>=2.7.4.post1",
51+
"flashinfer-python==0.2.2; sys_platform != 'darwin'",
52+
"litellm>=1.71.1",
53+
"vllm>=0.8.5",
5054
]
5155

5256
[tool.setuptools.package-dir]

0 commit comments

Comments
 (0)