Skip to content

Commit 0417ebc

Browse files
committed
feat(code-update): code improvements
- Enhanced README and documentation
- Added new benchmark configurations and scripts
- Updated LLM provider configurations
- Added new agent configurations for various models
- Improved tool and utility functions
- Added new evaluation and analysis tools
1 parent 1a83bbe commit 0417ebc

File tree

124 files changed

+5817
-1518
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

124 files changed

+5817
-1518
lines changed

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,4 +210,7 @@ marimo/_static/
210210
marimo/_lsp/
211211
__marimo__/
212212

213-
logs/
213+
logs/
214+
215+
data/*
216+
!data/README.md

README.md

Lines changed: 122 additions & 290 deletions
Large diffs are not rendered by default.

apps/eval-agent/.python-version

Lines changed: 0 additions & 1 deletion
This file was deleted.

apps/eval-agent/README.md

Whitespace-only changes.

apps/eval-agent/pyproject.toml

Lines changed: 0 additions & 14 deletions
This file was deleted.

apps/eval-agent/src/eval_agent/__init__.py

Lines changed: 0 additions & 7 deletions
This file was deleted.

apps/eval-agent/src/eval_agent/py.typed

Whitespace-only changes.

apps/prepare-benchmark/common.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ class Task:
1818
metadata: MutableMapping[str, Any] = dataclasses.field(default_factory=dict)
1919

2020
def to_json(self) -> bytes:
21-
return json.dumps(dataclasses.asdict(self)).encode()
21+
return json.dumps(dataclasses.asdict(self), ensure_ascii=False).encode()
2222

2323
@classmethod
2424
def from_json(cls, b: bytes):

apps/prepare-benchmark/gen_browsecomp.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def decrypt(ciphertext_b64: str, password: str) -> str:
3030
encrypted = base64.b64decode(ciphertext_b64)
3131
key = derive_key(password, len(encrypted))
3232
decrypted = bytes(a ^ b for a, b in zip(encrypted, key))
33-
return decrypted.decode()
33+
return decrypted.decode("utf-8")
3434

3535

3636
def gen_browsecomp_test(hf_token: str) -> Generator[Task, None, None]:
@@ -53,3 +53,26 @@ def gen_browsecomp_test(hf_token: str) -> Generator[Task, None, None]:
5353
)
5454
yield task
5555
return
56+
57+
58+
def gen_browsecomp_zh_test(hf_token: str) -> Generator[Task, None, None]:
    """Yield one ``Task`` per row of the BrowseComp-ZH ``test`` split.

    The ``Question``, ``Answer`` and ``Topic`` fields of each row are passed
    through ``decrypt`` with the row's ``canary`` value as the password.
    ``Question``, ``Answer`` and ``canary`` are removed from the row; what
    remains (including the decrypted ``Topic``) becomes the task metadata.

    Args:
        hf_token: Token forwarded to ``load_dataset``.

    Yields:
        Tasks whose ``task_id`` is the row's position in the split, as a
        string, and whose ``file_path`` is ``None``.
    """
    rows = load_dataset(
        "PALIN2018/BrowseComp-ZH",
        token=hf_token,
        split="test",
    )
    for position, row in enumerate(rows):
        extra: MutableMapping = row
        encrypted_question = extra.pop("Question")
        encrypted_answer = extra.pop("Answer")
        password = extra.pop("canary")
        # Topic stays in the metadata, but in decrypted form.
        extra["Topic"] = decrypt(extra["Topic"], password)
        question = decrypt(encrypted_question, password)
        answer = decrypt(encrypted_answer, password)
        yield Task(
            task_id=str(position),
            task_question=question,
            ground_truth=answer,
            file_path=None,
            metadata=extra,
        )

apps/prepare-benchmark/gen_hle.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# SPDX-FileCopyrightText: 2025 MiromindAI
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import base64
6+
import pathlib
7+
from typing import Generator, MutableMapping
8+
9+
from datasets import load_dataset
10+
11+
from common import Task
12+
13+
14+
def save_image(image, data_dir: str, task_id: str) -> "str | None":
    """Write a task image to ``<data_dir>/hle/images/<task_id>.png``.

    Args:
        image: The image payload. Accepted forms are a ``data:`` URL string,
            a raw base64 string, an object exposing ``save(path)`` (e.g. a
            PIL image), or a bytes-like object. A falsy value means there is
            no image to save.
        data_dir: Root data directory; it is resolved to an absolute path.
        task_id: Task identifier, used as the file stem.

    Returns:
        The absolute path of the written file as a string, or ``None`` when
        ``image`` is falsy (nothing is written in that case).

    Raises:
        ValueError: If a string or bytes-like payload cannot be decoded or
            written. The underlying exception is chained as ``__cause__``.
    """
    if not image:
        return None

    # Resolve so the returned path is absolute and free of ``..`` segments.
    image_path = (
        pathlib.Path(data_dir).resolve() / "hle" / "images" / f"{task_id}.png"
    )
    image_path.parent.mkdir(parents=True, exist_ok=True)

    if isinstance(image, str):
        is_data_url = image.startswith("data:")
        source = "data URL" if is_data_url else "raw b64"
        try:
            if is_data_url:
                # Keep only the base64 payload after the "data:...," header.
                _, b64data = image.split(",", 1)
            else:
                b64data = image
            image_path.write_bytes(base64.b64decode(b64data))
        except Exception as e:
            raise ValueError(
                f"Cannot process image data:<class 'str'> ({source}): {e}"
            ) from e
    elif hasattr(image, "save"):
        # Objects with a save() method (e.g. PIL images) write themselves.
        image.save(image_path)
    else:
        # Last resort: treat the payload as bytes-like.
        try:
            image_path.write_bytes(image)
        except Exception as e:
            raise ValueError(f"Cannot process image data: {type(image)}") from e

    return str(image_path)
53+
54+
55+
def gen_hle_test(hf_token: str, data_dir: str) -> Generator[Task, None, None]:
    """Yield one ``Task`` per row of the ``cais/hle`` ``test`` split.

    For each row the ``id``, ``question``, ``answer`` and ``image`` fields are
    removed and mapped onto the task; the image is written to disk through
    ``save_image`` and its returned path becomes ``file_path``. The
    ``image_preview`` and ``rationale_image`` fields are dropped, and the
    remaining fields are passed along as task metadata.

    Args:
        hf_token: Token forwarded to ``load_dataset``.
        data_dir: Root directory handed to ``save_image``.

    Yields:
        The constructed tasks, in dataset order.
    """
    rows = load_dataset("cais/hle", split="test", token=hf_token)
    for row in rows:
        extra: MutableMapping = row  # type: ignore
        identifier = extra.pop("id")
        prompt = extra.pop("question")
        answer = extra.pop("answer")
        # The image is saved before the remaining image fields are dropped.
        saved_path = save_image(extra.pop("image"), data_dir, identifier)
        for unused_key in ("image_preview", "rationale_image"):
            extra.pop(unused_key)
        yield Task(
            task_id=identifier,
            task_question=prompt,
            ground_truth=answer,
            file_path=saved_path,
            metadata=extra,
        )

0 commit comments

Comments
 (0)