Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -210,4 +210,7 @@ marimo/_static/
marimo/_lsp/
__marimo__/

logs/
logs/

data/*
!data/README.md
412 changes: 122 additions & 290 deletions README.md

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion apps/eval-agent/.python-version

This file was deleted.

Empty file removed apps/eval-agent/README.md
Empty file.
14 changes: 0 additions & 14 deletions apps/eval-agent/pyproject.toml

This file was deleted.

7 changes: 0 additions & 7 deletions apps/eval-agent/src/eval_agent/__init__.py

This file was deleted.

Empty file.
2 changes: 1 addition & 1 deletion apps/prepare-benchmark/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class Task:
metadata: MutableMapping[str, Any] = dataclasses.field(default_factory=dict)

def to_json(self) -> bytes:
return json.dumps(dataclasses.asdict(self)).encode()
return json.dumps(dataclasses.asdict(self), ensure_ascii=False).encode()

@classmethod
def from_json(cls, b: bytes):
Expand Down
25 changes: 24 additions & 1 deletion apps/prepare-benchmark/gen_browsecomp.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def decrypt(ciphertext_b64: str, password: str) -> str:
encrypted = base64.b64decode(ciphertext_b64)
key = derive_key(password, len(encrypted))
decrypted = bytes(a ^ b for a, b in zip(encrypted, key))
return decrypted.decode()
return decrypted.decode("utf-8")


def gen_browsecomp_test(hf_token: str) -> Generator[Task, None, None]:
Expand All @@ -53,3 +53,26 @@ def gen_browsecomp_test(hf_token: str) -> Generator[Task, None, None]:
)
yield task
return


def gen_browsecomp_zh_test(hf_token: str) -> Generator[Task, None, None]:
    """Yield one ``Task`` per row of the ``PALIN2018/BrowseComp-ZH`` test split.

    Each row's ``Question``, ``Answer`` and ``Topic`` fields are decrypted
    with the row's own ``canary`` value. The ``Question``, ``Answer`` and
    ``canary`` keys are removed from the row; what is left (including the
    decrypted ``Topic``) becomes the task metadata. Task ids are the row's
    position in the split, rendered as a string.

    Args:
        hf_token: Token forwarded to ``load_dataset``.
    """
    rows = load_dataset(
        "PALIN2018/BrowseComp-ZH",
        token=hf_token,
        split="test",
    )
    for position, row in enumerate(rows):
        # The row object itself is reused (and mutated) as the metadata map.
        fields: MutableMapping = row
        question_cipher = fields.pop("Question")
        answer_cipher = fields.pop("Answer")
        password = fields.pop("canary")
        fields["Topic"] = decrypt(fields["Topic"], password)
        yield Task(
            task_id=str(position),
            task_question=decrypt(question_cipher, password),
            ground_truth=decrypt(answer_cipher, password),
            file_path=None,
            metadata=fields,
        )
75 changes: 75 additions & 0 deletions apps/prepare-benchmark/gen_hle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# SPDX-FileCopyrightText: 2025 MiromindAI
#
# SPDX-License-Identifier: Apache-2.0

import base64
import pathlib
from typing import Generator, MutableMapping

from datasets import load_dataset

from common import Task


def save_image(image, data_dir: str, task_id: str) -> str:
if not image:
return None
# Ensure data_dir is absolute and resolved to avoid ugly .. in the path
data_dir_path = pathlib.Path(data_dir).resolve()
image_path = data_dir_path / "hle" / "images" / f"{task_id}.png"
image_path.parent.mkdir(parents=True, exist_ok=True)

# Handle different image formats
if isinstance(image, str):
# If it's a data URL, extract the base64 part
if image.startswith("data:"):
try:
header, b64data = image.split(",", 1)
image_data = base64.b64decode(b64data)
image_path.write_bytes(image_data)
except Exception as e:
raise ValueError(
f"Cannot process image data:<class 'str'> (data URL): {e}"
)
else:
try:
image_data = base64.b64decode(image)
image_path.write_bytes(image_data)
except Exception as e:
raise ValueError(
f"Cannot process image data:<class 'str'> (raw b64): {e}"
)
elif hasattr(image, "save"):
# If it's a PIL Image object
image.save(image_path)
else:
# Try to handle it as bytes directly
try:
image_path.write_bytes(image)
except Exception:
raise ValueError(f"Cannot process image data: {type(image)}")

return str(image_path)


def gen_hle_test(hf_token: str, data_dir: str) -> Generator[Task, None, None]:
    """Yield one ``Task`` per row of the ``cais/hle`` test split.

    For every row the ``id``, ``question``, ``answer`` and ``image`` fields
    are removed and mapped onto the task; the image is written to disk via
    ``save_image`` and its path stored as ``file_path``. The
    ``image_preview`` and ``rationale_image`` fields are discarded, and
    whatever remains of the row becomes the task metadata.

    Args:
        hf_token: Token forwarded to ``load_dataset``.
        data_dir: Root directory handed to ``save_image``.
    """
    rows = load_dataset("cais/hle", split="test", token=hf_token)
    for row in rows:
        # The row object itself is reused (and mutated) as the metadata map.
        fields: MutableMapping = row  # type: ignore
        identifier = fields.pop("id")
        prompt = fields.pop("question")
        expected = fields.pop("answer")
        # Original comment describes this field as a base64 encoded image.
        raw_image = fields.pop("image")
        saved_path = save_image(raw_image, data_dir, identifier)
        # These image fields are dropped rather than kept in metadata.
        for discarded in ("image_preview", "rationale_image"):
            fields.pop(discarded)
        yield Task(
            task_id=identifier,
            task_question=prompt,
            ground_truth=expected,
            file_path=saved_path,
            metadata=fields,
        )
19 changes: 18 additions & 1 deletion apps/prepare-benchmark/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
import dotenv
import fire

from gen_browsecomp import gen_browsecomp_test
from gen_browsecomp import gen_browsecomp_test, gen_browsecomp_zh_test
from gen_frames import gen_frames_test
from gen_gaia import gen_gaia_validation
from gen_gaia_text_only import gen_gaia_text_only
from gen_hle import gen_hle_test
from gen_webwalkerqa import gen_webwalkerqa


Expand All @@ -23,6 +24,8 @@ class _Env:
"frames-test",
"webwalkerqa",
"browsecomp-test",
"browsecomp-zh-test",
"hle",
)
meta_filename = "standardized_data.jsonl"
data_dir: pathlib.Path
Expand Down Expand Up @@ -56,6 +59,13 @@ def gen():
for x in gen_browsecomp_test(env.hf_token):
yield x

return gen
case "browsecomp-zh-test":

def gen():
for x in gen_browsecomp_zh_test(env.hf_token):
yield x

return gen
case "frames-test":

Expand All @@ -79,6 +89,13 @@ def gen():
for x in gen_webwalkerqa(env.hf_token):
yield x

return gen
case "hle":

def gen():
for x in gen_hle_test(env.hf_token, env.data_dir):
yield x

return gen
case _:
raise ValueError("not supported")
Expand Down
1 change: 1 addition & 0 deletions apps/prepare-benchmark/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ dependencies = [
"fire>=0.7.0",
"python-dotenv>=1.1.1",
"requests>=2.32.4",
"Pillow",
]

[dependency-groups]
Expand Down
Loading