open-compass
diff --git a/‎.gitignore‎
Lines changed: 4 additions & 1 deletion b/‎.gitignore‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 122 additions & 290 deletions b/‎README.md‎
Lines changed: 122 additions & 290 deletions
diff --git a/‎apps/eval-agent/.python-version‎
Lines changed: 0 additions & 1 deletion b/‎apps/eval-agent/.python-version‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎apps/eval-agent/README.md‎ b/‎apps/eval-agent/README.md‎
diff --git a/‎apps/eval-agent/pyproject.toml‎
Lines changed: 0 additions & 14 deletions b/‎apps/eval-agent/pyproject.toml‎
Lines changed: 0 additions & 14 deletions
diff --git a/‎apps/eval-agent/src/eval_agent/__init__.py‎
Lines changed: 0 additions & 7 deletions b/‎apps/eval-agent/src/eval_agent/__init__.py‎
Lines changed: 0 additions & 7 deletions
diff --git a/‎apps/eval-agent/src/eval_agent/py.typed‎ b/‎apps/eval-agent/src/eval_agent/py.typed‎
diff --git a/‎apps/prepare-benchmark/common.py‎
Lines changed: 1 addition & 1 deletion b/‎apps/prepare-benchmark/common.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎apps/prepare-benchmark/gen_browsecomp.py‎
Lines changed: 24 additions & 1 deletion b/‎apps/prepare-benchmark/gen_browsecomp.py‎
Lines changed: 24 additions & 1 deletion
diff --git a/‎apps/prepare-benchmark/gen_hle.py‎
Lines changed: 75 additions & 0 deletions b/‎apps/prepare-benchmark/gen_hle.py‎
Lines changed: 75 additions & 0 deletions
@@ -210,4 +210,7 @@ marimo/_static/
 marimo/_lsp/
 __marimo__/
 
-logs/
+logs/
+
+data/*
+!data/README.md
@@ -18,7 +18,7 @@ class Task:
     metadata: MutableMapping[str, Any] = dataclasses.field(default_factory=dict)
 
     def to_json(self) -> bytes:
-        return json.dumps(dataclasses.asdict(self)).encode()
+        return json.dumps(dataclasses.asdict(self), ensure_ascii=False).encode()
 
     @classmethod
     def from_json(cls, b: bytes):
 
@@ -30,7 +30,7 @@ def decrypt(ciphertext_b64: str, password: str) -> str:
     encrypted = base64.b64decode(ciphertext_b64)
     key = derive_key(password, len(encrypted))
     decrypted = bytes(a ^ b for a, b in zip(encrypted, key))
-    return decrypted.decode()
+    return decrypted.decode("utf-8")
 
 
 def gen_browsecomp_test(hf_token: str) -> Generator[Task, None, None]:
@@ -53,3 +53,26 @@ def gen_browsecomp_test(hf_token: str) -> Generator[Task, None, None]:
         )
         yield task
     return
+
+
+def gen_browsecomp_zh_test(hf_token: str) -> Generator[Task, None, None]:
+    """Yield one ``Task`` per row of the BrowseComp-ZH ``test`` split.
+
+    The ``Question``, ``Answer`` and ``Topic`` fields of each row are passed
+    through ``decrypt`` with the row's ``canary`` value as the password.
+    ``Question``, ``Answer`` and ``canary`` are removed from the row; what
+    remains (including the decrypted ``Topic``) becomes the task metadata.
+
+    Args:
+        hf_token: Token forwarded to ``load_dataset``.
+
+    Yields:
+        Tasks whose ``task_id`` is the row's enumeration index as a string
+        and whose ``file_path`` is ``None``.
+    """
+    rows = load_dataset(
+        "PALIN2018/BrowseComp-ZH",
+        token=hf_token,
+        split="test",
+    )
+    for row_index, row in enumerate(rows):
+        metadata: MutableMapping = row
+        # Strip the encrypted payload and its key out of the metadata first.
+        encrypted_question = metadata.pop("Question")
+        encrypted_answer = metadata.pop("Answer")
+        password = metadata.pop("canary")
+        # Topic stays in the metadata, but in decrypted form.
+        metadata["Topic"] = decrypt(metadata["Topic"], password)
+        question = decrypt(encrypted_question, password)
+        answer = decrypt(encrypted_answer, password)
+        yield Task(
+            task_id=str(row_index),
+            task_question=question,
+            ground_truth=answer,
+            file_path=None,
+            metadata=metadata,
+        )
@@ -0,0 +1,75 @@
+# SPDX-FileCopyrightText: 2025 MiromindAI
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import base64
+import pathlib
+from typing import Generator, MutableMapping, Optional
+
+from datasets import load_dataset
+
+from common import Task
+
+
+def save_image(image, data_dir: str, task_id: str) -> Optional[str]:
+    """Write ``image`` to ``<data_dir>/hle/images/<task_id>.png``.
+
+    Accepted inputs, checked in this order:
+
+    * a ``str`` starting with ``data:`` -- everything after the first comma
+      is base64-decoded and written;
+    * any other ``str`` -- the whole string is base64-decoded and written;
+    * an object exposing ``save`` (presumably a PIL image) -- ``save`` is
+      called with the target path;
+    * anything else -- passed to ``Path.write_bytes`` as-is.
+
+    Args:
+        image: The image payload; a falsy value means "no image".
+        data_dir: Base directory; resolved to an absolute path first.
+        task_id: Used as the file stem of the written image.
+
+    Returns:
+        The absolute path of the written file as a string, or ``None`` when
+        ``image`` is falsy (nothing is written in that case).
+
+    Raises:
+        ValueError: If a string payload cannot be decoded or written, or if
+            a non-string payload without ``save`` cannot be written as bytes.
+            The underlying exception is attached as ``__cause__``.
+    """
+    if not image:
+        return None
+    # Ensure data_dir is absolute and resolved to avoid ugly .. in the path
+    data_dir_path = pathlib.Path(data_dir).resolve()
+    image_path = data_dir_path / "hle" / "images" / f"{task_id}.png"
+    image_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # Handle different image formats
+    if isinstance(image, str):
+        # If it's a data URL, extract the base64 part
+        if image.startswith("data:"):
+            try:
+                # The part before the comma (media type header) is not needed.
+                _, b64data = image.split(",", 1)
+                image_data = base64.b64decode(b64data)
+                image_path.write_bytes(image_data)
+            except Exception as e:
+                # Chain the cause so the original traceback is preserved.
+                raise ValueError(
+                    f"Cannot process image data:<class 'str'> (data URL): {e}"
+                ) from e
+        else:
+            try:
+                image_data = base64.b64decode(image)
+                image_path.write_bytes(image_data)
+            except Exception as e:
+                raise ValueError(
+                    f"Cannot process image data:<class 'str'> (raw b64): {e}"
+                ) from e
+    elif hasattr(image, "save"):
+        # If it's a PIL Image object
+        image.save(image_path)
+    else:
+        # Try to handle it as bytes directly
+        try:
+            image_path.write_bytes(image)
+        except Exception as e:
+            raise ValueError(f"Cannot process image data: {type(image)}") from e
+
+    return str(image_path)
+
+
+def gen_hle_test(hf_token: str, data_dir: str) -> Generator[Task, None, None]:
+    """Yield one ``Task`` per row of the ``cais/hle`` ``test`` split.
+
+    For each row the ``id``, ``question``, ``answer`` and ``image`` fields
+    are removed and mapped onto the task; the image is handed to
+    ``save_image`` and its return value becomes ``file_path``. The
+    ``image_preview`` and ``rationale_image`` fields are dropped, and the
+    remaining fields are kept as the task metadata.
+
+    Args:
+        hf_token: Token forwarded to ``load_dataset``.
+        data_dir: Base directory forwarded to ``save_image``.
+
+    Yields:
+        The tasks, in dataset order.
+    """
+    rows = load_dataset("cais/hle", split="test", token=hf_token)
+    for row in rows:
+        metadata: MutableMapping = row  # type: ignore
+        row_id = metadata.pop("id")
+        prompt = metadata.pop("question")
+        expected = metadata.pop("answer")
+        raw_image = metadata.pop("image")  # base64 encoded image
+        saved_path = save_image(raw_image, data_dir, row_id)
+        # These image fields are not carried over into the task metadata.
+        for dropped_key in ("image_preview", "rationale_image"):
+            metadata.pop(dropped_key)
+        yield Task(
+            task_id=row_id,
+            task_question=prompt,
+            ground_truth=expected,
+            file_path=saved_path,
+            metadata=metadata,
+        )