Skip to content

Commit 694d73c

Browse files
committed
feat: load hard data
1 parent 6e53f63 commit 694d73c

File tree

2 files changed: +14 additions, -16 deletions

bigcodebench/data/bigcodebench.py

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -16,17 +16,19 @@
1616
BIGCODEBENCH_HF = "bigcode/bigcodebench"
1717
BIGCODEBENCH_VERSION = "v0.1.0_hf"
1818

19-
def _ready_bigcodebench_path(mini=False, noextreme=False, version="default") -> str:
19+
def _ready_bigcodebench_path(hard=False, version="default") -> str:
2020
if BIGCODEBENCH_OVERRIDE_PATH:
2121
return BIGCODEBENCH_OVERRIDE_PATH
2222

2323
version = BIGCODEBENCH_VERSION if version == "default" else version
2424
url, path = get_dataset_metadata(
25-
"BigCodeBench", BIGCODEBENCH_VERSION, mini, noextreme
25+
BIGCODEBENCH_VERSION, hard
2626
)
2727

28+
extra = "-hard" if hard else ""
29+
2830
try:
29-
dataset = load_dataset(BIGCODEBENCH_HF, split=BIGCODEBENCH_VERSION)
31+
dataset = load_dataset(BIGCODEBENCH_HF+extra, split=BIGCODEBENCH_VERSION)
3032
make_cache(url, dataset, path)
3133
except:
3234
if os.path.exists(path):
@@ -37,7 +39,7 @@ def _ready_bigcodebench_path(mini=False, noextreme=False, version="default") ->
3739

3840

3941
def get_bigcodebench(
40-
err_incomplete=True, mini=False, noextreme=False, version="default"
42+
err_incomplete=True, hard=False, version="default"
4143
) -> Dict[str, Dict]:
4244
"""Get BigCodeBench from BigCode's github repo and return as a list of parsed dicts.
4345
@@ -54,19 +56,19 @@ def get_bigcodebench(
5456
"""
5557
# Check if open eval file exists in CACHE_DIR
5658
data_path = _ready_bigcodebench_path(
57-
mini=mini, noextreme=noextreme, version=version
59+
hard=hard, version=version
5860
)
5961
data = {task["task_id"]: task for task in stream_jsonl(data_path)}
6062
if err_incomplete:
6163
completeness_check("BigCodeBench", data)
6264
return data
6365

64-
def get_bigcodebench_hash(mini=False, noextreme=False, version="default") -> str:
66+
def get_bigcodebench_hash(hard=False, version="default") -> str:
6567
"""Get the hash of BigCodeBench.
6668
Returns:
6769
str: The hash of BigCodeBench
6870
"""
69-
data_path = _ready_bigcodebench_path(mini, noextreme, version="default")
71+
data_path = _ready_bigcodebench_path(hard, version="default")
7072
with open(data_path, "rb") as f:
7173
data = f.read()
7274
return hashlib.md5(data).hexdigest()

bigcodebench/data/utils.py

Lines changed: 5 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -11,16 +11,12 @@
1111
CACHE_DIR = user_cache_dir("bigcodebench")
1212

1313

14-
def get_dataset_metadata(name: str, version: str, mini: bool, noextreme: bool = False):
15-
assert name in ["BigCodeBench"], f"Unknown/unsupported dataset: {name}"
14+
def get_dataset_metadata(version: str, hard: bool = False):
1615
extra = ""
17-
assert not (mini and noextreme), "Cannot have both mini and noextreme"
18-
if mini:
19-
extra = "-Mini"
20-
if noextreme:
21-
extra = "-NoExtreme"
22-
url = f"https://github.com/bigcode-project/bigcodebench-annotation/releases/download/{version}/{name}{extra}.jsonl.gz"
23-
cache_path = os.path.join(CACHE_DIR, f"{name}{extra}-{version}.jsonl")
16+
if hard:
17+
extra = "-Hard"
18+
url = f"https://github.com/bigcode-project/bigcodebench-annotation/releases/download/{version}/BigCodeBench{extra}.jsonl.gz"
19+
cache_path = os.path.join(CACHE_DIR, f"BigCodeBench{extra}-{version}.jsonl")
2420
return url, cache_path
2521

2622

0 commit comments

Comments
 (0)