-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathjustfile
More file actions
182 lines (144 loc) · 5.95 KB
/
justfile
File metadata and controls
182 lines (144 loc) · 5.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# basic-memory-benchmarks command runner
# Load variables from a .env file so settings like BM_LOCAL_PATH can be
# configured per-checkout without editing this file.
set dotenv-load := true

# --- Paths and defaults ---
# Optional path to a local basic-memory checkout; empty means "not set".
bm_local_path := env_var_or_default("BM_LOCAL_PATH", "")
# Expands to a `--bm-local-path <path>` flag only when bm_local_path is set,
# so recipes can splice {{bm_local_path_flag}} in unconditionally.
bm_local_path_flag := if bm_local_path != "" { "--bm-local-path " + bm_local_path } else { "" }
# Raw LoCoMo dataset location and the directories generated artifacts go to.
locomo_dataset_path := "benchmarks/datasets/locomo/locomo10.json"
locomo_output_dir := "benchmarks/generated/locomo"
locomo_c1_output_dir := "benchmarks/generated/locomo-c1"
# --- Repo maintenance ---

# Install dev dependencies.
sync:
    uv sync --group dev

# Install dev dependencies plus the optional judge extras.
sync-judge:
    uv sync --group dev --extra judge

# Run the test suite (quiet).
test:
    uv run pytest -q

# Lint with ruff.
lint:
    uv run ruff check .

# Auto-format with ruff.
format:
    uv run ruff format .

# Static type checking with pyright.
typecheck:
    uv run pyright

# Run all static checks and the test suite.
check: lint typecheck test
# --- Dataset prep ---

# Download the raw LoCoMo dataset to {{locomo_dataset_path}}.
bench-fetch-locomo:
    uv run bm-bench datasets fetch --dataset locomo --output {{locomo_dataset_path}}

# Convert the full LoCoMo dataset into benchmark docs/queries.
bench-convert-locomo:
    uv run bm-bench convert locomo --dataset-path {{locomo_dataset_path}} --output-dir {{locomo_output_dir}}

# Convert only the first conversation (small slice for short/quick runs).
bench-convert-locomo-c1:
    uv run bm-bench convert locomo --dataset-path {{locomo_dataset_path}} --output-dir {{locomo_c1_output_dir}} --max-conversations 1
# Slice the first 25 converted queries into a quick-run query set.
# Paths are built from {{locomo_c1_output_dir}} so they stay in sync with
# bench-convert-locomo-c1 instead of duplicating the directory literal.
bench-make-quick25:
    uv run python -c 'import json; from pathlib import Path; queries_path=Path("{{locomo_c1_output_dir}}/queries.json"); quick_path=Path("{{locomo_c1_output_dir}}/queries.quick25.json"); queries=json.loads(queries_path.read_text()); quick_path.write_text(json.dumps(queries[:25], indent=2)+"\n"); print(f"Wrote {len(queries[:25])} queries to {quick_path}")'
# Prepare short-benchmark inputs: one conversation + 25-query quickset.
bench-prepare-short: bench-fetch-locomo bench-convert-locomo-c1 bench-make-quick25

# Prepare long-benchmark inputs: the full converted LoCoMo dataset.
bench-prepare-long: bench-fetch-locomo bench-convert-locomo
# --- One-command pipelines ---

# Full retrieval benchmark pipeline:
#   1) sync deps, 2) fetch+convert long dataset, 3) run full retrieval
# Each stage is a fresh `just` sub-invocation so stages run strictly in order.
bench-full:
    just sync
    just bench-prepare-long
    just bench-run-full
# Full retrieval + judge pipeline:
#   1) sync deps (+judge extras), 2) fetch+convert long dataset, 3) run full with judge
# Override the judge model with `just bench-full-judge <model>`.
bench-full-judge model="gpt-4o-mini":
    just sync-judge
    just bench-prepare-long
    just bench-run-full-judge "{{model}}"
# --- Benchmark execution ---

# Tiny smoke test against the synthetic corpus; unavailable providers are skipped.
bench-smoke:
    uv run bm-bench run retrieval \
        --dataset-id synthetic \
        --dataset-path benchmarks/synthetic/queries.json \
        --corpus-dir benchmarks/synthetic/docs \
        --queries-path benchmarks/synthetic/queries.json \
        --providers bm-local,mem0-local \
        --allow-provider-skip
# Short benchmark: one-conversation LoCoMo slice + 25-query quickset.
# Unavailable providers are skipped rather than failing the run.
bench-run-short:
    uv run bm-bench run retrieval \
        --dataset-id locomo-c1-quick25 \
        --dataset-path {{locomo_dataset_path}} \
        --corpus-dir benchmarks/generated/locomo-c1/docs \
        --queries-path benchmarks/generated/locomo-c1/queries.quick25.json \
        --providers bm-local,mem0-local \
        {{bm_local_path_flag}} \
        --allow-provider-skip

# Same as bench-run-short, but fail if any provider is unavailable.
bench-run-short-strict:
    uv run bm-bench run retrieval \
        --dataset-id locomo-c1-quick25 \
        --dataset-path {{locomo_dataset_path}} \
        --corpus-dir benchmarks/generated/locomo-c1/docs \
        --queries-path benchmarks/generated/locomo-c1/queries.quick25.json \
        --providers bm-local,mem0-local \
        {{bm_local_path_flag}} \
        --strict-providers
# Long benchmark: full LoCoMo query set.
# Unavailable providers are skipped rather than failing the run.
bench-run-long:
    uv run bm-bench run retrieval \
        --dataset-id locomo \
        --dataset-path {{locomo_dataset_path}} \
        --corpus-dir benchmarks/generated/locomo/docs \
        --queries-path benchmarks/generated/locomo/queries.json \
        --providers bm-local,mem0-local \
        {{bm_local_path_flag}} \
        --allow-provider-skip

# Same as bench-run-long, but fail if any provider is unavailable.
bench-run-long-strict:
    uv run bm-bench run retrieval \
        --dataset-id locomo \
        --dataset-path {{locomo_dataset_path}} \
        --corpus-dir benchmarks/generated/locomo/docs \
        --queries-path benchmarks/generated/locomo/queries.json \
        --providers bm-local,mem0-local \
        {{bm_local_path_flag}} \
        --strict-providers
# Run the long benchmark against the bm-local provider only.
bench-run-bm-local:
    uv run bm-bench run retrieval \
        --providers bm-local \
        --dataset-id locomo \
        --dataset-path {{locomo_dataset_path}} \
        --corpus-dir benchmarks/generated/locomo/docs \
        --queries-path benchmarks/generated/locomo/queries.json \
        {{bm_local_path_flag}}

# Run the long benchmark against the mem0-local provider only; skip if unavailable.
bench-run-mem0-local:
    uv run bm-bench run retrieval \
        --providers mem0-local \
        --dataset-id locomo \
        --dataset-path {{locomo_dataset_path}} \
        --corpus-dir benchmarks/generated/locomo/docs \
        --queries-path benchmarks/generated/locomo/queries.json \
        --allow-provider-skip
# Full pipeline run (not just retrieval) over the long LoCoMo dataset.
bench-run-full:
    uv run bm-bench run full \
        --dataset-id locomo \
        --dataset-path {{locomo_dataset_path}} \
        --corpus-dir benchmarks/generated/locomo/docs \
        --queries-path benchmarks/generated/locomo/queries.json \
        --providers bm-local,mem0-local \
        {{bm_local_path_flag}} \
        --allow-provider-skip
# Full pipeline run with LLM judging enabled.
# `model` selects the judge model (requires the judge extras; see sync-judge).
bench-run-full-judge model="gpt-4o-mini":
    uv run bm-bench run full \
        --dataset-id locomo \
        --dataset-path {{locomo_dataset_path}} \
        --corpus-dir benchmarks/generated/locomo/docs \
        --queries-path benchmarks/generated/locomo/queries.json \
        --providers bm-local,mem0-local \
        {{bm_local_path_flag}} \
        --allow-provider-skip \
        --judge \
        --judge-model "{{model}}"
# --- Artifacts and comparison ---

# Print the most recently modified run directory under benchmarks/runs/.
# Shebang recipe: the whole body runs as one bash script; errors (via -u on
# the unexpanded glob turning into an ls failure) if there are no runs yet.
bench-latest-run:
    #!/usr/bin/env bash
    set -euo pipefail
    # awk drains the whole stream before printing the first line; `head -n 1`
    # would exit early and can SIGPIPE `ls` (exit 141 under pipefail) once
    # there are enough run directories to overflow the pipe buffer.
    ls -1dt benchmarks/runs/* | awk 'NR==1'
# Run the LLM judge over an existing run directory.
bench-judge run_dir model="gpt-4o-mini":
    uv run bm-bench run judge --run-dir "{{run_dir}}" --model "{{model}}"

# Validate the artifacts in a run directory.
bench-validate run_dir:
    uv run bm-bench validate-artifacts --run-dir "{{run_dir}}"

# Publish a run directory to a public results destination.
bench-publish run_dir destination="benchmarks/results/public":
    uv run bm-bench publish --run-dir "{{run_dir}}" --destination "{{destination}}"

# Compare two runs on one provider/metric pair.
bench-compare baseline candidate provider="bm-local" metric="recall_at_5":
    uv run bm-bench compare "{{baseline}}" "{{candidate}}" --provider "{{provider}}" --metric "{{metric}}"
# List available recipes.
# NOTE(review): bare `just` runs the FIRST recipe in the file (`sync` here),
# not the one named `default` — if listing is the intended no-arg behavior,
# this recipe should be moved to the top of the file. Confirm against the
# just manual before relying on `just` with no arguments.
default:
    @just --list