-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathjustfile
More file actions
182 lines (144 loc) · 5.95 KB
/
justfile
File metadata and controls
182 lines (144 loc) · 5.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# basic-memory-benchmarks command runner
# Load variables from a .env file so settings like BM_LOCAL_PATH can be
# configured per-checkout without editing this file.
set dotenv-load := true

# --- Paths and defaults ---
# Optional path to a local basic-memory checkout; empty means "not set".
bm_local_path := env_var_or_default("BM_LOCAL_PATH", "")
# Expands to a `--bm-local-path <path>` flag only when bm_local_path is set,
# so recipes can splice {{bm_local_path_flag}} in unconditionally.
bm_local_path_flag := if bm_local_path != "" { "--bm-local-path " + bm_local_path } else { "" }
# Raw LoCoMo dataset location and the directories generated artifacts go to.
locomo_dataset_path := "benchmarks/datasets/locomo/locomo10.json"
locomo_output_dir := "benchmarks/generated/locomo"
locomo_c1_output_dir := "benchmarks/generated/locomo-c1"
# --- Repo maintenance ---

# Install dev dependencies.
sync:
    uv sync --group dev

# Install dev dependencies plus the optional judge extras.
sync-judge:
    uv sync --group dev --extra judge

# Run the test suite (quiet).
test:
    uv run pytest -q

# Lint with ruff.
lint:
    uv run ruff check .

# Auto-format with ruff.
format:
    uv run ruff format .

# Static type checking with pyright.
typecheck:
    uv run pyright

# Run all static checks and the test suite.
check: lint typecheck test
# --- Dataset prep ---

# Download the raw LoCoMo dataset to {{locomo_dataset_path}}.
bench-fetch-locomo:
    uv run bm-bench datasets fetch --dataset locomo --output {{locomo_dataset_path}}

# Convert the full LoCoMo dataset into benchmark docs/queries.
bench-convert-locomo:
    uv run bm-bench convert locomo --dataset-path {{locomo_dataset_path}} --output-dir {{locomo_output_dir}}

# Convert only the first conversation (small slice for short/quick runs).
bench-convert-locomo-c1:
    uv run bm-bench convert locomo --dataset-path {{locomo_dataset_path}} --output-dir {{locomo_c1_output_dir}} --max-conversations 1
# Slice the first 25 converted queries into a quick-run query set.
# Paths are built from {{locomo_c1_output_dir}} so they stay in sync with
# bench-convert-locomo-c1 instead of duplicating the directory literal.
bench-make-quick25:
    uv run python -c 'import json; from pathlib import Path; queries_path=Path("{{locomo_c1_output_dir}}/queries.json"); quick_path=Path("{{locomo_c1_output_dir}}/queries.quick25.json"); queries=json.loads(queries_path.read_text()); quick_path.write_text(json.dumps(queries[:25], indent=2)+"\n"); print(f"Wrote {len(queries[:25])} queries to {quick_path}")'
# Prepare short-benchmark inputs: one conversation + 25-query quickset.
bench-prepare-short: bench-fetch-locomo bench-convert-locomo-c1 bench-make-quick25

# Prepare long-benchmark inputs: the full converted LoCoMo dataset.
bench-prepare-long: bench-fetch-locomo bench-convert-locomo
# --- One-command pipelines ---

# Full retrieval benchmark pipeline:
#   1) sync deps, 2) fetch+convert long dataset, 3) run full retrieval
# Each stage is a fresh `just` sub-invocation so stages run strictly in order.
bench-full:
    just sync
    just bench-prepare-long
    just bench-run-full
# Full retrieval + judge pipeline:
#   1) sync deps (+judge extras), 2) fetch+convert long dataset, 3) run full with judge
# Override the judge model with `just bench-full-judge <model>`.
bench-full-judge model="gpt-4o-mini":
    just sync-judge
    just bench-prepare-long
    just bench-run-full-judge "{{model}}"
# --- Benchmark execution ---

# Tiny smoke test against the synthetic corpus; unavailable providers are skipped.
bench-smoke:
    uv run bm-bench run retrieval \
        --dataset-id synthetic \
        --dataset-path benchmarks/synthetic/queries.json \
        --corpus-dir benchmarks/synthetic/docs \
        --queries-path benchmarks/synthetic/queries.json \
        --providers bm-local,mem0-local \
        --allow-provider-skip
# Short benchmark: one-conversation LoCoMo slice + 25-query quickset.
# Unavailable providers are skipped rather than failing the run.
bench-run-short:
    uv run bm-bench run retrieval \
        --dataset-id locomo-c1-quick25 \
        --dataset-path {{locomo_dataset_path}} \
        --corpus-dir benchmarks/generated/locomo-c1/docs \
        --queries-path benchmarks/generated/locomo-c1/queries.quick25.json \
        --providers bm-local,mem0-local \
        {{bm_local_path_flag}} \
        --allow-provider-skip

# Same as bench-run-short, but fail if any provider is unavailable.
bench-run-short-strict:
    uv run bm-bench run retrieval \
        --dataset-id locomo-c1-quick25 \
        --dataset-path {{locomo_dataset_path}} \
        --corpus-dir benchmarks/generated/locomo-c1/docs \
        --queries-path benchmarks/generated/locomo-c1/queries.quick25.json \
        --providers bm-local,mem0-local \
        {{bm_local_path_flag}} \
        --strict-providers
# Long benchmark: full LoCoMo query set.
# Unavailable providers are skipped rather than failing the run.
bench-run-long:
    uv run bm-bench run retrieval \
        --dataset-id locomo \
        --dataset-path {{locomo_dataset_path}} \
        --corpus-dir benchmarks/generated/locomo/docs \
        --queries-path benchmarks/generated/locomo/queries.json \
        --providers bm-local,mem0-local \
        {{bm_local_path_flag}} \
        --allow-provider-skip

# Same as bench-run-long, but fail if any provider is unavailable.
bench-run-long-strict:
    uv run bm-bench run retrieval \
        --dataset-id locomo \
        --dataset-path {{locomo_dataset_path}} \
        --corpus-dir benchmarks/generated/locomo/docs \
        --queries-path benchmarks/generated/locomo/queries.json \
        --providers bm-local,mem0-local \
        {{bm_local_path_flag}} \
        --strict-providers
# Run the long benchmark against the bm-local provider only.
bench-run-bm-local:
    uv run bm-bench run retrieval \
        --providers bm-local \
        --dataset-id locomo \
        --dataset-path {{locomo_dataset_path}} \
        --corpus-dir benchmarks/generated/locomo/docs \
        --queries-path benchmarks/generated/locomo/queries.json \
        {{bm_local_path_flag}}

# Run the long benchmark against the mem0-local provider only; skip if unavailable.
bench-run-mem0-local:
    uv run bm-bench run retrieval \
        --providers mem0-local \
        --dataset-id locomo \
        --dataset-path {{locomo_dataset_path}} \
        --corpus-dir benchmarks/generated/locomo/docs \
        --queries-path benchmarks/generated/locomo/queries.json \
        --allow-provider-skip
# Full pipeline run (not just retrieval) over the long LoCoMo dataset.
bench-run-full:
    uv run bm-bench run full \
        --dataset-id locomo \
        --dataset-path {{locomo_dataset_path}} \
        --corpus-dir benchmarks/generated/locomo/docs \
        --queries-path benchmarks/generated/locomo/queries.json \
        --providers bm-local,mem0-local \
        {{bm_local_path_flag}} \
        --allow-provider-skip
# Full pipeline run with LLM judging enabled.
# `model` selects the judge model (requires the judge extras; see sync-judge).
bench-run-full-judge model="gpt-4o-mini":
    uv run bm-bench run full \
        --dataset-id locomo \
        --dataset-path {{locomo_dataset_path}} \
        --corpus-dir benchmarks/generated/locomo/docs \
        --queries-path benchmarks/generated/locomo/queries.json \
        --providers bm-local,mem0-local \
        {{bm_local_path_flag}} \
        --allow-provider-skip \
        --judge \
        --judge-model "{{model}}"
# --- Artifacts and comparison ---

# Print the most recently modified run directory under benchmarks/runs/.
# Shebang recipe: the whole body runs as one bash script; errors (via -u on
# the unexpanded glob turning into an ls failure) if there are no runs yet.
bench-latest-run:
    #!/usr/bin/env bash
    set -euo pipefail
    # awk drains the whole stream before printing the first line; `head -n 1`
    # would exit early and can SIGPIPE `ls` (exit 141 under pipefail) once
    # there are enough run directories to overflow the pipe buffer.
    ls -1dt benchmarks/runs/* | awk 'NR==1'
# Run the LLM judge over an existing run directory.
bench-judge run_dir model="gpt-4o-mini":
    uv run bm-bench run judge --run-dir "{{run_dir}}" --model "{{model}}"

# Validate the artifacts in a run directory.
bench-validate run_dir:
    uv run bm-bench validate-artifacts --run-dir "{{run_dir}}"

# Publish a run directory to a public results destination.
bench-publish run_dir destination="benchmarks/results/public":
    uv run bm-bench publish --run-dir "{{run_dir}}" --destination "{{destination}}"

# Compare two runs on one provider/metric pair.
bench-compare baseline candidate provider="bm-local" metric="recall_at_5":
    uv run bm-bench compare "{{baseline}}" "{{candidate}}" --provider "{{provider}}" --metric "{{metric}}"
# List available recipes.
# NOTE(review): bare `just` runs the FIRST recipe in the file (`sync` here),
# not the one named `default` — if listing is the intended no-arg behavior,
# this recipe should be moved to the top of the file. Confirm against the
# just manual before relying on `just` with no arguments.
default:
    @just --list