Skip to content

Commit 7baed80

Browse files
Sela Tachnai
authored and committed
Add new benchmark results and enhance data loading and test automation
- Added a new JSON file containing benchmark results for the model "gemma3:1b" with various tasks including summarization, translation, and coding.
- Updated `data-loader.ts` to support both old and new manifest formats, ensuring compatibility with different data structures.
- Improved model name normalization and display functions for consistency and clarity.
- Modified `run_all_tests.py` to include a timestamp in output filenames to prevent overwrites and added provider information for better distinction between results.
1 parent 8539591 commit 7baed80

10 files changed

+3322
-20
lines changed

energy-leaderboard-web/public/data/output_gemma3_1b_easy_ollama_20260112_165131.json

Lines changed: 412 additions & 0 deletions
Large diffs are not rendered by default.

energy-leaderboard-web/public/data/output_gemma3_1b_easy_ollama_20260112_165534.json

Lines changed: 412 additions & 0 deletions
Large diffs are not rendered by default.

energy-leaderboard-web/public/data/output_gemma3_1b_hard_ollama_20260112_165131.json

Lines changed: 413 additions & 0 deletions
Large diffs are not rendered by default.

energy-leaderboard-web/public/data/output_gemma3_1b_hard_ollama_20260112_165534.json

Lines changed: 413 additions & 0 deletions
Large diffs are not rendered by default.

energy-leaderboard-web/public/data/output_gemma3_1b_medium_ollama_20260112_165131.json

Lines changed: 412 additions & 0 deletions
Large diffs are not rendered by default.

energy-leaderboard-web/public/data/output_gemma3_1b_medium_ollama_20260112_165534.json

Lines changed: 412 additions & 0 deletions
Large diffs are not rendered by default.

energy-leaderboard-web/public/data/output_gemma3_1b_mixed_ollama_20260112_165131.json

Lines changed: 412 additions & 0 deletions
Large diffs are not rendered by default.

energy-leaderboard-web/public/data/output_gemma3_1b_mixed_ollama_20260112_165534.json

Lines changed: 412 additions & 0 deletions
Large diffs are not rendered by default.

energy-leaderboard-web/src/lib/data-loader.ts

Lines changed: 19 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,10 @@ export async function loadAllBenchmarks(): Promise<{ filename: string; results:
1919
console.warn('Failed to load manifest.json, falling back to empty list');
2020
return [];
2121
}
22-
const files: string[] = await manifestResponse.json();
22+
const manifest = await manifestResponse.json();
23+
24+
// Support both old array format and new object format with metadata
25+
const files: string[] = Array.isArray(manifest) ? manifest : manifest.files || [];
2326

2427
const loadPromises = files.map(async (filename) => {
2528
try {
@@ -89,25 +92,25 @@ export function detectDeviceType(result: BenchmarkResult, filename: string): Dev
8992
*/
9093
export function normalizeModelName(model: string): string {
9194
let normalized = model;
92-
95+
9396
// Remove file extensions
9497
normalized = normalized.replace(/\.(gguf|bin|safetensors)$/i, '');
95-
98+
9699
// Remove shard indicators (e.g., "-00001-of-00002")
97100
normalized = normalized.replace(/-\d{5}-of-\d{5}$/i, '');
98-
101+
99102
// Remove quantization/format suffixes (e.g., "-MXFP4", "-Q4_K_M", "-q4_k_m")
100103
normalized = normalized.replace(/[-_](MXFP\d|Q\d[_A-Z]*\d*[_A-Z]*)$/i, '');
101-
104+
102105
// Standardize separator: "model:size" -> "model-size"
103106
normalized = normalized.replace(/:/g, '-');
104-
107+
105108
// Remove instruct/chat suffixes for base model grouping (optional, keeps instruct for now)
106109
// normalized = normalized.replace(/-instruct|-chat$/i, '');
107-
110+
108111
// Lowercase for consistent grouping
109112
normalized = normalized.toLowerCase();
110-
113+
111114
return normalized;
112115
}
113116

@@ -117,13 +120,13 @@ export function normalizeModelName(model: string): string {
117120
export function getDisplayModelName(model: string): string {
118121
// Clean up the model name for display (less aggressive than normalize)
119122
let display = model;
120-
123+
121124
// Remove file extensions
122125
display = display.replace(/\.(gguf|bin|safetensors)$/i, '');
123-
126+
124127
// Remove shard indicators
125128
display = display.replace(/-\d{5}-of-\d{5}$/i, '');
126-
129+
127130
return display;
128131
}
129132

@@ -314,14 +317,14 @@ export function aggregateByModelOnly(
314317
// Use the shortest/cleanest raw model name for display, or the normalized name
315318
const rawNamesArray = Array.from(rawModelNames);
316319
const model = getDisplayModelName(
317-
rawNamesArray.reduce((shortest, name) =>
320+
rawNamesArray.reduce((shortest, name) =>
318321
getDisplayModelName(name).length < getDisplayModelName(shortest).length ? name : shortest
319322
)
320323
);
321324
// Show all providers used
322325
const providerArray = Array.from(providers).sort();
323-
const provider = providerArray.length > 1
324-
? providerArray.join(', ')
326+
const provider = providerArray.length > 1
327+
? providerArray.join(', ')
325328
: providerArray[0];
326329
const region = firstResult.region;
327330

@@ -343,8 +346,8 @@ export function aggregateByModelOnly(
343346

344347
// For cross-hardware view, show "Multiple" or list of devices
345348
const deviceTypeArray = Array.from(deviceTypes).sort();
346-
const deviceName = deviceTypeArray.length > 1
347-
? `${deviceTypeArray.length} devices`
349+
const deviceName = deviceTypeArray.length > 1
350+
? `${deviceTypeArray.length} devices`
348351
: getDeviceDisplayName(deviceTypeArray[0]);
349352

350353
stats.push({

run_all_tests.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -48,8 +48,8 @@ def main():
4848

4949
args = parser.parse_args()
5050

51-
# Get current date for filenames
52-
date_str = datetime.datetime.now().strftime("%d_%m")
51+
# Get current date and time for filenames (includes time to prevent overwrites)
52+
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
5353
sanitized_model = sanitize_model_name(args.model)
5454

5555
# Ensure results directory exists
@@ -58,13 +58,14 @@ def main():
5858

5959
print(f"Starting automation for model: {args.model}")
6060
print(f"Provider: {args.provider}")
61-
print(f"Date: {date_str}")
61+
print(f"Timestamp: {timestamp}")
6262
print("-" * 50)
6363

6464
for test_set in TEST_SETS:
6565
print(f"\n>>> Running test set: {test_set}")
6666

67-
output_filename = f"output_{sanitized_model}_{test_set}_{date_str}.json"
67+
# Include provider in filename to distinguish between backends (e.g., ollama vs llama.cpp)
68+
output_filename = f"output_{sanitized_model}_{test_set}_{args.provider}_{timestamp}.json"
6869
output_path = results_dir / output_filename
6970

7071
cmd = [

0 commit comments

Comments
 (0)