Merge pull request #917 from epfml/NAN-hellaswag-gpt-evaluation-christinakopi

JulienVig · web-flow · commit 08b261650669 · 2025-06-25T16:06:21.000+02:00
Hellaswag GPT evaluation
diff --git a/cli/README.md b/cli/README.md
@@ -52,3 +52,26 @@ CLI options can be listed with `npm -w cli run benchmark_gpt -- -h`.
 To benchmark model training, you can run `npm -w cli run benchmark_gpt -- --modelType gpt-nano --contextLength 128 --batchSize 8`.
 
 For inference run `npm -w cli run benchmark_gpt -- --inference --modelPath <path to trained model json file>`. You can use the `docs/example/wikitext` example script to train a model. The model needs to be trained on the wikitext default task to ensure that model parameters such as vocab size, tokenizer, max sequence length are the same between training and inference.
+
+## Evaluating GPT Models on HellaSwag
+
+The CLI includes a script to evaluate GPT models on the [HellaSwag](https://rowanzellers.com/hellaswag/) dataset, a common benchmark for evaluating commonsense reasoning in language models.
+
+To run the evaluation: `npm -w cli run hellaswag_gpt`
+
+The script benchmarks the following models:
+- A TensorFlow.js implementation of GPT (`gpt-tfjs`)
+- A pre-trained ONNX model (`Xenova/gpt2`)
+
+Both models are evaluated using a shared tokenizer (`Xenova/gpt2`), and the script reports:
+- Accuracy (proportion of correct multiple-choice predictions)
+- Total evaluation time (in seconds)
+
+### Output
+
+Results are printed to the console and saved to a log file: `../datasets/LogFile_hellaswag.txt`
+
+
+This allows for a direct comparison between the inference performance and accuracy of the two architectures.
+
+The TFJS implementation is generally slower and more memory-intensive than ONNX, but offers compatibility with browser-based environments and custom training workflows. See the [Benchmarking GPT-TF.js](#benchmarking-gpt-tfjs) section for more details on performance tradeoffs.
diff --git a/cli/package.json b/cli/package.json
@@ -8,6 +8,7 @@
     "start": "npm run build && node dist/cli.js",
     "benchmark_gpt": "npm run build && node dist/benchmark_gpt.js",
     "train_gpt": "npm run build && node dist/train_gpt.js",
+    "hellaswag_gpt": "npm run build && node dist/hellaswag_gpt.js",
     "build": "tsc",
     "lint": "npx eslint .",
     "test": ": nothing"
diff --git a/cli/src/hellaswag_gpt.ts b/cli/src/hellaswag_gpt.ts
@@ -0,0 +1,54 @@
+import '@tensorflow/tfjs-node';
+import { loadHellaSwag } from '@epfml/discojs-node';
+import { models } from '@epfml/discojs';
+import { AutoTokenizer, PreTrainedTokenizer } from '@xenova/transformers';
+import fs from 'fs';
+import path from 'node:path';
+
+// Destination of the persisted evaluation log; being a relative path, it is resolved against the working directory.
+const logFile = path.join('..', 'datasets', 'LogFile_hellaswag.txt');
+// In-memory record of every logged message, in emission order.
+const logLines: string[] = [];
+
+/**
+ * Echoes `message` to the console and records it in `logLines`.
+ *
+ * @param message - Text to print and to keep for the log file
+ */
+function log(message: string): void {
+    logLines.push(message);
+    console.log(message);
+}
+
+// Load HellaSwag once at module start-up (-1 requests every example); both evaluations below share it.
+const hellaswagDataset = await loadHellaSwag(-1);
+
+/**
+ * Runs the HellaSwag evaluation on a newly constructed TFJS GPT model
+ * (fixed seed 42) and logs its accuracy and wall-clock duration.
+ *
+ * @param tokenizer - Tokenizer handed to `models.evaluate_hellaswag`
+ */
+async function evaluateTFJS(tokenizer: PreTrainedTokenizer): Promise<void> {
+    const gpt = new models.GPT({ seed: 42 });
+    log('Evaluating TFJS GPT on HellaSwag...');
+
+    const startedAt = Date.now();
+    // NOTE(review): the trailing `false` flag is opaque from here — confirm its meaning against evaluate_hellaswag
+    const score = await models.evaluate_hellaswag(gpt, tokenizer, hellaswagDataset, false);
+    const elapsedMs = Date.now() - startedAt;
+
+    // Both figures are reported with two decimals
+    const percent = (score * 100).toFixed(2);
+    const seconds = (elapsedMs / 1000).toFixed(2);
+
+    log(`TFJS GPT Accuracy: ${percent}%`);
+    log(`TFJS GPT Evaluation Time: ${seconds} seconds`);
+}
+
+/**
+ * Runs the HellaSwag evaluation on the pretrained `Xenova/gpt2` ONNX model
+ * and logs its accuracy and wall-clock duration.
+ *
+ * @param tokenizer - Tokenizer handed to `models.evaluate_hellaswag`
+ */
+async function evaluateXenova(tokenizer: PreTrainedTokenizer): Promise<void> {
+    const onnxGpt2 = await models.ONNXModel.init_pretrained('Xenova/gpt2');
+    log('Evaluating Xenova GPT-2 (ONNX) on HellaSwag...');
+
+    // The timer starts only after the model is initialised, so loading is not counted
+    const t0 = Date.now();
+    const score = await models.evaluate_hellaswag(onnxGpt2, tokenizer, hellaswagDataset, false);
+    const t1 = Date.now();
+
+    const accuracyPercent = (score * 100).toFixed(2);
+    const elapsedSeconds = ((t1 - t0) / 1000).toFixed(2);
+
+    log(`Xenova GPT-2 Accuracy: ${accuracyPercent}%`);
+    log(`Xenova GPT-2 Evaluation Time: ${elapsedSeconds} seconds`);
+}
+
+/**
+ * Entry point: evaluates the TFJS GPT and the Xenova GPT-2 (ONNX) models on
+ * HellaSwag with a shared `Xenova/gpt2` tokenizer, then writes every logged
+ * line to `logFile`.
+ *
+ * The log file is written in a `finally` block so that results gathered
+ * before a failure (e.g. the TFJS run when the ONNX run throws) are not lost.
+ *
+ * @throws Whatever the tokenizer loading, the evaluations or the file writes throw
+ */
+async function main(): Promise<void> {
+    fs.writeFileSync(logFile, '', 'utf-8'); // Clear old log file
+
+    try {
+        const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2');
+        await evaluateTFJS(tokenizer);
+        log('\n---\n');
+        await evaluateXenova(tokenizer);
+    } finally {
+        // Persist whatever was logged so far, even if one of the steps above threw
+        fs.writeFileSync(logFile, logLines.join('\n'), 'utf-8');
+        console.log(`\nResults written to ${logFile}`);
+    }
+}
+
+// Report the failure and make the process exit with a non-zero status,
+// so that scripts and CI invoking this command can detect it.
+main().catch((error: unknown) => {
+    console.error(error);
+    process.exitCode = 1;
+});
diff --git a/discojs-node/src/hellaswag.spec.ts b/discojs-node/src/hellaswag.spec.ts
@@ -0,0 +1,18 @@
+import { expect } from 'chai';
+import { load as loadHellaSwag } from './hellaswag.js';
+
+describe('HellaSwag parser', () => {
+  // The loader fetches the dataset from its remote URL, so this test needs network access.
+  it('should load at most `limit` examples and return them as an array', async () => {
+    const limit = 10;
+    const dataset = await loadHellaSwag(limit);
+
+    expect(dataset).to.be.an('array');
+    expect(dataset.length).to.be.greaterThan(0);
+    // The loader must honour the requested cap
+    expect(dataset.length).to.be.at.most(limit);
+
+    // Check the structure of the first example
+    const example = dataset[0];
+    expect(example).to.have.property('ctx').that.is.a('string');
+    expect(example).to.have.property('endings').that.is.an('array').with.lengthOf(4);
+    expect(example).to.have.property('label').that.is.a('number');
+  });
+});
+
diff --git a/discojs-node/src/hellaswag.ts b/discojs-node/src/hellaswag.ts
@@ -0,0 +1,36 @@
+import { models } from '@epfml/discojs';
+import fetch from 'node-fetch';
+
+/**
+ * Fetches the HellaSwag dataset from `models.HELLASWAG_URL` (Node.js) and
+ * parses it one JSON document per non-blank line.
+ *
+ * @param limit - Maximum number of examples to return (-1 means all)
+ * @returns A HellaSwagDataset holding the parsed examples, in file order
+ * @throws Error when the HTTP response is not OK; rethrows the JSON parse
+ *   error when a line cannot be parsed
+ */
+export async function load(limit = -1): Promise<models.HellaSwagDataset> {
+  const response = await fetch(models.HELLASWAG_URL);
+  if (!response.ok) {
+    throw new Error(`Failed to fetch dataset from ${models.HELLASWAG_URL}: ${response.statusText}`);
+  }
+
+  const unlimited = limit === -1;
+  const examples: models.HellaSwagDataset = [];
+
+  for (const rawLine of (await response.text()).split('\n')) {
+    const record = rawLine.trim();
+    // Blank lines carry no example and do not count towards the limit
+    if (record.length === 0) continue;
+    if (!unlimited && examples.length >= limit) break;
+
+    try {
+      examples.push(JSON.parse(record) as models.HellaSwagExample);
+    } catch (e) {
+      // Show the offending line before propagating the parse error
+      console.error(`Failed to parse line:`, rawLine);
+      throw e;
+    }
+  }
+
+  return examples;
+}
diff --git a/discojs-node/src/index.ts b/discojs-node/src/index.ts
@@ -1,2 +1,3 @@
 export * from './loaders/index.js'
 export { saveModelToDisk, loadModelFromDisk } from './model_loader.js'
+export { load as loadHellaSwag } from './hellaswag.js'
diff --git a/discojs-web/src/hellaswag.spec.ts b/discojs-web/src/hellaswag.spec.ts
@@ -0,0 +1,19 @@
+import { describe, it, expect } from "vitest";
+import { load as loadHellaSwag } from './hellaswag.js';
+import { models } from '@epfml/discojs';
+
+describe('hellaswag parser', () => {
+  // The loader fetches the dataset from its remote URL, so this test needs network access.
+  it('loads the requested number of hellaswag examples', async () => {
+    const dataset: models.HellaSwagDataset = await loadHellaSwag(2);
+
+    // basic assertions
+    expect(dataset).to.be.an('array');
+    expect(dataset.length).to.equal(2);
+
+    // check structure of the first example
+    const first = dataset[0];
+    expect(first).to.have.property('ctx').that.is.a('string');
+    expect(first).to.have.property('endings').that.is.an('array').with.lengthOf(4);
+    expect(first).to.have.property('label').that.is.a('number');
+  });
+});
diff --git a/discojs-web/src/hellaswag.ts b/discojs-web/src/hellaswag.ts
@@ -0,0 +1,35 @@
+import { models } from '@epfml/discojs';
+
+/**
+ * Fetches the HellaSwag dataset from `models.HELLASWAG_URL` (browser) and
+ * parses it one JSON document per non-blank line.
+ *
+ * @param limit - Maximum number of examples to return (-1 means all)
+ * @returns A HellaSwagDataset holding the parsed examples, in file order
+ * @throws Error when the HTTP response is not OK; rethrows the JSON parse
+ *   error when a line cannot be parsed
+ */
+export async function load(limit = -1): Promise<models.HellaSwagDataset> {
+  const response = await fetch(models.HELLASWAG_URL);
+  if (!response.ok) {
+    throw new Error(`Failed to fetch dataset from ${models.HELLASWAG_URL}: ${response.statusText}`);
+  }
+
+  const body = await response.text();
+  const capped = limit !== -1;
+  const parsed: models.HellaSwagDataset = [];
+
+  for (const entry of body.split('\n')) {
+    const json = entry.trim();
+    // Blank lines carry no example and do not count towards the limit
+    if (json.length === 0) continue;
+    if (capped && parsed.length >= limit) break;
+
+    try {
+      parsed.push(JSON.parse(json) as models.HellaSwagExample);
+    } catch (e) {
+      // Show the offending line before propagating the parse error
+      console.error(`Failed to parse line:`, entry);
+      throw e;
+    }
+  }
+
+  return parsed;
+}
diff --git a/discojs-web/src/index.ts b/discojs-web/src/index.ts
@@ -1 +1,2 @@
 export * from "./loaders/index.js";
+export { load as loadHellaSwag } from "./hellaswag.js";
diff --git a/discojs/src/models/hellaswag.spec.ts b/discojs/src/models/hellaswag.spec.ts
@@ -0,0 +1,63 @@
+import { expect } from 'chai';
+import { evaluate } from './hellaswag.js';
+import { PreTrainedTokenizer } from '@xenova/transformers';
+import { GPT } from './index.js';
+import { ONNXModel } from './onnx.js';
+import type { HellaSwagExample } from './hellaswag.js';
+
+// Small two-example fixture in the HellaSwag shape (context, four candidate
+// endings, numeric label), shared by the evaluator tests in this file.
+// NOTE(review): `label` is presumably the index of the correct ending — confirm against the evaluator.
+export const exampleDataset: HellaSwagExample[] = [
+  {
+    ctx: "A man is sitting on a roof. he",
+    endings: [
+      "is using wrap to wrap a pair of skis.",
+      "is ripping level tiles off.",
+      "is holding a rubik's cube.",
+      "starts pulling up roofing on a roof."
+    ],
+    label: 3
+  },
+  {
+    ctx: "A lady walks to a barbell. She bends down and grabs the pole. the lady",
+    endings: [
+      "swings and lands in her arms.",
+      "pulls the barbell forward.",
+      "pulls a rope attached to the barbell.",
+      "stands and lifts the weight over her head."
+    ],
+    label: 3
+  }
+];
+
+describe('HellaSwag Evaluator', () => {
+  // Sanity check: the reported accuracy must lie in [0, 1]
+  it('evaluates tfjs GPT model', async () => {
+    const gpt2Tokenizer = await PreTrainedTokenizer.from_pretrained('Xenova/gpt2');
+    // seed for reproducibility
+    const seededModel = new GPT({ seed: 42 });
+
+    const score = await evaluate(seededModel, gpt2Tokenizer, exampleDataset, true);
+    expect(score).to.be.at.least(0);
+    expect(score).to.be.at.most(1);
+  }).timeout(6000);
+});
+
+describe('HellaSwag Evaluator with Xenova GPT-2', () => {
+  // Sanity check on the pretrained ONNX model: the reported accuracy must lie in [0, 1]
+  it('evaluates the pretrained GPT-2 model', async () => {
+    const gpt2Tokenizer = await PreTrainedTokenizer.from_pretrained('Xenova/gpt2');
+    const pretrained = await ONNXModel.init_pretrained('Xenova/gpt2');
+
+    const score = await evaluate(pretrained, gpt2Tokenizer, exampleDataset, true);
+    expect(score).to.be.at.least(0);
+    expect(score).to.be.at.most(1);
+  }).timeout(10000);
+});
+
+describe('Deterministic evaluation with tfjs GPT-2', () => {
+  // Evaluating the same seeded model instance twice must yield the same accuracy
+  it('returns the same accuracy across runs', async () => {
+    const gpt2Tokenizer = await PreTrainedTokenizer.from_pretrained('Xenova/gpt2');
+    const seededModel = new GPT({ seed: 42 });
+
+    // The two runs are awaited one after the other, on the same model
+    const firstRun = await evaluate(seededModel, gpt2Tokenizer, exampleDataset, false);
+    const secondRun = await evaluate(seededModel, gpt2Tokenizer, exampleDataset, false);
+
+    expect(firstRun).to.equal(secondRun);
+  }).timeout(10000);
+});
diff --git a/discojs/src/models/hellaswag.ts b/discojs/src/models/hellaswag.ts
diff --git a/discojs/src/models/index.ts b/discojs/src/models/index.ts
diff --git a/discojs/src/models/onnx.spec.ts b/discojs/src/models/onnx.spec.ts
diff --git a/discojs/src/models/onnx.ts b/discojs/src/models/onnx.ts

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`export * from './loaders/index.js'`
`2`	`2`	`export { saveModelToDisk, loadModelFromDisk } from './model_loader.js'`
	`3`	`+export { load as loadHellaSwag } from './hellaswag.js'`
Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`	`1`	`export * from "./loaders/index.js";`
	`2`	`+export { load as loadHellaSwag } from "./hellaswag.js";`