|
| 1 | +import { expect } from 'chai'; |
| 2 | +import { evaluate } from './hellaswag.js'; |
| 3 | +import { PreTrainedTokenizer } from '@xenova/transformers'; |
| 4 | +import { GPT } from './index.js'; |
| 5 | +import { ONNXModel } from './onnx.js'; |
| 6 | +import type { HellaSwagExample } from './hellaswag.js'; |
| 7 | + |
/**
 * Small hand-written HellaSwag fixture used by the evaluator tests in this file.
 *
 * Each entry carries a context prefix (`ctx`), four candidate `endings`, and a
 * numeric `label`.
 *
 * NOTE(review): `label` is presumably the zero-based index of the correct
 * ending within `endings` — confirm against the `HellaSwagExample` declaration.
 */
export const exampleDataset: HellaSwagExample[] = [
  {
    ctx: "A man is sitting on a roof. he",
    endings: [
      "is using wrap to wrap a pair of skis.",
      "is ripping level tiles off.",
      "is holding a rubik's cube.",
      "starts pulling up roofing on a roof.",
    ],
    label: 3,
  },
  {
    ctx: "A lady walks to a barbell. She bends down and grabs the pole. the lady",
    endings: [
      "swings and lands in her arms.",
      "pulls the barbell forward.",
      "pulls a rope attached to the barbell.",
      "stands and lifts the weight over her head.",
    ],
    label: 3,
  },
];
| 30 | + |
/**
 * Smoke test for `evaluate` with the tfjs `GPT` model.
 *
 * The model is built with a fixed seed, so the test asserts only that the
 * returned accuracy lies in the closed interval [0, 1], not a specific score.
 */
describe('HellaSwag Evaluator', () => {
  it('evaluates tfjs GPT model', async () => {
    const tokenizer = await PreTrainedTokenizer.from_pretrained('Xenova/gpt2');
    const gpt = new GPT({ seed: 42 }); // fixed seed for reproducibility

    // NOTE(review): the meaning of the trailing `true` flag is defined by
    // `evaluate` in ./hellaswag.js — confirm there.
    const accuracy = await evaluate(gpt, tokenizer, exampleDataset, true);

    // `within` is inclusive on both ends, matching gte(0) and lte(1).
    expect(accuracy).to.be.within(0, 1);
  }).timeout(6000);
});
| 41 | + |
/**
 * Smoke test for `evaluate` with the ONNX model obtained from
 * `ONNXModel.init_pretrained('Xenova/gpt2')`.
 *
 * Only the range of the returned accuracy is asserted ([0, 1] inclusive);
 * no particular score is expected on the two-example fixture.
 */
describe('HellaSwag Evaluator with Xenova GPT-2', () => {
  it('evaluates the pretrained GPT-2 model', async () => {
    const tokenizer = await PreTrainedTokenizer.from_pretrained('Xenova/gpt2');
    const model = await ONNXModel.init_pretrained('Xenova/gpt2');

    // NOTE(review): the meaning of the trailing `true` flag is defined by
    // `evaluate` in ./hellaswag.js — confirm there.
    const accuracy = await evaluate(model, tokenizer, exampleDataset, true);

    // `within` is inclusive on both ends, matching gte(0) and lte(1).
    expect(accuracy).to.be.within(0, 1);
  }).timeout(10000);
});
| 52 | + |
/**
 * Determinism check: evaluating the same seeded tfjs `GPT` instance twice on
 * the same fixture must yield exactly the same accuracy.
 */
describe('Deterministic evaluation with tfjs GPT-2', () => {
  it('returns the same accuracy across runs', async () => {
    const tokenizer = await PreTrainedTokenizer.from_pretrained('Xenova/gpt2');
    const gpt = new GPT({ seed: 42 });

    // Both runs reuse the same model, tokenizer and dataset so that any
    // difference can only come from the evaluation itself.
    // NOTE(review): the meaning of the trailing `false` flag is defined by
    // `evaluate` in ./hellaswag.js — confirm there.
    const firstRun = await evaluate(gpt, tokenizer, exampleDataset, false);
    const secondRun = await evaluate(gpt, tokenizer, exampleDataset, false);

    expect(firstRun).to.equal(secondRun);
  }).timeout(10000);
});
0 commit comments