Skip to content

Commit 7d7e20a

Browse files
committed
onnx-converter: new npm workspace to convert GPT2 from ONNX to TFJS
1 parent 55538d7 commit 7d7e20a

File tree

17 files changed

+10410
-43
lines changed

17 files changed

+10410
-43
lines changed

cli/src/hellaswag_gpt.ts

Lines changed: 79 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,104 @@
1+
// import fs from 'fs';
2+
import fsPromise from 'node:fs/promises';
3+
4+
import { dirname } from 'path';
5+
import { fileURLToPath } from 'url';
6+
import { parse } from 'ts-command-line-args'
7+
18
import '@tensorflow/tfjs-node';
29
import fs from 'node:fs';
310
import path from 'node:path';
4-
import { Tokenizer, models } from '@epfml/discojs';
11+
import { models, serialization, Tokenizer } from '@epfml/discojs';
512
import { loadHellaSwag } from '@epfml/discojs-node';
13+
// import { AutoTokenizer } from '@xenova/transformers';
614

7-
const logFile = path.join('..', 'datasets', 'LogFile_hellaswag.txt');
8-
const logLines: string[] = [];
15+
const __dirname = dirname(fileURLToPath(import.meta.url));
916

17+
// Buffers every logged line so the whole run can be dumped to a file at the end.
const logLines: string[] = [];

/** Echoes `message` to stdout and records it for the final log-file dump. */
function log(message: string) {
  logLines.push(message);
  console.log(message);
}
1422

15-
const hellaswagDataset: models.HellaSwagDataset = await loadHellaSwag(-1)
16-
17-
async function evaluateTFJS(tokenizer: Tokenizer) {
18-
const model = new models.GPT({ seed: 42 });
19-
log('Evaluating TFJS GPT on HellaSwag...');
23+
async function evaluateModel(model: models.GPT | models.ONNXModel, numDataPoints = -1) {
24+
const hellaswagDataset: models.HellaSwagDataset = await loadHellaSwag(numDataPoints)
25+
const tokenizer = await Tokenizer.from_pretrained('Xenova/gpt2');
26+
log('Starting the HellaSwag benchmark...');
2027

2128
const start = Date.now();
22-
const accuracy = await models.evaluate_hellaswag(model, tokenizer, hellaswagDataset, false);
29+
const accuracy = await models.evaluate_hellaswag(model, tokenizer, hellaswagDataset, true);
2330
const duration = ((Date.now() - start) / 1000).toFixed(2);
2431

25-
log(`TFJS GPT Accuracy: ${(accuracy * 100).toFixed(2)}%`);
26-
log(`TFJS GPT Evaluation Time: ${duration} seconds`);
32+
log(`Final accuracy: ${(accuracy * 100).toFixed(2)}%`);
33+
log(`Evaluation Time: ${duration} seconds`);
2734
}
2835

29-
async function evaluateXenova(tokenizer: Tokenizer) {
30-
const model = await models.ONNXModel.init_pretrained('Xenova/gpt2');
31-
log('Evaluating Xenova GPT-2 (ONNX) on HellaSwag...');
36+
const ModelTypes = ['onnx', 'gpt-tfjs-random', 'gpt-tfjs-pretrained'] as const;
37+
type ModelType = typeof ModelTypes[number];
3238

33-
const start = Date.now();
34-
const accuracy = await models.evaluate_hellaswag(model, tokenizer, hellaswagDataset, false);
35-
const duration = ((Date.now() - start) / 1000).toFixed(2);
36-
37-
log(`Xenova GPT-2 Accuracy: ${(accuracy * 100).toFixed(2)}%`);
38-
log(`Xenova GPT-2 Evaluation Time: ${duration} seconds`);
39+
interface HellaSwagArgs {
40+
model: ModelType
41+
numDataPoints: number
42+
logFile: string
43+
pretrainedModelPath: string
44+
help?: boolean
3945
}
4046

4147
async function main(): Promise<void> {
42-
fs.writeFileSync(logFile, '', 'utf-8'); // Clear old log file
48+
const defaultPretrainedModelPath = path.join(__dirname, "..", "..", "onnx-converter", "assets", "model.json")
49+
const args = parse<HellaSwagArgs>({
50+
model: {
51+
type: (raw: string) => raw as ModelType,
52+
description: `Model type, one of ${ModelTypes}`,
53+
defaultValue: 'onnx'
54+
},
55+
numDataPoints: {
56+
type: Number,
57+
description: 'Number of HellaSwag datapoints to evaluate, set -1 for the whole benchmark',
58+
defaultValue: -1
59+
},
60+
logFile: {
61+
type: String,
62+
description: 'Relative path to the log file, default to ./hellaswag.log', defaultValue: 'hellaswag.log'
63+
},
64+
pretrainedModelPath: {
65+
type: String,
66+
description: 'If specifying gpt-tfjs-pretrained, provide the relative path to the TF.js pretrained model',
67+
defaultValue: defaultPretrainedModelPath
68+
},
69+
help: {
70+
type: Boolean,
71+
optional: true,
72+
alias: 'h',
73+
description: 'Prints this usage guide'
74+
}
75+
}, { helpArg: 'help' })
4376

44-
const tokenizer = await Tokenizer.from_pretrained('Xenova/gpt2');
45-
await evaluateTFJS(tokenizer);
46-
log('\n---\n');
47-
await evaluateXenova(tokenizer);
77+
const logFile = path.join(__dirname, args.logFile);
78+
fs.writeFileSync(logFile, '', 'utf-8'); // Clear the log file
79+
80+
let model: | models.GPT | models.ONNXModel | undefined;
81+
switch (args.model) {
82+
case 'onnx':
83+
log("Using ONNX pretrained model Xenova/gpt2")
84+
model = await models.ONNXModel.init_pretrained('Xenova/gpt2');
85+
break;
86+
case 'gpt-tfjs-random':
87+
log("Using GPT-TFJS with random initialization")
88+
model = new models.GPT({ seed: 42 });
89+
break;
90+
case 'gpt-tfjs-pretrained':
91+
log("Using GPT-TFJS with pretrained weights")
92+
if (args.pretrainedModelPath === undefined) {
93+
throw new Error("If choosing gpt-tfjs-pretrained, provide the relative path to the TF.js pretrained model `pretrainedModelPath")
94+
}
95+
const encodedModel = await fsPromise.readFile(args.pretrainedModelPath);
96+
model = await serialization.model.decode(encodedModel) as models.GPT;
97+
break;
98+
default:
99+
throw new Error(`Unrecognized model type: ${model}`);
100+
}
101+
await evaluateModel(model, args.numDataPoints);
48102

49103
fs.writeFileSync(logFile, logLines.join('\n'), 'utf-8');
50104
console.log(`\nResults written to ${logFile}`);

datasets/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,6 @@
2020

2121
# GDHF demo
2222
/tinder_dog/
23+
24+
# HellaSwag benchmark
25+
hellaswag*

discojs/src/models/gpt/layers.spec.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,8 +174,9 @@ describe('GPT Layers', () => {
174174
name: 'testCSA',
175175
contextLength: 5,
176176
nHead: 2,
177-
nEmbd: 8, // divisible by nHead, so head size = 4
178-
dropout: 0.0, // no dropout for deterministic tests
177+
nEmbd: 8, // divisible by nHead, so head size = 4
178+
attnDrop: 0.0, // no dropout for deterministic tests
179+
residDrop: 0.0,
179180
nLayer: 2,
180181
seed: 42
181182
};

discojs/src/models/hellaswag.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ type ModelType = GPT | ONNXModel;
126126
export async function evaluate(
127127
model: ModelType,
128128
tokenizer: Tokenizer,
129-
dataset: HellaSwagExample[],
129+
dataset: HellaSwagDataset,
130130
print = true
131131
): Promise<number> {
132132
let correct = 0;

eslint.config.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ export default defineConfigWithVueTs(
6565
},
6666
{ ignores: ["**/dist/*"] },
6767
{ ignores: ["docs/examples/**"] },
68+
{ ignores: ["**/src/protobuf/"] },
69+
6870
// don't use linter for formatting
6971
skipFormatting,
7072
);

onnx-converter/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
node_modules
2+
assets
3+
dist

onnx-converter/README.md

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
## Usage
2+
3+
This workspace is currently used to convert the ONNX [GPT-2 model](https://huggingface.co/Xenova/gpt2) to Tensorflow.js. On the one hand, ONNX allows converting pretrained models from PyTorch or Tensorflow to the ONNX format, therefore there currently exist many pretrained models in ONNX format. However, ONNX libraries currently only support inference. On the other hand, Tensorflow.js doesn't have a converter that can handle recent Transformers models (despite having a [converter](https://github.com/tensorflow/tfjs/tree/master/tfjs-converter)), but TF.js allows further training of models.
4+
5+
Therefore, we want to convert pretrained models such as GPT-2 from ONNX format to Tensorflow.js to further fine-tune them. You generate a TF.js `model.json` by running `npm run convert_onnx` in this workspace.
6+
7+
What the script does is:
8+
1. Read the ONNX GPT-2 model from [Xenova's repository](https://huggingface.co/Xenova/gpt2)
9+
2. Use the ONNX protobuf definition to read the file and iterate through the model layers. The ONNX JavaScript protobuf comes from [this repository](https://github.com/microsoft/onnxruntime/blob/main/js/web/lib/onnxjs/).
10+
3. Convert all weights to TF.js tensors
11+
4. Init a TF.js model with the loaded weights and export the model
12+
13+
Running `npm run convert_onnx` creates a GPT-tfjs `model.json` file in the `./assets/` folder.
14+
15+
## ONNX JS protobuf
16+
17+
The ONNX specification has limited support in JavaScript. We found an old JS implementation in the [ONNX Runtime Web repository](https://github.com/microsoft/onnxruntime/tree/main/js/web/lib/onnxjs/ort-schema/protobuf). We had to adapt their files as follows to be compatible with our newer environment:
18+
1. Copy `onnx.js` and `onnx.d.ts` from [the repository](https://github.com/microsoft/onnxruntime/tree/main/js/web/lib/onnxjs/ort-schema/protobuf) in `./onnx-converter/src/protobuf`
19+
2. Rename `onnx.js` to `onnx.cjs`
20+
3. Create `onnx-proto.js` as a wrapper around the protobuf definition:
21+
```js
22+
import { createRequire } from 'module';
23+
const require = createRequire(import.meta.url);
24+
const onnxModule = require('./onnx.cjs');
25+
26+
export const onnx = onnxModule.onnx;
27+
export default onnxModule;
28+
```
29+
4. Create `onnx-proto.d.ts` with the matching TypeScript definition:
30+
```ts
31+
export { onnx } from './onnx.js';
32+
declare const onnxModule: {
33+
onnx: typeof import('./onnx.js').onnx;
34+
};
35+
export default onnxModule;
36+
```

onnx-converter/package.json

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"name": "onnx-converter",
3+
"private": true,
4+
"type": "module",
5+
"main": "dist/gpt2_from_onnx.js",
6+
"scripts": {
7+
"convert_onnx": "npm run build && node dist/convert_onnx.js",
8+
"build": "tsc && cp -r src/protobuf dist",
9+
"lint": "npx eslint .",
10+
"test": ": nothing"
11+
},
12+
"author": "",
13+
"license": "ISC",
14+
"dependencies": {
15+
"@epfml/discojs-node": "*"
16+
},
17+
"devDependencies": {
18+
"nodemon": "3",
19+
"ts-command-line-args": "2"
20+
}
21+
}

onnx-converter/src/convert_onnx.ts

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
import { onnx } from './protobuf/onnx-proto.js';
2+
import { Map, Range } from 'immutable';
3+
import { fileURLToPath } from 'url';
4+
import { dirname } from 'path';
5+
import path from 'node:path';
6+
import fsPromise from 'node:fs/promises';
7+
import * as tf from '@tensorflow/tfjs-node';
8+
9+
import { models, serialization } from "@epfml/discojs";
10+
11+
const __dirname = dirname(fileURLToPath(import.meta.url));
12+
const ASSET_FOLDER = path.join(__dirname, "..", "assets");
13+
const OUTPUT_FILENAME = path.join(ASSET_FOLDER, "model.json");
14+
15+
const GPT2_N_LAYER = 12;
16+
17+
18+
async function main() {
19+
const ONNX_URL = "https://huggingface.co/Xenova/gpt2/resolve/main/onnx/decoder_model.onnx?download=true"
20+
console.log(`Downloading ONNX model from ${ONNX_URL}...`);
21+
const response = await fetch(ONNX_URL);
22+
if (!response.ok)
23+
throw new Error(`Failed to fetch ONNX model from ${ONNX_URL}: ${response.statusText}`);
24+
const arrayBuffer = await response.arrayBuffer();
25+
const data = new Uint8Array(arrayBuffer);
26+
27+
console.log(`Download complete (${(data.length / 1024 / 1024).toFixed(2)} MB).`);
28+
console.log(`Decoding protobuf...`);
29+
30+
const onnxModel = onnx.ModelProto.decode(data)
31+
32+
if (!onnxModel.graph || !onnxModel.graph.initializer)
33+
throw new Error("No graph or tensors found in the ONNX model.");
34+
console.log('ONNX model loaded successfully');
35+
36+
37+
// Init empty TF.js model
38+
// Context length value from https://huggingface.co/Xenova/gpt2/blob/main/config.json
39+
const gptModel = new models.GPT({ modelType: 'gpt2', contextLength: 1024 });
40+
if (gptModel.config.nLayer != GPT2_N_LAYER)
41+
throw new Error(`ONNX conversion only supports GPT-2 with 12 layers, instead found ${gptModel.config.nLayer}.`);
42+
const gptLayersModel = gptModel.extract();
43+
44+
console.log("Converting ONNX tensors to TF.js tensors")
45+
// Layer name mapping between ONNX and TF.js
46+
const onnxTfjsMapping = createWeightNameMap();
47+
// Create a mapping between layer name and TF.js weight tensors
48+
let preTrainedWeights = Map<string, tf.Tensor>(); // layer name to weight tensor
49+
for (const tensor of onnxModel.graph.initializer) {
50+
if (tensor.name === undefined || tensor.name === null)
51+
throw new Error("Undefined layer named")
52+
53+
const tfjsName = onnxTfjsMapping.get(tensor.name);
54+
if (tfjsName === undefined)
55+
throw new Error(`Missing ONNX weight in layer mapping: ${tensor.name}`);
56+
if (preTrainedWeights.get(tfjsName))
57+
throw new Error(`Duplicate weight name found: ${tfjsName}`);
58+
59+
if (tensor.dims === undefined || tensor.dims === null)
60+
throw new Error(`Undefined layer dimensions for ${tensor.name}`)
61+
const dims = tensor.dims.map((d) => Number(d));
62+
const flatData = parseTensorData(tensor);
63+
const tfTensor = tf.tensor(flatData).reshape(dims)
64+
preTrainedWeights = preTrainedWeights.set(tfjsName, tfTensor);
65+
}
66+
67+
console.log("Initializing a new TFJS GPT-2 model...")
68+
if (preTrainedWeights.size !== onnxTfjsMapping.size)
69+
throw new Error(`Expected to load ${onnxTfjsMapping.size} weights, but loaded ${preTrainedWeights.size}.`);
70+
71+
// Overwrite the GPT-TF.js model weights with the ONNX weights
72+
if (gptLayersModel.weights.length !== onnxTfjsMapping.size)
73+
throw new Error(`Mismatch between TFJS and ONNX weight mapping weights.`);
74+
75+
const finalWeights = gptLayersModel.weights.map((weight, _i) => {
76+
const newTensor = preTrainedWeights.get(weight.name);
77+
if (newTensor === undefined)
78+
throw new Error(`Missing ${weight.name} in the ONNX weight`);
79+
return newTensor;
80+
});
81+
82+
gptLayersModel.setWeights(finalWeights); // shape or transpose mismatch will throw here
83+
84+
const encoded = await serialization.model.encode(gptModel)
85+
await fsPromise.mkdir(ASSET_FOLDER, { recursive: true})
86+
await fsPromise.writeFile(OUTPUT_FILENAME, encoded)
87+
console.log(`GPT-TFJS model saved to ${OUTPUT_FILENAME}`)
88+
}
89+
90+
/**
91+
*
92+
* @param tensor
93+
* @returns
94+
*/
95+
function parseTensorData(tensor: onnx.ITensorProto): Float32Array {
96+
// Check for raw data (common in larger models)
97+
if (tensor.rawData && tensor.rawData.length > 0) {
98+
const buffer = tensor.rawData.buffer.slice(
99+
tensor.rawData.byteOffset,
100+
tensor.rawData.byteOffset + tensor.rawData.byteLength
101+
);
102+
if (tensor.dataType != onnx.TensorProto.DataType.FLOAT) {
103+
throw new Error("found protobuf data type different from expected float 32.")
104+
}
105+
return new Float32Array(buffer);
106+
}
107+
// Fallback to specific field arrays if rawData is empty
108+
console.log("WARNING: protobuf raw data is empty, falling back on specific data fields.")
109+
if (tensor.floatData && tensor.floatData.length > 0) return new Float32Array(tensor.floatData);
110+
111+
throw new Error("protobuf raw data and float data are empty.")
112+
}
113+
114+
/**
115+
* Maps ONNX weight names to TFJS weight names.
116+
* This mapping is specific to GPT-2 137M with 12 layers.
117+
* @param prefix the TFJS model name specified in its GPTConfig, default is 'transformer'
118+
*/
119+
function createWeightNameMap(): Map<string, string> {
120+
let map = Map<string, string>();
121+
122+
map = map.set(`transformer.wte.weight`, `transformer/wte/embedding`);
123+
map = map.set(`transformer.wpe.weight`, `transformer/wpe/embeddings`);
124+
125+
Range(0, GPT2_N_LAYER).forEach(i => {
126+
const onnxPrefix = `transformer.h.${i}`;
127+
const tfjsPrefix = `transformer/h${i}`;
128+
map = map.set(`${onnxPrefix}.ln_1.weight`, `${tfjsPrefix}/ln_1/gamma`);
129+
map = map.set(`${onnxPrefix}.ln_1.bias`, `${tfjsPrefix}/ln_1/beta`);
130+
map = map.set(`${onnxPrefix}.attn.c_attn.weight`, `${tfjsPrefix}/attn/c_attn/kernel`);
131+
map = map.set(`${onnxPrefix}.attn.c_attn.bias`, `${tfjsPrefix}/attn/c_attn/bias`);
132+
map = map.set(`${onnxPrefix}.attn.c_proj.weight`, `${tfjsPrefix}/attn/c_proj/kernel`);
133+
map = map.set(`${onnxPrefix}.attn.c_proj.bias`, `${tfjsPrefix}/attn/c_proj/bias`);
134+
map = map.set(`${onnxPrefix}.ln_2.weight`, `${tfjsPrefix}/ln_2/gamma`);
135+
map = map.set(`${onnxPrefix}.ln_2.bias`, `${tfjsPrefix}/ln_2/beta`);
136+
map = map.set(`${onnxPrefix}.mlp.c_fc.weight`, `${tfjsPrefix}/mlp/c_fc/kernel`);
137+
map = map.set(`${onnxPrefix}.mlp.c_fc.bias`, `${tfjsPrefix}/mlp/c_fc/bias`);
138+
map = map.set(`${onnxPrefix}.mlp.c_proj.weight`, `${tfjsPrefix}/mlp/c_proj/kernel`);
139+
map = map.set(`${onnxPrefix}.mlp.c_proj.bias`, `${tfjsPrefix}/mlp/c_proj/bias`);
140+
});
141+
142+
map = map.set(`transformer.ln_f.weight`, `transformer/ln_f/gamma`);
143+
map = map.set(`transformer.ln_f.bias`, `transformer/ln_f/beta`);
144+
return map;
145+
}
146+
147+
148+
// Entry point: log any conversion failure instead of an unhandled rejection.
await main().catch(console.error);

0 commit comments

Comments
 (0)