
Commit 7f32856

Merge pull request #899 from epfml/NAN-layers_tests-christinakopi
test GPT layers
2 parents 3a56bad + d491a97 commit 7f32856

3 files changed: +369 -117 lines

Lines changed: 244 additions & 0 deletions
@@ -0,0 +1,244 @@
import * as tf from '@tensorflow/tfjs';
import { expect } from 'chai';
import { GELU, LMEmbedding, Range, MLP, MLPConfig, CausalSelfAttention, CausalSelfAttentionConfig } from './layers.js';


describe('GPT Layers', function () {
  // GELU Layer tests
  describe('GELU Layer', function () {

    afterEach(() => {
      // Dispose of variables to avoid name collisions in subsequent tests.
      tf.disposeVariables();
    });

    it('should compute GELU activation correctly for known inputs', async function () {
      const geluLayer = new GELU();

      const input = tf.tensor1d([0, 1, -1, 2, -2]);

      const output = geluLayer.apply(input) as tf.Tensor;
      const outputData = await output.data();

      // expected values based on the GELU tanh approximation
      const expected: number[] = [0, 0.8412, -0.1588, 1.955, -0.045];
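      // Reference for these expected values: the tanh approximation assumed here is
      //   GELU(x) ≈ 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
      // e.g. GELU(1) ≈ 0.5 * (1 + tanh(0.7979 * 1.0447)) ≈ 0.8412, matching expected[1].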

      for (let i = 0; i < expected.length; i++) {
        expect(outputData[i]).to.be.closeTo(expected[i], 0.05);
      }
    });
  });

  // LMEmbedding Layer tests
  describe('LMEmbedding Layer', function () {

    it('should return token embeddings with shape [batch_size, sequence_length, nEmbd] for 2D input', function () {
      const vocabSize = 100;
      const nEmbd = 16;
      const seed = 42;

      const lmEmbedding = new LMEmbedding(vocabSize, nEmbd, seed);

      // dummy 2D input representing token indices: shape [batch_size, sequence_length]
      const tokenIndices = tf.randomUniformInt([2, 5], 0, 1);

      const output = lmEmbedding.apply(tokenIndices) as tf.Tensor;

      // expected output shape for 2D input: [2, 5, nEmbd]
      expect(output.shape).to.deep.equal([2, 5, nEmbd]);
    });

    it("should work for 2D & 3D inputs", () => {
      const vocabSize = 100;
      const nEmbd = 16;
      const seed = 42;

      const lmEmbedding = new LMEmbedding(vocabSize, nEmbd, seed);

      const tokenIndices = tf.randomUniformInt([2, 5], 0, 1);
      const embeddingsInput = tf.randomUniform([2, 5, nEmbd]);
      const outputForToken = lmEmbedding.apply(tokenIndices) as tf.Tensor;
      const outputForEmbedding = lmEmbedding.apply(embeddingsInput) as tf.Tensor;
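      // Presumably the layer embeds 2D integer token indices into nEmbd-dimensional vectors
      // and projects 3D float embeddings back to vocabulary logits (sharing the embedding
      // matrix), which is what the two expected shapes below reflect.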

      expect(outputForToken.shape).to.deep.equal([2, 5, nEmbd]);
      expect(outputForEmbedding.shape).to.deep.equal([2, 5, vocabSize]);
    });

    it('should throw appropriate errors for invalid input shapes', function () {
      const vocabSize = 100;
      const nEmbd = 16;
      const seed = 42;
      const lmEmbedding = new LMEmbedding(vocabSize, nEmbd, seed);

      // Case 1: 1D tensor input
      const invalidInput = tf.tensor1d([1, 2, 3], 'int32');
      expect(() => lmEmbedding.apply(invalidInput)).to.throw('unexpected input shape');

      // Case 2: array with more than one tensor
      const input1 = tf.tensor2d([[1, 2, 3]], [1, 3], 'int32');
      const input2 = tf.tensor2d([[4, 5, 6]], [1, 3], 'int32');
      expect(() => lmEmbedding.apply([input1, input2])).to.throw('expected exactly one tensor');
    });

    it('should compute correct output shape for 2D input using computeOutputShape', function () {
      const vocabSize = 100;
      const nEmbd = 16;
      const seed = 42;
      const lmEmbedding = new LMEmbedding(vocabSize, nEmbd, seed);
      const outputShape = lmEmbedding.computeOutputShape([null, null]);
      expect(outputShape).to.deep.equal([null, null, nEmbd]);
    });

  });

  // Range Layer tests
  describe('Range Layer', function () {

    afterEach(() => {
      // dispose any created tensors/variables
      tf.disposeVariables();
    });

    it('should output a tensor with shape [1, T] for an input of shape [batch, T]', async function () {
      const rangeLayer = new Range();

      // dummy input tensor with shape [batch, T]
      const dummyInput = tf.zeros([3, 10], 'int32');

      const output = rangeLayer.apply(dummyInput) as tf.Tensor;

      // We expect the output to have shape [1, T] i.e. [1, 10]
      expect(output.shape).to.deep.equal([1, 10]);

      // verify the content: the layer should output a range [0, 1, ..., T-1]
      expect(await output.data()).to.deep.equal(
        Int32Array.of(0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
      );
    });
  });

  // MLP Layer tests
  describe('MLP Layer', function () {

    it('should produce deterministic/non-NaN outputs with the same random seed', async function () {
      // an MLP config with a fixed seed
      const config: MLPConfig = {
        name: 'testMLP',
        contextLength: 10,
        residDrop: 0, // no dropout for deterministic behavior
        nLayer: 2,
        seed: 42,
        nEmbd: 16,
        nHead: 4
      };

      // two separate MLP model instances using the same config
      const model1 = MLP(config);
      const model2 = MLP(config);
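      // With the same seeded config, both models should be initialized with identical
      // weights, so they are expected to produce identical outputs for the same input.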

      const input = tf.ones([1, config.contextLength, config.nEmbd]);

      // get predictions from both models
      const output1 = model1.predict(input) as tf.Tensor;
      const output2 = model2.predict(input) as tf.Tensor;

      const arr1 = await output1.data();
      const arr2 = await output2.data();

      // check that the models produce the same output
      expect(arr1).to.deep.equal(arr2);

      // Check that there are no NaN values in the outputs.
      for (const v of arr1) {
        expect(v).to.not.be.NaN;
      }
      for (const v of arr2) {
        expect(v).to.not.be.NaN;
      }

    });
  });

  // CausalSelfAttention Layer tests
  describe('CausalSelfAttention Helper Methods', function () {

    const config: CausalSelfAttentionConfig = {
      name: 'testCSA',
      contextLength: 5,
      nHead: 2,
      nEmbd: 8, // divisible by nHead, so head size = 4
      dropout: 0.0, // no dropout for deterministic tests
      nLayer: 2,
      seed: 42
    };

    let csa: CausalSelfAttention;

    // new instance of CausalSelfAttention before each test
    beforeEach(() => {
      csa = new CausalSelfAttention(config);
      // dummy input has shape [batch, T, nEmbd] = [1, contextLength, nEmbd].
      const dummyInput = tf.zeros([1, config.contextLength, config.nEmbd], 'float32');
      csa.apply(dummyInput);
    });

    afterEach(() => {
      tf.disposeVariables();
    });

    describe('splitHeads', function () {
      it('should reshape and transpose the input correctly', function () {
        const B = 2;
        const T = 6;
        const totalChannels = config.nEmbd; // 8 channels
        // input tensor with shape [B, T, totalChannels]
        const input = tf.ones([B, T, totalChannels]);
        const output = csa.splitHeads(input, B, T, config.nHead);
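        // Presumably splitHeads reshapes [B, T, C] to [B, T, nHead, C / nHead] and then
        // transposes to [B, nHead, T, C / nHead], which is the shape asserted below.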
        // expected shape: [B, nHead, T, totalChannels/nHead] = [2, 2, 6, 4]
        expect(output.shape).to.deep.equal([B, config.nHead, T, totalChannels / config.nHead]);
      });
    });

    describe('applyCausalMask', function () {
      it('should produce a causal mask that sets upper-triangular positions to -1e9', async function () {
        const T = config.contextLength;
        // dummy attention logits tensor with shape [1, 1, T, T] filled with zeros
        const att = tf.zeros([1, 1, T, T], 'float32');
        const masked = csa.applyCausalMask(att, T);
        const data = await masked.data();
        // for each position (i,j): if j > i expect -1e9 else 0
        const expected = [
          [0, 1, 1, 1, 1],
          [0, 0, 1, 1, 1],
          [0, 0, 0, 1, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 0],
        ]
          .flat()
          .map((v) => (v === 0 ? 0 : -1e9));
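        // Adding -1e9 above the diagonal drives those entries to ~0 after the softmax,
        // so position i cannot attend to any future position j > i.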

        expect(Array.from(data)).to.deep.equal(expected);
      });
    });

    describe('computeAttention', function () {
      it('should output attention weights that sum to 1 over the last dimension', async function () {
        const B = 1;
        const nHead = config.nHead;
        const T = config.contextLength;
        const headSize = config.nEmbd / config.nHead;
        const q = tf.randomUniform([B, nHead, T, headSize]);
        const k = tf.randomUniform([B, nHead, T, headSize]);
        const att = csa.computeAttention(q, k, false, T);
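        // computeAttention presumably applies a softmax over the last axis of the scaled
        // dot-product q · kᵀ / sqrt(headSize), so each row should sum to 1.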
        // expected shape: [B, nHead, T, T]
        expect(att.shape).to.deep.equal([B, nHead, T, T]);
        // check that each row of the attention weights (last dimension) sums to approximately 1
        for (const rowSum of await att.sum(-1).data()) {
          expect(rowSum).to.be.closeTo(1, 1e-3);
        }
      });
    });
  });

});

0 commit comments
