@@ -51,17 +51,14 @@ protected TaskGraph configureLayerDataTransfers(TaskGraph unifiedLayer, int laye
51
51
context , state .wrapXb , state .wrapXb2 , //
52
52
state .wrapQ , state .wrapK , state .wrapV , //
53
53
state .wrapKeyCache , state .wrapValueCache , //
54
- state .wrapAtt , state .wrapHb );//,
55
- // dbg buffers
56
- //state.dbgQ, state.dbgKeyCache, state.dbgValueCache, state.dbgXb, state.dbgX); //
54
+ state .wrapAtt , state .wrapHb );//
57
55
} else {
58
56
// Subsequent layers: Consume data already on device from previous layer
59
57
unifiedLayer .consumeFromDevice (context , state .wrapXb , state .wrapXb2 , //
60
58
state .wrapQ , state .wrapK , state .wrapV , //
61
59
state .wrapKeyCache , state .wrapValueCache , //
62
60
state .wrapAtt , state .wrapHb , //
63
61
state .positionHolder //
64
- //state.dbgQ, state.dbgKeyCache, state.dbgValueCache, state.dbgXb, state.dbgX
65
62
);
66
63
}
67
64
return unifiedLayer ;
@@ -76,10 +73,6 @@ public Tuple2<List<ImmutableTaskGraph>, GridScheduler> setupTornadoForwardPlanLa
76
73
state .tempLogits .init (0.0f );
77
74
state .wrapLogits .init (0.0f );
78
75
79
- // state.dbgQ.init(0.0f);
80
- // state.dbgKeyCache.init(0.0f);
81
- // state.dbgValueCache.init(0.0f);
82
-
83
76
// @formatter:off
84
77
TaskGraph activationUpdate = new TaskGraph ("activationUpdate" )
85
78
.transferToDevice (DataTransferMode .EVERY_EXECUTION , state .wrapX )
@@ -108,12 +101,6 @@ public Tuple2<List<ImmutableTaskGraph>, GridScheduler> setupTornadoForwardPlanLa
108
101
weights .w3Layered [layerIndex ]
109
102
);
110
103
unifiedLayer = configureLayerDataTransfers (unifiedLayer , layerIndex );
111
- // unifiedLayer.task("dbg_copy_out_x",
112
- // Qwen3Kernels::dbgCopy,
113
- // state.wrapX,
114
- // state.dbgX,
115
- // state.positionHolder,
116
- // layerIndex);
117
104
unifiedLayer .task ("reductionsOneBlock" ,
118
105
TransformerComputeKernelsLayered ::reductionOneBlockWithLayer ,
119
106
context ,
@@ -170,13 +157,6 @@ public Tuple2<List<ImmutableTaskGraph>, GridScheduler> setupTornadoForwardPlanLa
170
157
kvDim0 ,
171
158
LOCAL_WORK_GROUP_SIZE_ALLOC );
172
159
173
- // unifiedLayer.task("dbg_copy_out_wrapQ",
174
- // Qwen3Kernels::dbgCopy,
175
- // state.wrapQ,
176
- // state.dbgQ,
177
- // state.positionHolder,
178
- // layerIndex);
179
-
180
160
// Debug copy-out (disabled): uncomment the transferToHost calls below to inspect wrapQ/wrapK on the host.
181
161
// unifiedLayer.transferToHost(DataTransferMode.EVERY_EXECUTION, state.wrapQ);
182
162
// unifiedLayer.transferToHost(DataTransferMode.EVERY_EXECUTION, state.wrapK);
@@ -205,13 +185,6 @@ public Tuple2<List<ImmutableTaskGraph>, GridScheduler> setupTornadoForwardPlanLa
205
185
weights .rms_att_QNormLayered [layerIndex ],
206
186
nEmbdHead ,
207
187
state .tempQcur );
208
-
209
- // unifiedLayer.task("dbg_copy_out_wrapQ",
210
- // Qwen3Kernels::dbgCopy,
211
- // state.wrapQ,
212
- // state.dbgQ,
213
- // state.positionHolder,
214
- // layerIndex);
215
188
// unifiedLayer.transferToHost(DataTransferMode.EVERY_EXECUTION, state.wrapQ);
216
189
// unifiedLayer.transferToHost(DataTransferMode.EVERY_EXECUTION, state.wrapK);
217
190
//
@@ -253,13 +226,6 @@ public Tuple2<List<ImmutableTaskGraph>, GridScheduler> setupTornadoForwardPlanLa
253
226
config .numberOfKeyValueHeads (),
254
227
nEmbdHead );
255
228
256
- // unifiedLayer.task("dbg_copy_out_wrapQ",
257
- // Qwen3Kernels::dbgCopy,
258
- // state.wrapQ,
259
- // state.dbgQ,
260
- // state.positionHolder,
261
- // layerIndex);
262
-
263
229
// Debug copy-out (disabled): uncomment the transferToHost calls below to inspect wrapQ/wrapK on the host.
264
230
//unifiedLayer.transferToHost(DataTransferMode.EVERY_EXECUTION, state.wrapQ);
265
231
//unifiedLayer.transferToHost(DataTransferMode.EVERY_EXECUTION, state.wrapK);
@@ -275,27 +241,6 @@ public Tuple2<List<ImmutableTaskGraph>, GridScheduler> setupTornadoForwardPlanLa
275
241
layerIndex ,
276
242
config .contextLength ());
277
243
278
- // unifiedLayer.task("dbg_copy_out_q",
279
- // Qwen3Kernels::dbgCopy,
280
- // state.wrapQ,
281
- // state.dbgQ,
282
- // state.positionHolder,
283
- // layerIndex);
284
- //
285
- // unifiedLayer.task("dbg_copy_out_keyCache",
286
- // Qwen3Kernels::dbgCopy,
287
- // state.wrapKeyCache,
288
- // state.dbgKeyCache,
289
- // state.positionHolder,
290
- // layerIndex);
291
- //
292
- // unifiedLayer.task("dbg_copy_out_ValueCache",
293
- // Qwen3Kernels::dbgCopy,
294
- // state.wrapValueCache,
295
- // state.dbgValueCache,
296
- // state.positionHolder,
297
- // layerIndex);
298
-
299
244
// global size = numberOfHeads * 8 = 16 * 8 = 128
300
245
unifiedLayer .task ("parallel-attention" ,
301
246
TransformerComputeKernelsLayered ::processHeadsFlashAttentionOpt ,
@@ -312,20 +257,6 @@ public Tuple2<List<ImmutableTaskGraph>, GridScheduler> setupTornadoForwardPlanLa
312
257
layerIndex ,
313
258
config .contextLength ());
314
259
315
- // unifiedLayer.task("dbg_copy_out_x",
316
- // Qwen3Kernels::dbgCopy,
317
- // state.wrapX,
318
- // state.dbgX,
319
- // state.positionHolder,
320
- // layerIndex);
321
- //
322
- // unifiedLayer.task("dbg_copy_out_xb",
323
- // Qwen3Kernels::dbgCopy,
324
- // state.wrapXb,
325
- // state.dbgXb,
326
- // state.positionHolder,
327
- // layerIndex);
328
-
329
260
//unifiedLayer.transferToHost(DataTransferMode.EVERY_EXECUTION, state.wrapXb);
330
261
unifiedLayer .task ("matmul1" , Qwen3Kernels ::matrixVectorGenericWithResidual ,
331
262
context ,
@@ -336,13 +267,6 @@ public Tuple2<List<ImmutableTaskGraph>, GridScheduler> setupTornadoForwardPlanLa
336
267
config .dim (), // dim0 = 1024
337
268
LOCAL_WORK_GROUP_SIZE_ALLOC );
338
269
339
- // unifiedLayer.task("dbg_copy_out_x",
340
- // Qwen3Kernels::dbgCopy,
341
- // state.wrapX,
342
- // state.dbgX,
343
- // state.positionHolder,
344
- // layerIndex);
345
-
346
270
//unifiedLayer.transferToHost(DataTransferMode.EVERY_EXECUTION, state.wrapX);
347
271
unifiedLayer .task ("reductionsOneBlockFFN" , TransformerComputeKernelsLayered ::reductionOneBlockWithLayer ,
348
272
context , state .tempFFN , state .wrapX , config .dim (), config .rmsNormEps (), state .localSize )
@@ -351,22 +275,11 @@ public Tuple2<List<ImmutableTaskGraph>, GridScheduler> setupTornadoForwardPlanLa
351
275
.task ("mapContextFFN" , TransformerComputeKernelsLayered ::reductionOneBlock2WithLayer , context , state .wrapXb ,
352
276
state .wrapX , weights .rms_ffn_weightLayered [layerIndex ], state .tempFFN );
353
277
354
- // unifiedLayer.task("dbg_copy_out_xb",
355
- // Qwen3Kernels::dbgCopy,
356
- // state.wrapXb,
357
- // state.dbgXb,
358
- // state.positionHolder,
359
- // layerIndex);
360
-
361
278
unifiedLayer .task ("fused_ffn_w1_w3" , TransformerComputeKernelsLayered ::fusedFeedForwardWithSiLUAndGLUActivation , context ,
362
279
state .wrapXb , state .wrapHb , weights .w1Layered [layerIndex ], weights .w3Layered [layerIndex ], config .dim (), config .hiddenDim (), LOCAL_WORK_GROUP_SIZE_ALLOC )
363
280
.task ("projectionTwo" , TransformerComputeKernelsLayered ::matrixVectorGenericWithResidual , context ,
364
281
state .wrapHb , state .wrapX , weights .w2Layered [layerIndex ], config .hiddenDim (), config .dim (), LOCAL_WORK_GROUP_SIZE_ALLOC )
365
282
//.transferToHost(DataTransferMode.EVERY_EXECUTION, state.wrapX)
366
- // dbg copy out
367
- //.transferToHost(DataTransferMode.EVERY_EXECUTION, state.dbgQ, state.dbgKeyCache, state.dbgValueCache)
368
- //.transferToHost(DataTransferMode.EVERY_EXECUTION, state.dbgX)//, state.dbgXb)
369
- //.transferToHost(DataTransferMode.EVERY_EXECUTION, state.dbgValueCache)
370
283
.persistOnDevice (
371
284
state .wrapX
372
285
);
0 commit comments