Skip to content

Commit f767983

Browse files
noahgift authored and claude committed
perf: use bfi.b32 for unaligned Q6K loads — 24 fewer instructions per super-block (Refs GH-131)
Replace shl+or byte assembly in ld_global_u32_unaligned with bfi.b32 bit-field insert. Each unaligned u32 load saves 6 instructions (9 → 3 for the packing step). With 4 unaligned loads per Q6K super-block, this reduces instruction overhead by 24 per super-block on sm_87 Jetson Orin. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c4d2bea commit f767983

File tree

4 files changed

+42
-13
lines changed

4 files changed

+42
-13
lines changed

.pmat/baseline.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
{
22
"version": "3.6.1",
3-
"created_at": "2026-03-05T18:28:04.491036699Z",
3+
"created_at": "2026-03-05T22:38:52.308212548Z",
44
"git_context": null,
55
"files": {},
66
"summary": {

trueno-gpu/src/kernels/quantize/q6k/dp4a.rs

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -376,6 +376,8 @@ mod tests {
376376
assert!(ptx.contains(".visible .entry dp4a_q6k_gemv"));
377377
assert!(ptx.contains("dp4a.u32.s32"), "Must use dp4a instructions");
378378
assert!(ptx.contains("bar.sync"), "Must have barrier for cross-warp safety");
379+
// GH-131: bfi.b32 used for unaligned Q6K loads (replaces shl+or assembly)
380+
assert!(ptx.contains("bfi.b32"), "Must use bfi.b32 for unaligned byte packing");
379381
}
380382

381383
#[test]

trueno-gpu/src/ptx/builder/global_mem.rs

Lines changed: 13 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ use crate::ptx::instructions::{Operand, PtxInstruction, PtxOp};
77
use crate::ptx::registers::VirtualReg;
88
use crate::ptx::types::{PtxStateSpace, PtxType};
99

10-
use super::{KernelBuilder, PtxArithmetic, PtxControl};
10+
use super::{KernelBuilder, PtxArithmetic};
1111

1212
impl<'a> KernelBuilder<'a> {
1313
// ===== Memory Operations (vectorized - not in traits) =====
@@ -154,12 +154,16 @@ impl<'a> KernelBuilder<'a> {
154154

155155
/// Load u32 from potentially unaligned global memory address.
156156
///
157-
/// Uses 4 byte loads + shifts to assemble a u32, avoiding
157+
/// Uses 4 byte loads + `bfi.b32` to assemble a u32, avoiding
158158
/// `ld.global.u32` alignment requirements (4-byte aligned).
159159
/// Required for Q6K super-blocks (210 bytes each, not 4-byte aligned).
160160
///
161161
/// sm_87 (Jetson Orin) faults on misaligned ld.global.u32 with
162162
/// CUDA_ERROR_MISALIGNED_ADDRESS (716).
163+
///
164+
/// GH-131: Optimized from shl+or (9 instructions) to bfi.b32 (3 instructions)
165+
/// for the byte assembly step. Saves 6 instructions per call × 4 calls per
166+
/// Q6K super-block = 24 fewer instructions per super-block.
163167
pub fn ld_global_u32_unaligned(&mut self, addr: VirtualReg) -> VirtualReg {
164168
// Load 4 consecutive bytes
165169
let b0 = self.ld_global_u8(addr);
@@ -173,20 +177,17 @@ impl<'a> KernelBuilder<'a> {
173177
let addr3 = self.add_u64(addr, off3);
174178
let b3 = self.ld_global_u8(addr3);
175179

176-
// Convert u8 (in u16 registers) to u32 and assemble little-endian
180+
// Convert u8 (in u16 registers) to u32
177181
let w0 = self.cvt_u32_u8(b0); // byte 0 → bits [7:0]
178182
let w1 = self.cvt_u32_u8(b1);
179183
let w2 = self.cvt_u32_u8(b2);
180184
let w3 = self.cvt_u32_u8(b3);
181-
let eight = self.mov_u32_imm(8);
182-
let sixteen = self.mov_u32_imm(16);
183-
let twentyfour = self.mov_u32_imm(24);
184-
let s1 = self.shl_u32(w1, eight); // byte 1 → bits [15:8]
185-
let s2 = self.shl_u32(w2, sixteen); // byte 2 → bits [23:16]
186-
let s3 = self.shl_u32(w3, twentyfour); // byte 3 → bits [31:24]
187-
let t01 = self.or_u32(w0, s1);
188-
let t23 = self.or_u32(s2, s3);
189-
self.or_u32(t01, t23)
185+
186+
// Assemble little-endian u32 using bfi.b32 (3 instructions vs 9 with shl+or)
187+
// bfi.b32 inserts `len` bits from `insert` into `base` at position `start`
188+
let t1 = self.bfi_b32(w1, w0, 8, 8); // insert byte 1 at bits [15:8]
189+
let t2 = self.bfi_b32(w2, t1, 16, 8); // insert byte 2 at bits [23:16]
190+
self.bfi_b32(w3, t2, 24, 8) // insert byte 3 at bits [31:24]
190191
}
191192

192193
/// Load u16 from global memory (for f16 as raw bits)

trueno-gpu/src/ptx/builder/warp_vote.rs

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -167,6 +167,32 @@ impl<'a> KernelBuilder<'a> {
167167
dst
168168
}
169169

170+
/// Bit field insert: insert `len` bits from `insert` into `base` at position `start`
171+
///
172+
/// PTX: `bfi.b32 dst, insert, base, start, len;`
173+
/// dst = base with bits [start..start+len-1] replaced by insert[0..len-1]
174+
///
175+
/// GH-131: Used to pack bytes into u32 for unaligned Q6K loads on sm_87.
176+
/// Replaces 3 instructions (mov+shl+or) with 1 instruction per byte insertion.
177+
pub fn bfi_b32(
178+
&mut self,
179+
insert: VirtualReg,
180+
base: VirtualReg,
181+
start: u32,
182+
len: u32,
183+
) -> VirtualReg {
184+
let dst = self.registers.allocate_virtual(PtxType::U32);
185+
self.instructions.push(
186+
PtxInstruction::new(PtxOp::Bfi, PtxType::B32)
187+
.dst(Operand::Reg(dst))
188+
.src(Operand::Reg(insert))
189+
.src(Operand::Reg(base))
190+
.src(Operand::ImmI64(start as i64))
191+
.src(Operand::ImmI64(len as i64)),
192+
);
193+
dst
194+
}
195+
170196
/// Load f32 immediate constant
171197
///
172198
/// PAR-062: Used for NEG_INFINITY initialization

0 commit comments

Comments (0)