Skip to content

Commit aded26c

Browse files
Tvrtko Ursulin authored and lucasdemarchi committed
drm/xe: Waste fewer instructions in emit_wa_job()
I was debugging some unrelated issue and noticed the current code was very verbose. We can improve it easily by using the more common batch buffer building pattern.

Before:

    bb->cs[bb->len++] = MI_LOAD_REGISTER_REG | MI_LRR_DST_CS_MMIO;
     c4d: 41 8b 56 10             mov    0x10(%r14),%edx
     c51: 49 8b 4e 08             mov    0x8(%r14),%rcx
     c55: 8d 72 01                lea    0x1(%rdx),%esi
     c58: 41 89 76 10             mov    %esi,0x10(%r14)
     c5c: c7 04 91 01 00 08 15    movl   $0x15080001,(%rcx,%rdx,4)
    bb->cs[bb->len++] = entry->reg.addr;
     c63: 8b 08                   mov    (%rax),%ecx
     c65: 41 8b 56 10             mov    0x10(%r14),%edx
     c69: 49 8b 76 08             mov    0x8(%r14),%rsi
     c6d: 81 e1 ff ff 3f 00       and    $0x3fffff,%ecx
     c73: 8d 7a 01                lea    0x1(%rdx),%edi
     c76: 41 89 7e 10             mov    %edi,0x10(%r14)
     c7a: 89 0c 96                mov    %ecx,(%rsi,%rdx,4)
    ..etc..

After:

    *cs++ = MI_LOAD_REGISTER_REG | MI_LRR_DST_CS_MMIO;
     c52: 41 c7 04 24 01 00 08    movl   $0x15080001,(%r12)
     c59: 15
    *cs++ = entry->reg.addr;
     c5a: 8b 10                   mov    (%rax),%edx
    ..etc..

Resulting in the following binary change:

    add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-348 (-348)
    Function                          old     new   delta
    xe_gt_record_default_lrcs.cold    304     296      -8
    xe_gt_record_default_lrcs        2200    1860    -340
    Total: Before=13554, After=13206, chg -2.57%

Signed-off-by: Tvrtko Ursulin <[email protected]>
Reviewed-by: Lucas De Marchi <[email protected]>
Reviewed-by: Matthew Brost <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Lucas De Marchi <[email protected]>
1 parent f4b5382 commit aded26c

File tree

3 files changed

+49
-41
lines changed

3 files changed

+49
-41
lines changed

drivers/gpu/drm/xe/xe_gt.c

Lines changed: 41 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -194,6 +194,7 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
194194
unsigned long idx;
195195
struct xe_bb *bb;
196196
size_t bb_len = 0;
197+
u32 *cs;
197198

198199
/* count RMW registers as those will be handled separately */
199200
xa_for_each(&sr->xa, idx, entry) {
@@ -222,13 +223,15 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
222223
if (IS_ERR(bb))
223224
return PTR_ERR(bb);
224225

226+
cs = bb->cs;
227+
225228
if (count) {
226229
/*
227230
* Emit single LRI with all non RMW regs: 1 leading dw + 2dw per
228231
* reg + 1
229232
*/
230233

231-
bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
234+
*cs++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
232235

233236
xa_for_each(&sr->xa, idx, entry) {
234237
struct xe_reg reg = entry->reg;
@@ -243,8 +246,8 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
243246

244247
val |= entry->set_bits;
245248

246-
bb->cs[bb->len++] = reg.addr;
247-
bb->cs[bb->len++] = val;
249+
*cs++ = reg.addr;
250+
*cs++ = val;
248251
xe_gt_dbg(gt, "REG[0x%x] = 0x%08x", reg.addr, val);
249252
}
250253
}
@@ -256,46 +259,49 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
256259
if (entry->reg.masked || entry->clr_bits == ~0)
257260
continue;
258261

259-
bb->cs[bb->len++] = MI_LOAD_REGISTER_REG | MI_LRR_DST_CS_MMIO;
260-
bb->cs[bb->len++] = entry->reg.addr;
261-
bb->cs[bb->len++] = CS_GPR_REG(0, 0).addr;
262-
263-
bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(2) |
264-
MI_LRI_LRM_CS_MMIO;
265-
bb->cs[bb->len++] = CS_GPR_REG(0, 1).addr;
266-
bb->cs[bb->len++] = entry->clr_bits;
267-
bb->cs[bb->len++] = CS_GPR_REG(0, 2).addr;
268-
bb->cs[bb->len++] = entry->set_bits;
269-
270-
bb->cs[bb->len++] = MI_MATH(8);
271-
bb->cs[bb->len++] = CS_ALU_INSTR_LOAD(SRCA, REG0);
272-
bb->cs[bb->len++] = CS_ALU_INSTR_LOADINV(SRCB, REG1);
273-
bb->cs[bb->len++] = CS_ALU_INSTR_AND;
274-
bb->cs[bb->len++] = CS_ALU_INSTR_STORE(REG0, ACCU);
275-
bb->cs[bb->len++] = CS_ALU_INSTR_LOAD(SRCA, REG0);
276-
bb->cs[bb->len++] = CS_ALU_INSTR_LOAD(SRCB, REG2);
277-
bb->cs[bb->len++] = CS_ALU_INSTR_OR;
278-
bb->cs[bb->len++] = CS_ALU_INSTR_STORE(REG0, ACCU);
279-
280-
bb->cs[bb->len++] = MI_LOAD_REGISTER_REG | MI_LRR_SRC_CS_MMIO;
281-
bb->cs[bb->len++] = CS_GPR_REG(0, 0).addr;
282-
bb->cs[bb->len++] = entry->reg.addr;
262+
*cs++ = MI_LOAD_REGISTER_REG | MI_LRR_DST_CS_MMIO;
263+
*cs++ = entry->reg.addr;
264+
*cs++ = CS_GPR_REG(0, 0).addr;
265+
266+
*cs++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(2) |
267+
MI_LRI_LRM_CS_MMIO;
268+
*cs++ = CS_GPR_REG(0, 1).addr;
269+
*cs++ = entry->clr_bits;
270+
*cs++ = CS_GPR_REG(0, 2).addr;
271+
*cs++ = entry->set_bits;
272+
273+
*cs++ = MI_MATH(8);
274+
*cs++ = CS_ALU_INSTR_LOAD(SRCA, REG0);
275+
*cs++ = CS_ALU_INSTR_LOADINV(SRCB, REG1);
276+
*cs++ = CS_ALU_INSTR_AND;
277+
*cs++ = CS_ALU_INSTR_STORE(REG0, ACCU);
278+
*cs++ = CS_ALU_INSTR_LOAD(SRCA, REG0);
279+
*cs++ = CS_ALU_INSTR_LOAD(SRCB, REG2);
280+
*cs++ = CS_ALU_INSTR_OR;
281+
*cs++ = CS_ALU_INSTR_STORE(REG0, ACCU);
282+
283+
*cs++ = MI_LOAD_REGISTER_REG | MI_LRR_SRC_CS_MMIO;
284+
*cs++ = CS_GPR_REG(0, 0).addr;
285+
*cs++ = entry->reg.addr;
283286

284287
xe_gt_dbg(gt, "REG[%#x] = ~%#x|%#x\n",
285288
entry->reg.addr, entry->clr_bits, entry->set_bits);
286289
}
287290

288291
/* reset used GPR */
289-
bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(3) | MI_LRI_LRM_CS_MMIO;
290-
bb->cs[bb->len++] = CS_GPR_REG(0, 0).addr;
291-
bb->cs[bb->len++] = 0;
292-
bb->cs[bb->len++] = CS_GPR_REG(0, 1).addr;
293-
bb->cs[bb->len++] = 0;
294-
bb->cs[bb->len++] = CS_GPR_REG(0, 2).addr;
295-
bb->cs[bb->len++] = 0;
292+
*cs++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(3) |
293+
MI_LRI_LRM_CS_MMIO;
294+
*cs++ = CS_GPR_REG(0, 0).addr;
295+
*cs++ = 0;
296+
*cs++ = CS_GPR_REG(0, 1).addr;
297+
*cs++ = 0;
298+
*cs++ = CS_GPR_REG(0, 2).addr;
299+
*cs++ = 0;
296300
}
297301

298-
xe_lrc_emit_hwe_state_instructions(q, bb);
302+
cs = xe_lrc_emit_hwe_state_instructions(q, cs);
303+
304+
bb->len = cs - bb->cs;
299305

300306
ret = emit_job_sync(q, bb, HZ);
301307

drivers/gpu/drm/xe/xe_lrc.c

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1888,7 +1888,7 @@ static const struct instr_state xe_hpg_svg_state[] = {
18881888
{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
18891889
};
18901890

1891-
void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb)
1891+
u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
18921892
{
18931893
struct xe_gt *gt = q->hwe->gt;
18941894
struct xe_device *xe = gt_to_xe(gt);
@@ -1923,7 +1923,7 @@ void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *b
19231923
if (!state_table) {
19241924
xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
19251925
GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
1926-
return;
1926+
return cs;
19271927
}
19281928

19291929
for (int i = 0; i < state_table_size; i++) {
@@ -1946,12 +1946,14 @@ void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *b
19461946
instr == CMD_3DSTATE_DRAWING_RECTANGLE)
19471947
instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
19481948

1949-
bb->cs[bb->len] = instr;
1949+
*cs = instr;
19501950
if (!is_single_dw)
1951-
bb->cs[bb->len] |= (num_dw - 2);
1951+
*cs |= (num_dw - 2);
19521952

1953-
bb->len += num_dw;
1953+
cs += num_dw;
19541954
}
1955+
1956+
return cs;
19551957
}
19561958

19571959
struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)

drivers/gpu/drm/xe/xe_lrc.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -112,7 +112,7 @@ void xe_lrc_dump_default(struct drm_printer *p,
112112
struct xe_gt *gt,
113113
enum xe_engine_class);
114114

115-
void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb);
115+
u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs);
116116

117117
struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc);
118118
void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot);

0 commit comments

Comments
 (0)