Skip to content

Commit 663b0f1

Browse files
PhilipYangA and alexdeucher
authored and committed
drm/amdkfd: Document and define SVM events message macro
Document how to use the SMI (system management interface) to enable and receive SVM events. Document the SVM event triggers. Define SVM event message string format macros that user mode can use with sscanf to parse the events. Add them to the uAPI header file to make it obvious that any future change to them is a uAPI change. No functional changes. Signed-off-by: Philip Yang <[email protected]> Reviewed-by: James Zhu <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent 7eafe7a commit 663b0f1

File tree

2 files changed

+109
-36
lines changed

2 files changed

+109
-36
lines changed

drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c

Lines changed: 22 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -235,17 +235,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset,
235235
amdgpu_reset_get_desc(reset_context, reset_cause,
236236
sizeof(reset_cause));
237237

238-
kfd_smi_event_add(0, dev, event, "%x %s\n",
239-
dev->reset_seq_num,
240-
reset_cause);
238+
kfd_smi_event_add(0, dev, event, KFD_EVENT_FMT_UPDATE_GPU_RESET(
239+
dev->reset_seq_num, reset_cause));
241240
}
242241

243242
void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
244243
uint64_t throttle_bitmask)
245244
{
246-
kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n",
245+
kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, KFD_EVENT_FMT_THERMAL_THROTTLING(
247246
throttle_bitmask,
248-
amdgpu_dpm_get_thermal_throttling_counter(dev->adev));
247+
amdgpu_dpm_get_thermal_throttling_counter(dev->adev)));
249248
}
250249

251250
void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
@@ -256,8 +255,8 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
256255
if (task_info) {
257256
/* Report VM faults from user applications, not retry from kernel */
258257
if (task_info->pid)
259-
kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
260-
task_info->pid, task_info->task_name);
258+
kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, KFD_EVENT_FMT_VMFAULT(
259+
task_info->pid, task_info->task_name));
261260
amdgpu_vm_put_task_info(task_info);
262261
}
263262
}
@@ -267,16 +266,16 @@ void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid,
267266
ktime_t ts)
268267
{
269268
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START,
270-
"%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid,
271-
address, node->id, write_fault ? 'W' : 'R');
269+
KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid,
270+
address, node->id, write_fault ? 'W' : 'R'));
272271
}
273272

274273
void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid,
275274
unsigned long address, bool migration)
276275
{
277276
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END,
278-
"%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(),
279-
pid, address, node->id, migration ? 'M' : 'U');
277+
KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(),
278+
pid, address, node->id, migration ? 'M' : 'U'));
280279
}
281280

282281
void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid,
@@ -286,34 +285,34 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid,
286285
uint32_t trigger)
287286
{
288287
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_START,
289-
"%lld -%d @%lx(%lx) %x->%x %x:%x %d\n",
288+
KFD_EVENT_FMT_MIGRATE_START(
290289
ktime_get_boottime_ns(), pid, start, end - start,
291-
from, to, prefetch_loc, preferred_loc, trigger);
290+
from, to, prefetch_loc, preferred_loc, trigger));
292291
}
293292

294293
void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,
295294
unsigned long start, unsigned long end,
296295
uint32_t from, uint32_t to, uint32_t trigger)
297296
{
298297
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END,
299-
"%lld -%d @%lx(%lx) %x->%x %d\n",
298+
KFD_EVENT_FMT_MIGRATE_END(
300299
ktime_get_boottime_ns(), pid, start, end - start,
301-
from, to, trigger);
300+
from, to, trigger));
302301
}
303302

304303
void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid,
305304
uint32_t trigger)
306305
{
307306
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_EVICTION,
308-
"%lld -%d %x %d\n", ktime_get_boottime_ns(), pid,
309-
node->id, trigger);
307+
KFD_EVENT_FMT_QUEUE_EVICTION(ktime_get_boottime_ns(), pid,
308+
node->id, trigger));
310309
}
311310

312311
void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid)
313312
{
314313
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_RESTORE,
315-
"%lld -%d %x\n", ktime_get_boottime_ns(), pid,
316-
node->id);
314+
KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(), pid,
315+
node->id, 0));
317316
}
318317

319318
void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm)
@@ -330,8 +329,8 @@ void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm)
330329

331330
kfd_smi_event_add(p->lead_thread->pid, pdd->dev,
332331
KFD_SMI_EVENT_QUEUE_RESTORE,
333-
"%lld -%d %x %c\n", ktime_get_boottime_ns(),
334-
p->lead_thread->pid, pdd->dev->id, 'R');
332+
KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(),
333+
p->lead_thread->pid, pdd->dev->id, 'R'));
335334
}
336335
kfd_unref_process(p);
337336
}
@@ -341,8 +340,8 @@ void kfd_smi_event_unmap_from_gpu(struct kfd_node *node, pid_t pid,
341340
uint32_t trigger)
342341
{
343342
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_UNMAP_FROM_GPU,
344-
"%lld -%d @%lx(%lx) %x %d\n", ktime_get_boottime_ns(),
345-
pid, address, last - address + 1, node->id, trigger);
343+
KFD_EVENT_FMT_UNMAP_FROM_GPU(ktime_get_boottime_ns(),
344+
pid, address, last - address + 1, node->id, trigger));
346345
}
347346

348347
int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd)

include/uapi/linux/kfd_ioctl.h

Lines changed: 87 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -540,26 +540,29 @@ enum kfd_smi_event {
540540
KFD_SMI_EVENT_ALL_PROCESS = 64
541541
};
542542

543+
/* The reason of the page migration event */
543544
enum KFD_MIGRATE_TRIGGERS {
544-
KFD_MIGRATE_TRIGGER_PREFETCH,
545-
KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
546-
KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU,
547-
KFD_MIGRATE_TRIGGER_TTM_EVICTION
545+
KFD_MIGRATE_TRIGGER_PREFETCH, /* Prefetch to GPU VRAM or system memory */
546+
KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, /* GPU page fault recover */
547+
KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU, /* CPU page fault recover */
548+
KFD_MIGRATE_TRIGGER_TTM_EVICTION /* TTM eviction */
548549
};
549550

551+
/* The reason of user queue eviction event */
550552
enum KFD_QUEUE_EVICTION_TRIGGERS {
551-
KFD_QUEUE_EVICTION_TRIGGER_SVM,
552-
KFD_QUEUE_EVICTION_TRIGGER_USERPTR,
553-
KFD_QUEUE_EVICTION_TRIGGER_TTM,
554-
KFD_QUEUE_EVICTION_TRIGGER_SUSPEND,
555-
KFD_QUEUE_EVICTION_CRIU_CHECKPOINT,
556-
KFD_QUEUE_EVICTION_CRIU_RESTORE
553+
KFD_QUEUE_EVICTION_TRIGGER_SVM, /* SVM buffer migration */
554+
KFD_QUEUE_EVICTION_TRIGGER_USERPTR, /* userptr movement */
555+
KFD_QUEUE_EVICTION_TRIGGER_TTM, /* TTM move buffer */
556+
KFD_QUEUE_EVICTION_TRIGGER_SUSPEND, /* GPU suspend */
557+
KFD_QUEUE_EVICTION_CRIU_CHECKPOINT, /* CRIU checkpoint */
558+
KFD_QUEUE_EVICTION_CRIU_RESTORE /* CRIU restore */
557559
};
558560

561+
/* The reason of unmap buffer from GPU event */
559562
enum KFD_SVM_UNMAP_TRIGGERS {
560-
KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY,
561-
KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE,
562-
KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU
563+
KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY, /* MMU notifier CPU buffer movement */
564+
KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE,/* MMU notifier page migration */
565+
KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU /* Unmap to free the buffer */
563566
};
564567

565568
#define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
@@ -570,6 +573,77 @@ struct kfd_ioctl_smi_events_args {
570573
__u32 anon_fd; /* from KFD */
571574
};
572575

576+
/*
577+
* SVM event tracing via SMI system management interface
578+
*
579+
* Open event file descriptor
580+
* use ioctl AMDKFD_IOC_SMI_EVENTS, pass in gpuid and return an anonymous file
581+
* descriptor to receive SMI events.
582+
* If calling with sudo permission, then file descriptor can be used to receive
583+
* SVM events from all processes, otherwise, to only receive SVM events of same
584+
* process.
585+
*
586+
* To enable the SVM event
587+
* Write event file descriptor with KFD_SMI_EVENT_MASK_FROM_INDEX(event) bitmap
588+
* mask to start recording the event to the kfifo, use bitmap mask combination
589+
* for multiple events. New event mask will overwrite the previous event mask.
590+
* KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_ALL_PROCESS) bit requires sudo
591+
* permission to receive SVM events from all processes.
592+
*
593+
* To receive the event
594+
* Application can poll file descriptor to wait for the events, then read event
595+
* from the file into a buffer. Each event is one line string message, starting
596+
* with the event id, then the event specific information.
597+
*
598+
* To decode event information
599+
* The following event format string macro can be used with sscanf to decode
600+
* the specific event information.
601+
* event triggers: the reason to generate the event, defined as enum for unmap,
602+
* eviction and migrate events.
603+
* node, from, to, prefetch_loc, preferred_loc: GPU ID, or 0 for system memory.
604+
* addr: user mode address, in pages
605+
* size: in pages
606+
* pid: the process ID to generate the event
607+
* ns: timestamp in nanosecond-resolution, starts at system boot time and
608+
* keeps counting during suspend (boottime clock)
609+
* migrate_update: GPU page fault is recovered by 'M' for migrate, 'U' for update
610+
* rw: 'W' for write page fault, 'R' for read page fault
611+
* rescheduled: 'R' if the queue restore failed and was rescheduled to try again
612+
*/
613+
#define KFD_EVENT_FMT_UPDATE_GPU_RESET(reset_seq_num, reset_cause)\
614+
"%x %s\n", (reset_seq_num), (reset_cause)
615+
616+
#define KFD_EVENT_FMT_THERMAL_THROTTLING(bitmask, counter)\
617+
"%llx:%llx\n", (bitmask), (counter)
618+
619+
#define KFD_EVENT_FMT_VMFAULT(pid, task_name)\
620+
"%x:%s\n", (pid), (task_name)
621+
622+
#define KFD_EVENT_FMT_PAGEFAULT_START(ns, pid, addr, node, rw)\
623+
"%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr), (node), (rw)
624+
625+
#define KFD_EVENT_FMT_PAGEFAULT_END(ns, pid, addr, node, migrate_update)\
626+
"%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr), (node), (migrate_update)
627+
628+
#define KFD_EVENT_FMT_MIGRATE_START(ns, pid, start, size, from, to, prefetch_loc,\
629+
preferred_loc, migrate_trigger)\
630+
"%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", (ns), (pid), (start), (size),\
631+
(from), (to), (prefetch_loc), (preferred_loc), (migrate_trigger)
632+
633+
#define KFD_EVENT_FMT_MIGRATE_END(ns, pid, start, size, from, to, migrate_trigger)\
634+
"%lld -%d @%lx(%lx) %x->%x %d\n", (ns), (pid), (start), (size),\
635+
(from), (to), (migrate_trigger)
636+
637+
#define KFD_EVENT_FMT_QUEUE_EVICTION(ns, pid, node, evict_trigger)\
638+
"%lld -%d %x %d\n", (ns), (pid), (node), (evict_trigger)
639+
640+
#define KFD_EVENT_FMT_QUEUE_RESTORE(ns, pid, node, rescheduled)\
641+
"%lld -%d %x %c\n", (ns), (pid), (node), (rescheduled)
642+
643+
#define KFD_EVENT_FMT_UNMAP_FROM_GPU(ns, pid, addr, size, node, unmap_trigger)\
644+
"%lld -%d @%lx(%lx) %x %d\n", (ns), (pid), (addr), (size),\
645+
(node), (unmap_trigger)
646+
573647
/**************************************************************************************************
574648
* CRIU IOCTLs (Checkpoint Restore In Userspace)
575649
*

0 commit comments

Comments
 (0)