@@ -540,26 +540,29 @@ enum kfd_smi_event {
540
540
KFD_SMI_EVENT_ALL_PROCESS = 64
541
541
};
542
542
543
+ /* Reason that triggered a page migration event */
543
544
enum KFD_MIGRATE_TRIGGERS {
544
- KFD_MIGRATE_TRIGGER_PREFETCH ,
545
- KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU ,
546
- KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU ,
547
- KFD_MIGRATE_TRIGGER_TTM_EVICTION
545
+ KFD_MIGRATE_TRIGGER_PREFETCH , /* Prefetch to GPU VRAM or system memory */
546
+ KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU , /* GPU page fault recovery */
547
+ KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU , /* CPU page fault recovery */
548
+ KFD_MIGRATE_TRIGGER_TTM_EVICTION /* TTM eviction */
548
549
};
549
550
551
+ /* Reason that triggered a user queue eviction event */
550
552
enum KFD_QUEUE_EVICTION_TRIGGERS {
551
- KFD_QUEUE_EVICTION_TRIGGER_SVM ,
552
- KFD_QUEUE_EVICTION_TRIGGER_USERPTR ,
553
- KFD_QUEUE_EVICTION_TRIGGER_TTM ,
554
- KFD_QUEUE_EVICTION_TRIGGER_SUSPEND ,
555
- KFD_QUEUE_EVICTION_CRIU_CHECKPOINT ,
556
- KFD_QUEUE_EVICTION_CRIU_RESTORE
553
+ KFD_QUEUE_EVICTION_TRIGGER_SVM , /* SVM buffer migration */
554
+ KFD_QUEUE_EVICTION_TRIGGER_USERPTR , /* userptr movement */
555
+ KFD_QUEUE_EVICTION_TRIGGER_TTM , /* TTM buffer move */
556
+ KFD_QUEUE_EVICTION_TRIGGER_SUSPEND , /* GPU suspend */
557
+ KFD_QUEUE_EVICTION_CRIU_CHECKPOINT , /* CRIU checkpoint */
558
+ KFD_QUEUE_EVICTION_CRIU_RESTORE /* CRIU restore */
557
559
};
558
560
561
+ /* Reason that triggered an unmap-buffer-from-GPU event */
559
562
enum KFD_SVM_UNMAP_TRIGGERS {
560
- KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY ,
561
- KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE ,
562
- KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU
563
+ KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY , /* MMU notifier CPU buffer movement */
564
+ KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE ,/* MMU notifier page migration */
565
+ KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU /* Unmap to free the buffer */
563
566
};
564
567
565
568
/* Mask bit for 1-based event index i: bit (i - 1) of an unsigned long long */
#define KFD_SMI_EVENT_MASK_FROM_INDEX (i ) (1ULL << ((i) - 1))
@@ -570,6 +573,77 @@ struct kfd_ioctl_smi_events_args {
570
573
__u32 anon_fd ; /* from KFD */
571
574
};
572
575
576
+ /*
577
+ * SVM event tracing via SMI system management interface
578
+ *
579
+ * Open event file descriptor
580
+ * Use ioctl AMDKFD_IOC_SMI_EVENTS, pass in gpuid and get back an anonymous file
581
+ * descriptor to receive SMI events.
582
+ * If called with sudo permission, the file descriptor can be used to receive
583
+ * SVM events from all processes; otherwise, it only receives SVM events of the
584
+ * same process.
585
+ *
586
+ * To enable the SVM event
587
+ * Write event file descriptor with KFD_SMI_EVENT_MASK_FROM_INDEX(event) bitmap
588
+ * mask to start recording the event to the kfifo; use a bitmap mask combination
589
+ * for multiple events. New event mask will overwrite the previous event mask.
590
+ * KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_ALL_PROCESS) bit requires sudo
591
+ * permission to receive SVM events from all processes.
592
+ *
593
+ * To receive the event
594
+ * Application can poll file descriptor to wait for the events, then read event
595
+ * from the file into a buffer. Each event is one line string message, starting
596
+ * with the event id, then the event specific information.
597
+ *
598
+ * To decode event information
599
+ * The following event format string macro can be used with sscanf to decode
600
+ * the specific event information.
601
+ * event triggers: the reason to generate the event, defined as enum for unmap,
602
+ * eviction and migrate events.
603
+ * node, from, to, prefetch_loc, preferred_loc: GPU ID, or 0 for system memory.
604
+ * addr: user mode address, in pages
605
+ * size: in pages
606
+ * pid: the process ID to generate the event
607
+ * ns: timestamp with nanosecond resolution, starts at system boot time but
608
+ * stops during suspend
609
+ * migrate_update: GPU page fault is recovered by 'M' for migrate, 'U' for update
610
+ * rw: 'W' for write page fault, 'R' for read page fault
611
+ * rescheduled: 'R' if the queue restore failed and rescheduled to try again
612
+ */
613
/* GPU reset event: hex reset sequence number, then the reset cause string */
+ #define KFD_EVENT_FMT_UPDATE_GPU_RESET (reset_seq_num , reset_cause )\
614
+ "%x %s\n", (reset_seq_num), (reset_cause)
615
+
616
/* Thermal throttling event: hex bitmask and hex counter, separated by ':' */
+ #define KFD_EVENT_FMT_THERMAL_THROTTLING (bitmask , counter )\
617
+ "%llx:%llx\n", (bitmask), (counter)
618
+
619
/* VM fault event: hex pid and task name string, separated by ':' */
+ #define KFD_EVENT_FMT_VMFAULT (pid , task_name )\
620
+ "%x:%s\n", (pid), (task_name)
621
+
622
/* Page fault start event: "ns -pid @addr(node) rw"; addr and node are hex, rw is one character */
+ #define KFD_EVENT_FMT_PAGEFAULT_START (ns , pid , addr , node , rw )\
623
+ "%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr), (node), (rw)
624
+
625
/* Page fault end event: "ns -pid @addr(node) migrate_update"; addr and node are hex, migrate_update is one character */
+ #define KFD_EVENT_FMT_PAGEFAULT_END (ns , pid , addr , node , migrate_update )\
626
+ "%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr), (node), (migrate_update)
627
+
628
/*
 * Migrate start event: "ns -pid @start(size) from->to prefetch_loc:preferred_loc trigger";
 * start, size and the four location fields are hex, the trigger is decimal
 */
+ #define KFD_EVENT_FMT_MIGRATE_START (ns , pid , start , size , from , to , prefetch_loc ,\
629
+ preferred_loc , migrate_trigger )\
630
+ "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", (ns), (pid), (start), (size),\
631
+ (from), (to), (prefetch_loc), (preferred_loc), (migrate_trigger)
632
+
633
/* Migrate end event: "ns -pid @start(size) from->to trigger"; start, size, from and to are hex, the trigger is decimal */
+ #define KFD_EVENT_FMT_MIGRATE_END (ns , pid , start , size , from , to , migrate_trigger )\
634
+ "%lld -%d @%lx(%lx) %x->%x %d\n", (ns), (pid), (start), (size),\
635
+ (from), (to), (migrate_trigger)
636
+
637
/* Queue eviction event: "ns -pid node trigger"; node is hex, the trigger is decimal */
+ #define KFD_EVENT_FMT_QUEUE_EVICTION (ns , pid , node , evict_trigger )\
638
+ "%lld -%d %x %d\n", (ns), (pid), (node), (evict_trigger)
639
+
640
/* Queue restore event: "ns -pid node rescheduled"; node is hex, rescheduled is one character */
+ #define KFD_EVENT_FMT_QUEUE_RESTORE (ns , pid , node , rescheduled )\
641
+ "%lld -%d %x %c\n", (ns), (pid), (node), (rescheduled)
642
+
643
/* Unmap from GPU event: "ns -pid @addr(size) node trigger"; addr, size and node are hex, the trigger is decimal */
+ #define KFD_EVENT_FMT_UNMAP_FROM_GPU (ns , pid , addr , size , node , unmap_trigger )\
644
+ "%lld -%d @%lx(%lx) %x %d\n", (ns), (pid), (addr), (size),\
645
+ (node), (unmap_trigger)
646
+
573
647
/**************************************************************************************************
574
648
* CRIU IOCTLs (Checkpoint Restore In Userspace)
575
649
*
0 commit comments