 #include "amdgpu.h"
 #include "amdgpu_trace.h"
 #include "amdgpu_reset.h"
+#include "amdgpu_dev_coredump.h"
+#include "amdgpu_xgmi.h"
+
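+/*
+ * Walk every IP block on this device and dump the hardware state of those
+ * that implement dump_ip_state, then generate a devcoredump tagged with the
+ * timed-out job.
+ */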
+static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,
+				    struct amdgpu_job *job)
+{
+	int i;
+
+	dev_info(adev->dev, "Dumping IP State\n");
+	for (i = 0; i < adev->num_ip_blocks; i++)
+		if (adev->ip_blocks[i].version->funcs->dump_ip_state)
+			adev->ip_blocks[i].version->funcs
+				->dump_ip_state((void *)adev);
+	dev_info(adev->dev, "Dumping IP State Completed\n");
+
+	amdgpu_coredump(adev, true, false, job);
+}
+
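+/*
+ * Dump every device affected by the timeout: for an XGMI hive this covers
+ * all devices in the hive, otherwise just the device the job ran on.
+ */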
+static void amdgpu_job_core_dump(struct amdgpu_device *adev,
+				 struct amdgpu_job *job)
+{
+	struct list_head device_list, *device_list_handle = NULL;
+	struct amdgpu_device *tmp_adev = NULL;
+	struct amdgpu_hive_info *hive = NULL;
+
+	if (!amdgpu_sriov_vf(adev))
+		hive = amdgpu_get_xgmi_hive(adev);
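+	/* Hold hive_lock so the hive's device list stays stable while we walk it */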
+	if (hive)
+		mutex_lock(&hive->hive_lock);
+	/*
+	 * Reuse the logic in amdgpu_device_gpu_recover() to build the list of
+	 * devices for the coredump.
+	 */
+	INIT_LIST_HEAD(&device_list);
+	if (!amdgpu_sriov_vf(adev) &&
+	    (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
+		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+			list_add_tail(&tmp_adev->reset_list, &device_list);
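+		/* Rotate the timed-out device to the head so it is dumped first */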
+		if (!list_is_first(&adev->reset_list, &device_list))
+			list_rotate_to_front(&adev->reset_list, &device_list);
+		device_list_handle = &device_list;
+	} else {
+		list_add_tail(&adev->reset_list, &device_list);
+		device_list_handle = &device_list;
+	}
+
+	/* Do the coredump for each device */
+	list_for_each_entry(tmp_adev, device_list_handle, reset_list)
+		amdgpu_job_do_core_dump(tmp_adev, job);
+
+	if (hive) {
+		mutex_unlock(&hive->hive_lock);
+		amdgpu_put_xgmi_hive(hive);
+	}
+}
 
 static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 {
@@ -48,9 +102,14 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		return DRM_GPU_SCHED_STAT_ENODEV;
 	}
 
-
 	adev->job_hang = true;
 
+	/*
+	 * Do the coredump immediately after the job timeout so the dump is as
+	 * close as possible to the GPU's error state at the moment of failure.
+	 */
+	amdgpu_job_core_dump(adev, job);
+
 	if (amdgpu_gpu_recovery &&
 	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
 		dev_err(adev->dev, "ring %s timeout, but soft recovered\n",
@@ -101,6 +160,12 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		reset_context.src = AMDGPU_RESET_SRC_JOB;
 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 
+		/*
+		 * Skip the coredump in amdgpu_device_gpu_recover(), since we
+		 * already captured the GPU's error state right after the
+		 * timeout above.
+		 */
+		set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
+
 		r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
 		if (r)
 			dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);