Skip to content

Commit d73b1d0

Browse files
committed
drm/msm: Hangcheck progress detection
If the hangcheck timer expires, check if the fw's position in the cmdstream has advanced (changed) since last timer expiration, and allow it up to three additional "extensions" to it's alotted time. The intention is to continue to catch "shader stuck in a loop" type hangs quickly, but allow more time for things that are actually making forward progress. Because we need to sample the CP state twice to detect if there has not been progress, this also cuts the the timer's duration in half. v2: Fix typo (REG_A6XX_CP_CSQ_IB2_STAT), add comment v3: Only halve hangcheck timer duration for generations which support progress detection (hdanton); removed unused a5xx progress (without knowing how to adjust for data buffered in ROQ it is too likely to report a false negative) v4: Comment updates to better describe the total hangcheck duration when progress detection is applied Reviewed-by: Chia-I Wu <[email protected]> Tested-by: Chia-I Wu <[email protected]> # dEQP-GLES2.functional.flush_finish.wait Signed-off-by: Rob Clark <[email protected]> Reviewed-by: Akhil P Oommen <[email protected]> Patchwork: https://patchwork.freedesktop.org/patch/511584/ Link: https://lore.kernel.org/r/[email protected]
1 parent cade05b commit d73b1d0

File tree

6 files changed

+109
-3
lines changed

6 files changed

+109
-3
lines changed

drivers/gpu/drm/msm/adreno/a6xx_gpu.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1843,6 +1843,39 @@ static uint32_t a6xx_get_rptr(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
18431843
return ring->memptrs->rptr = gpu_read(gpu, REG_A6XX_CP_RB_RPTR);
18441844
}
18451845

1846+
static bool a6xx_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
1847+
{
1848+
struct msm_cp_state cp_state = {
1849+
.ib1_base = gpu_read64(gpu, REG_A6XX_CP_IB1_BASE),
1850+
.ib2_base = gpu_read64(gpu, REG_A6XX_CP_IB2_BASE),
1851+
.ib1_rem = gpu_read(gpu, REG_A6XX_CP_IB1_REM_SIZE),
1852+
.ib2_rem = gpu_read(gpu, REG_A6XX_CP_IB2_REM_SIZE),
1853+
};
1854+
bool progress;
1855+
1856+
/*
1857+
* Adjust the remaining data to account for what has already been
1858+
* fetched from memory, but not yet consumed by the SQE.
1859+
*
1860+
* This is not *technically* correct, the amount buffered could
1861+
* exceed the IB size due to hw prefetching ahead, but:
1862+
*
1863+
* (1) We aren't trying to find the exact position, just whether
1864+
* progress has been made
1865+
* (2) The CP_REG_TO_MEM at the end of a submit should be enough
1866+
* to prevent prefetching into an unrelated submit. (And
1867+
* either way, at some point the ROQ will be full.)
1868+
*/
1869+
cp_state.ib1_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB1_STAT) >> 16;
1870+
cp_state.ib2_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB2_STAT) >> 16;
1871+
1872+
progress = !!memcmp(&cp_state, &ring->last_cp_state, sizeof(cp_state));
1873+
1874+
ring->last_cp_state = cp_state;
1875+
1876+
return progress;
1877+
}
1878+
18461879
static u32 a618_get_speed_bin(u32 fuse)
18471880
{
18481881
if (fuse == 0)
@@ -1959,6 +1992,7 @@ static const struct adreno_gpu_funcs funcs = {
19591992
.create_address_space = a6xx_create_address_space,
19601993
.create_private_address_space = a6xx_create_private_address_space,
19611994
.get_rptr = a6xx_get_rptr,
1995+
.progress = a6xx_progress,
19621996
},
19631997
.get_timestamp = a6xx_get_timestamp,
19641998
};

drivers/gpu/drm/msm/msm_drv.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,6 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv)
419419
priv->dev = ddev;
420420

421421
priv->wq = alloc_ordered_workqueue("msm", 0);
422-
priv->hangcheck_period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD;
423422

424423
INIT_LIST_HEAD(&priv->objects);
425424
mutex_init(&priv->obj_lock);

drivers/gpu/drm/msm/msm_drv.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,13 @@ struct msm_drm_private {
224224

225225
struct drm_atomic_state *pm_state;
226226

227-
/* For hang detection, in ms */
227+
/**
228+
* hangcheck_period: For hang detection, in ms
229+
*
230+
* Note that in practice, a submit/job will get at least two hangcheck
231+
* periods, due to checking for progress being implemented as simply
232+
* "have the CP position registers changed since last time?"
233+
*/
228234
unsigned int hangcheck_period;
229235

230236
/**

drivers/gpu/drm/msm/msm_gpu.c

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,21 @@ static void hangcheck_timer_reset(struct msm_gpu *gpu)
492492
round_jiffies_up(jiffies + msecs_to_jiffies(priv->hangcheck_period)));
493493
}
494494

495+
static bool made_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
496+
{
497+
if (ring->hangcheck_progress_retries >= DRM_MSM_HANGCHECK_PROGRESS_RETRIES)
498+
return false;
499+
500+
if (!gpu->funcs->progress)
501+
return false;
502+
503+
if (!gpu->funcs->progress(gpu, ring))
504+
return false;
505+
506+
ring->hangcheck_progress_retries++;
507+
return true;
508+
}
509+
495510
static void hangcheck_handler(struct timer_list *t)
496511
{
497512
struct msm_gpu *gpu = from_timer(gpu, t, hangcheck_timer);
@@ -502,9 +517,12 @@ static void hangcheck_handler(struct timer_list *t)
502517
if (fence != ring->hangcheck_fence) {
503518
/* some progress has been made.. ya! */
504519
ring->hangcheck_fence = fence;
505-
} else if (fence_before(fence, ring->fctx->last_fence)) {
520+
ring->hangcheck_progress_retries = 0;
521+
} else if (fence_before(fence, ring->fctx->last_fence) &&
522+
!made_progress(gpu, ring)) {
506523
/* no progress and not done.. hung! */
507524
ring->hangcheck_fence = fence;
525+
ring->hangcheck_progress_retries = 0;
508526
DRM_DEV_ERROR(dev->dev, "%s: hangcheck detected gpu lockup rb %d!\n",
509527
gpu->name, ring->id);
510528
DRM_DEV_ERROR(dev->dev, "%s: completed fence: %u\n",
@@ -830,6 +848,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
830848
struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs,
831849
const char *name, struct msm_gpu_config *config)
832850
{
851+
struct msm_drm_private *priv = drm->dev_private;
833852
int i, ret, nr_rings = config->nr_rings;
834853
void *memptrs;
835854
uint64_t memptrs_iova;
@@ -857,6 +876,16 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
857876
kthread_init_work(&gpu->recover_work, recover_worker);
858877
kthread_init_work(&gpu->fault_work, fault_worker);
859878

879+
priv->hangcheck_period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD;
880+
881+
/*
882+
* If progress detection is supported, halve the hangcheck timer
883+
* duration, as it takes two iterations of the hangcheck handler
884+
* to detect a hang.
885+
*/
886+
if (funcs->progress)
887+
priv->hangcheck_period /= 2;
888+
860889
timer_setup(&gpu->hangcheck_timer, hangcheck_handler, 0);
861890

862891
spin_lock_init(&gpu->perf_lock);

drivers/gpu/drm/msm/msm_gpu.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,15 @@ struct msm_gpu_funcs {
7878
struct msm_gem_address_space *(*create_private_address_space)
7979
(struct msm_gpu *gpu);
8080
uint32_t (*get_rptr)(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
81+
82+
/**
83+
* progress: Has the GPU made progress?
84+
*
85+
* Return true if GPU position in cmdstream has advanced (or changed)
86+
* since the last call. To avoid false negatives, this should account
87+
* for cmdstream that is buffered in this FIFO upstream of the CP fw.
88+
*/
89+
bool (*progress)(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
8190
};
8291

8392
/* Additional state for iommu faults: */
@@ -237,6 +246,7 @@ struct msm_gpu {
237246
#define DRM_MSM_INACTIVE_PERIOD 66 /* in ms (roughly four frames) */
238247

239248
#define DRM_MSM_HANGCHECK_DEFAULT_PERIOD 500 /* in ms */
249+
#define DRM_MSM_HANGCHECK_PROGRESS_RETRIES 3
240250
struct timer_list hangcheck_timer;
241251

242252
/* Fault info for most recent iova fault: */

drivers/gpu/drm/msm/msm_ringbuffer.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@ struct msm_rbmemptrs {
3535
volatile u64 ttbr0;
3636
};
3737

38+
struct msm_cp_state {
39+
uint64_t ib1_base, ib2_base;
40+
uint32_t ib1_rem, ib2_rem;
41+
};
42+
3843
struct msm_ringbuffer {
3944
struct msm_gpu *gpu;
4045
int id;
@@ -64,6 +69,29 @@ struct msm_ringbuffer {
6469
uint64_t memptrs_iova;
6570
struct msm_fence_context *fctx;
6671

72+
/**
73+
* hangcheck_progress_retries:
74+
*
75+
* The number of extra hangcheck duration cycles that we have given
76+
* due to it appearing that the GPU is making forward progress.
77+
*
78+
* For GPU generations which support progress detection (see.
79+
* msm_gpu_funcs::progress()), if the GPU appears to be making progress
80+
* (ie. the CP has advanced in the command stream, we'll allow up to
81+
* DRM_MSM_HANGCHECK_PROGRESS_RETRIES expirations of the hangcheck timer
82+
* before killing the job. But to detect progress we need two sample
83+
* points, so the duration of the hangcheck timer is halved. In other
84+
* words we'll let the submit run for up to:
85+
*
86+
* (DRM_MSM_HANGCHECK_DEFAULT_PERIOD / 2) * (DRM_MSM_HANGCHECK_PROGRESS_RETRIES + 1)
87+
*/
88+
int hangcheck_progress_retries;
89+
90+
/**
91+
* last_cp_state: The state of the CP at the last call to gpu->progress()
92+
*/
93+
struct msm_cp_state last_cp_state;
94+
6795
/*
6896
* preempt_lock protects preemption and serializes wptr updates against
6997
* preemption. Can be aquired from irq context.

0 commit comments

Comments
 (0)