Skip to content

Commit b130440

Browse files
cwabbott0 authored and Rob Clark committed
drm/msm: Temporarily disable stall-on-fault after a page fault
When things go wrong, the GPU is capable of quickly generating millions of faulting translation requests per second. When that happens, in the stall-on-fault model each access will stall until it wins the race to signal the fault and then the RESUME register is written. This slows processing page faults to a crawl as the GPU can generate faults much faster than the CPU can acknowledge them. It also means that all available resources in the SMMU are saturated waiting for the stalled transactions, so that other transactions such as transactions generated by the GMU, which shares translation resources with the GPU, cannot proceed. This causes a GMU watchdog timeout, which leads to a failed reset because GX cannot collapse when there is a transaction pending and a permanently hung GPU.

On older platforms with qcom,smmu-v2, it seems that when one transaction is stalled subsequent faulting transactions are terminated, which avoids this problem, but the MMU-500 follows the spec here.

To work around these problems, disable stall-on-fault as soon as we get a page fault until a cooldown period after pagefaults stop. This allows the GMU some guaranteed time to continue working. We only use stall-on-fault to halt the GPU while we collect a devcoredump and we always terminate the transaction afterward, so it's fine to miss some subsequent page faults. We also keep it disabled so long as the current devcoredump hasn't been deleted, because in that case we likely won't capture another one if there's a fault.

After this commit HFI messages still occasionally time out, because the crashdump handler doesn't run fast enough to let the GMU resume, but the driver seems to recover from it. This will probably go away after the HFI timeout is increased.

Signed-off-by: Connor Abbott <[email protected]>
Reviewed-by: Rob Clark <[email protected]>
Patchwork: https://patchwork.freedesktop.org/patch/654891/
Signed-off-by: Rob Clark <[email protected]>
1 parent dedf404 commit b130440

File tree

9 files changed

+116
-1
lines changed

9 files changed

+116
-1
lines changed

drivers/gpu/drm/msm/adreno/a5xx_gpu.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,8 @@ static void a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
131131
struct msm_ringbuffer *ring = submit->ring;
132132
unsigned int i, ibs = 0;
133133

134+
adreno_check_and_reenable_stall(adreno_gpu);
135+
134136
if (IS_ENABLED(CONFIG_DRM_MSM_GPU_SUDO) && submit->in_rb) {
135137
ring->cur_ctx_seqno = 0;
136138
a5xx_submit_in_rb(gpu, submit);

drivers/gpu/drm/msm/adreno/a6xx_gpu.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,8 @@ static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
212212
struct msm_ringbuffer *ring = submit->ring;
213213
unsigned int i, ibs = 0;
214214

215+
adreno_check_and_reenable_stall(adreno_gpu);
216+
215217
a6xx_set_pagetable(a6xx_gpu, ring, submit);
216218

217219
get_stats_counter(ring, REG_A6XX_RBBM_PERFCTR_CP(0),
@@ -335,6 +337,8 @@ static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
335337
struct msm_ringbuffer *ring = submit->ring;
336338
unsigned int i, ibs = 0;
337339

340+
adreno_check_and_reenable_stall(adreno_gpu);
341+
338342
/*
339343
* Toggle concurrent binning for pagetable switch and set the thread to
340344
* BR since only it can execute the pagetable switch packets.

drivers/gpu/drm/msm/adreno/adreno_gpu.c

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,16 +259,54 @@ u64 adreno_private_address_space_size(struct msm_gpu *gpu)
259259
return BIT(ttbr1_cfg->ias) - ADRENO_VM_START;
260260
}
261261

262+
void adreno_check_and_reenable_stall(struct adreno_gpu *adreno_gpu)
263+
{
264+
struct msm_gpu *gpu = &adreno_gpu->base;
265+
struct msm_drm_private *priv = gpu->dev->dev_private;
266+
unsigned long flags;
267+
268+
/*
269+
* Wait until the cooldown period has passed and we would actually
270+
* collect a crashdump to re-enable stall-on-fault.
271+
*/
272+
spin_lock_irqsave(&priv->fault_stall_lock, flags);
273+
if (!priv->stall_enabled &&
274+
ktime_after(ktime_get(), priv->stall_reenable_time) &&
275+
!READ_ONCE(gpu->crashstate)) {
276+
priv->stall_enabled = true;
277+
278+
gpu->aspace->mmu->funcs->set_stall(gpu->aspace->mmu, true);
279+
}
280+
spin_unlock_irqrestore(&priv->fault_stall_lock, flags);
281+
}
282+
262283
#define ARM_SMMU_FSR_TF BIT(1)
263284
#define ARM_SMMU_FSR_PF BIT(3)
264285
#define ARM_SMMU_FSR_EF BIT(4)
286+
#define ARM_SMMU_FSR_SS BIT(30)
265287

266288
int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
267289
struct adreno_smmu_fault_info *info, const char *block,
268290
u32 scratch[4])
269291
{
292+
struct msm_drm_private *priv = gpu->dev->dev_private;
270293
const char *type = "UNKNOWN";
271-
bool do_devcoredump = info && !READ_ONCE(gpu->crashstate);
294+
bool do_devcoredump = info && (info->fsr & ARM_SMMU_FSR_SS) &&
295+
!READ_ONCE(gpu->crashstate);
296+
unsigned long irq_flags;
297+
298+
/*
299+
* In case there is a subsequent storm of pagefaults, disable
300+
* stall-on-fault for at least half a second.
301+
*/
302+
spin_lock_irqsave(&priv->fault_stall_lock, irq_flags);
303+
if (priv->stall_enabled) {
304+
priv->stall_enabled = false;
305+
306+
gpu->aspace->mmu->funcs->set_stall(gpu->aspace->mmu, false);
307+
}
308+
priv->stall_reenable_time = ktime_add_ms(ktime_get(), 500);
309+
spin_unlock_irqrestore(&priv->fault_stall_lock, irq_flags);
272310

273311
/*
274312
* Print a default message if we couldn't get the data from the

drivers/gpu/drm/msm/adreno/adreno_gpu.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -636,6 +636,8 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
636636
struct adreno_smmu_fault_info *info, const char *block,
637637
u32 scratch[4]);
638638

639+
void adreno_check_and_reenable_stall(struct adreno_gpu *gpu);
640+
639641
int adreno_read_speedbin(struct device *dev, u32 *speedbin);
640642

641643
/*

drivers/gpu/drm/msm/msm_debugfs.c

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,35 @@ DEFINE_DEBUGFS_ATTRIBUTE(shrink_fops,
208208
shrink_get, shrink_set,
209209
"0x%08llx\n");
210210

211+
/*
212+
* Return the number of microseconds to wait until stall-on-fault is
213+
* re-enabled. If 0 then it is already enabled or will be re-enabled on the
214+
* next submit (unless there's a leftover devcoredump). This is useful for
215+
* kernel tests that intentionally produce a fault and check the devcoredump to
216+
* wait until the cooldown period is over.
217+
*/
218+
219+
static int
220+
stall_reenable_time_get(void *data, u64 *val)
221+
{
222+
struct msm_drm_private *priv = data;
223+
unsigned long irq_flags;
224+
225+
spin_lock_irqsave(&priv->fault_stall_lock, irq_flags);
226+
227+
if (priv->stall_enabled)
228+
*val = 0;
229+
else
230+
*val = max(ktime_us_delta(priv->stall_reenable_time, ktime_get()), 0);
231+
232+
spin_unlock_irqrestore(&priv->fault_stall_lock, irq_flags);
233+
234+
return 0;
235+
}
236+
237+
DEFINE_DEBUGFS_ATTRIBUTE(stall_reenable_time_fops,
238+
stall_reenable_time_get, NULL,
239+
"%lld\n");
211240

212241
static int msm_gem_show(struct seq_file *m, void *arg)
213242
{
@@ -319,6 +348,9 @@ static void msm_debugfs_gpu_init(struct drm_minor *minor)
319348
debugfs_create_bool("disable_err_irq", 0600, minor->debugfs_root,
320349
&priv->disable_err_irq);
321350

351+
debugfs_create_file("stall_reenable_time_us", 0400, minor->debugfs_root,
352+
priv, &stall_reenable_time_fops);
353+
322354
gpu_devfreq = debugfs_create_dir("devfreq", minor->debugfs_root);
323355

324356
debugfs_create_bool("idle_clamp",0600, gpu_devfreq,

drivers/gpu/drm/msm/msm_drv.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,10 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv)
245245
drm_gem_lru_init(&priv->lru.willneed, &priv->lru.lock);
246246
drm_gem_lru_init(&priv->lru.dontneed, &priv->lru.lock);
247247

248+
/* Initialize stall-on-fault */
249+
spin_lock_init(&priv->fault_stall_lock);
250+
priv->stall_enabled = true;
251+
248252
/* Teach lockdep about lock ordering wrt. shrinker: */
249253
fs_reclaim_acquire(GFP_KERNEL);
250254
might_lock(&priv->lru.lock);

drivers/gpu/drm/msm/msm_drv.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,29 @@ struct msm_drm_private {
222222
* the sw hangcheck mechanism.
223223
*/
224224
bool disable_err_irq;
225+
226+
/**
227+
* @fault_stall_lock:
228+
*
229+
* Serialize changes to stall-on-fault state.
230+
*/
231+
spinlock_t fault_stall_lock;
232+
233+
/**
234+
* @stall_reenable_time:
235+
*
236+
* If stall_enabled is false, when to reenable stall-on-fault.
237+
* Protected by @fault_stall_lock.
238+
*/
239+
ktime_t stall_reenable_time;
240+
241+
/**
242+
* @stall_enabled:
243+
*
244+
* Whether stall-on-fault is currently enabled. Protected by
245+
* @fault_stall_lock.
246+
*/
247+
bool stall_enabled;
225248
};
226249

227250
const struct msm_format *mdp_get_format(struct msm_kms *kms, uint32_t format, uint64_t modifier);

drivers/gpu/drm/msm/msm_iommu.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,14 @@ static int msm_disp_fault_handler(struct iommu_domain *domain, struct device *de
372372
return -ENOSYS;
373373
}
374374

375+
static void msm_iommu_set_stall(struct msm_mmu *mmu, bool enable)
376+
{
377+
struct adreno_smmu_priv *adreno_smmu = dev_get_drvdata(mmu->dev);
378+
379+
if (adreno_smmu->set_stall)
380+
adreno_smmu->set_stall(adreno_smmu->cookie, enable);
381+
}
382+
375383
static void msm_iommu_detach(struct msm_mmu *mmu)
376384
{
377385
struct msm_iommu *iommu = to_msm_iommu(mmu);
@@ -419,6 +427,7 @@ static const struct msm_mmu_funcs funcs = {
419427
.map = msm_iommu_map,
420428
.unmap = msm_iommu_unmap,
421429
.destroy = msm_iommu_destroy,
430+
.set_stall = msm_iommu_set_stall,
422431
};
423432

424433
struct msm_mmu *msm_iommu_new(struct device *dev, unsigned long quirks)

drivers/gpu/drm/msm/msm_mmu.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ struct msm_mmu_funcs {
1515
size_t len, int prot);
1616
int (*unmap)(struct msm_mmu *mmu, uint64_t iova, size_t len);
1717
void (*destroy)(struct msm_mmu *mmu);
18+
void (*set_stall)(struct msm_mmu *mmu, bool enable);
1819
};
1920

2021
enum msm_mmu_type {

0 commit comments

Comments
 (0)