
Commit f42e8e5

jchu314atgithub authored and djbw committed
pmem: implement pmem_recovery_write()
The recovery write thread starts out as a normal pwrite thread; when the filesystem is told about a potential media error in the range, it turns the normal pwrite into a dax_recovery_write. The recovery write consists of clearing the media poison, clearing the page HWPoison bit, re-enabling page-wide read-write permission, flushing the caches, and finally performing the write. A competing pread thread is held off during the recovery process since data read back might not be valid; this is achieved by clearing the badblock records only after the recovery write is complete. Competing recovery write threads are already serialized by the writer lock held by dax_iomap_rw().

Signed-off-by: Jane Chu <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
Link: https://lore.kernel.org/r/165247997655.53156.8381418704988035976.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <[email protected]>
1 parent 9409c9b commit f42e8e5
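
For context, here is an editor's sketch (not code from this commit) of how a filesystem-side write is expected to fall back to a recovery write: when dax_direct_access() reports poison with -EIO on a write, the range is remapped in DAX_RECOVERY_WRITE mode and the data is pushed through dax_recovery_write() instead of a plain flushcache copy. The wrapper name dax_write_or_recover is invented for illustration; dax_direct_access(), dax_recovery_write(), dax_copy_from_iter(), iov_iter_rw() and the DAX_ACCESS/DAX_RECOVERY_WRITE modes are the kernel interfaces this series builds on, and the real logic lives in fs/dax.c's dax_iomap_iter().

#include <linux/dax.h>
#include <linux/uio.h>

/* Illustrative only: a simplified stand-in for the fs/dax.c write path. */
static size_t dax_write_or_recover(struct dax_device *dax_dev, pgoff_t pgoff,
		long nr_pages, size_t bytes, struct iov_iter *iter)
{
	void *kaddr;
	long map_len;

	map_len = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS,
			&kaddr, NULL);
	if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
		/*
		 * Poison in the range: remap in recovery mode, then let the
		 * driver's recovery_write op clear the poison and write.
		 */
		map_len = dax_direct_access(dax_dev, pgoff, nr_pages,
				DAX_RECOVERY_WRITE, &kaddr, NULL);
		if (map_len > 0)
			return dax_recovery_write(dax_dev, pgoff, kaddr,
					bytes, iter);
	}
	if (map_len < 0)
		return 0;
	return dax_copy_from_iter(dax_dev, pgoff, kaddr, bytes, iter);
}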

File tree

1 file changed: +79 −8 lines changed

drivers/nvdimm/pmem.c

Lines changed: 79 additions & 8 deletions
@@ -266,21 +266,43 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 		pfn_t *pfn)
 {
 	resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
-
-	if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
-					PFN_PHYS(nr_pages))))
-		return -EIO;
+	sector_t sector = PFN_PHYS(pgoff) >> SECTOR_SHIFT;
+	unsigned int num = PFN_PHYS(nr_pages) >> SECTOR_SHIFT;
+	struct badblocks *bb = &pmem->bb;
+	sector_t first_bad;
+	int num_bad;
 
 	if (kaddr)
 		*kaddr = pmem->virt_addr + offset;
 	if (pfn)
 		*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
 
+	if (bb->count &&
+	    badblocks_check(bb, sector, num, &first_bad, &num_bad)) {
+		long actual_nr;
+
+		if (mode != DAX_RECOVERY_WRITE)
+			return -EIO;
+
+		/*
+		 * The recovery stride is set to the kernel page size because
+		 * the underlying driver and firmware clear-poison functions
+		 * don't appear to handle large chunks (such as 2MiB) reliably.
+		 */
+		actual_nr = PHYS_PFN(
+			PAGE_ALIGN((first_bad - sector) << SECTOR_SHIFT));
+		dev_dbg(pmem->bb.dev, "start sector(%llu), nr_pages(%ld), first_bad(%llu), actual_nr(%ld)\n",
+				sector, nr_pages, first_bad, actual_nr);
+		if (actual_nr)
+			return actual_nr;
+		return 1;
+	}
+
 	/*
-	 * If badblocks are present, limit known good range to the
-	 * requested range.
+	 * If badblocks are present but not in the range, limit known good range
+	 * to the requested range.
 	 */
-	if (unlikely(pmem->bb.count))
+	if (bb->count)
 		return nr_pages;
 	return PHYS_PFN(pmem->size - pmem->pfn_pad - offset);
 }
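
As a concrete illustration of the recovery stride above (editor's example, assuming 4 KiB pages and 512-byte sectors): if the requested range starts at sector 8 and badblocks_check() reports first_bad = 16, there are 8 good sectors (4096 bytes) before the poison, so the function reports one known-good page and the caller handles the poisoned page separately; if the very first sector is already bad, actual_nr is 0 and the function returns 1, i.e. recovery proceeds one page at a time. The same arithmetic in standalone, userspace-compilable form (the macros below mimic the kernel helpers and are not the kernel code):

#include <stdio.h>

#define SECTOR_SHIFT	9
#define PAGE_SIZE	4096ULL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define PHYS_PFN(x)	((long)((x) / PAGE_SIZE))

int main(void)
{
	unsigned long long sector = 8;		/* start of the requested range */
	unsigned long long first_bad = 16;	/* first poisoned sector found */

	/* 8 good sectors * 512 bytes = 4096 bytes -> 1 good page before poison */
	long actual_nr = PHYS_PFN(PAGE_ALIGN((first_bad - sector) << SECTOR_SHIFT));

	printf("actual_nr = %ld\n", actual_nr);	/* prints 1 */
	return 0;
}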
@@ -310,10 +332,59 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
 	return __pmem_direct_access(pmem, pgoff, nr_pages, mode, kaddr, pfn);
 }
 
+/*
+ * The recovery write thread starts out as a normal pwrite thread; when
+ * the filesystem is told about a potential media error in the range,
+ * it turns the normal pwrite into a dax_recovery_write.
+ *
+ * The recovery write consists of clearing the media poison, clearing the
+ * page HWPoison bit, re-enabling page-wide read-write permission, flushing
+ * the caches, and finally performing the write.  A competing pread thread
+ * is held off during the recovery process since data read back might not
+ * be valid; this is achieved by clearing the badblock records only after
+ * the recovery write is complete.  Competing recovery write threads are
+ * already serialized by the writer lock held by dax_iomap_rw().
+ */
 static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
 		void *addr, size_t bytes, struct iov_iter *i)
 {
-	return 0;
+	struct pmem_device *pmem = dax_get_private(dax_dev);
+	size_t olen, len, off;
+	phys_addr_t pmem_off;
+	struct device *dev = pmem->bb.dev;
+	long cleared;
+
+	off = offset_in_page(addr);
+	len = PFN_PHYS(PFN_UP(off + bytes));
+	if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) >> SECTOR_SHIFT, len))
+		return _copy_from_iter_flushcache(addr, bytes, i);
+
+	/*
+	 * A range that is not page-aligned cannot be recovered. This should
+	 * not happen unless something else went wrong.
+	 */
+	if (off || !PAGE_ALIGNED(bytes)) {
+		dev_dbg(dev, "Found poison, but addr(%p) or bytes(%#zx) not page aligned\n",
+				addr, bytes);
+		return 0;
+	}
+
+	pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
+	cleared = __pmem_clear_poison(pmem, pmem_off, len);
+	if (cleared > 0 && cleared < len) {
+		dev_dbg(dev, "poison cleared only %ld out of %zu bytes\n",
+				cleared, len);
+		return 0;
+	}
+	if (cleared < 0) {
+		dev_dbg(dev, "poison clear failed: %ld\n", cleared);
+		return 0;
+	}
+
+	olen = _copy_from_iter_flushcache(addr, bytes, i);
+	pmem_clear_bb(pmem, to_sect(pmem, pmem_off), cleared >> SECTOR_SHIFT);
+
+	return olen;
 }
 
 static const struct dax_operations pmem_dax_ops = {