Skip to content

Commit 02039b1

Browse files
committed
Merge tag 'misc-habanalabs-fixes-2021-01-13' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into char-misc-linus
Oded writes: This tag contains the following bug fixes: - Fix the dma address that is passed to dma_mmap_coherent. We passed an address that includes an offset that is needed by our device and that caused dma_mmap_coherent to do an erroneous mapping. - Fix the reset process in case failures happen during the reset process. Without this fix, if the user asked to perform a reset after the previous reset had failed, they would get a kernel panic. - Workaround (WA) to prevent soft lockup BUG during unmap of host memory. In case of tens of thousands of mappings, the unmapping can take a long time that exceeds the soft lockup timeout. This WA adds a small sleep after every 32K page unmappings to prevent that. * tag 'misc-habanalabs-fixes-2021-01-13' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux: habanalabs: prevent soft lockup during unmap habanalabs: fix reset process in case of failures habanalabs: fix dma_addr passed to dma_mmap_coherent
2 parents f970d1d + 9488307 commit 02039b1

File tree

7 files changed

+27
-10
lines changed

7 files changed

+27
-10
lines changed

drivers/misc/habanalabs/common/device.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1037,7 +1037,7 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
10371037

10381038
if (hard_reset) {
10391039
/* Release kernel context */
1040-
if (hl_ctx_put(hdev->kernel_ctx) == 1)
1040+
if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1)
10411041
hdev->kernel_ctx = NULL;
10421042
hl_vm_fini(hdev);
10431043
hl_mmu_fini(hdev);

drivers/misc/habanalabs/common/habanalabs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2182,6 +2182,7 @@ void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu);
21822182
int hl_mmu_va_to_pa(struct hl_ctx *ctx, u64 virt_addr, u64 *phys_addr);
21832183
int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
21842184
struct hl_mmu_hop_info *hops);
2185+
bool hl_is_dram_va(struct hl_device *hdev, u64 virt_addr);
21852186

21862187
int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
21872188
void __iomem *dst, u32 src_offset, u32 size);

drivers/misc/habanalabs/common/memory.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -886,8 +886,10 @@ static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
886886
{
887887
struct hl_device *hdev = ctx->hdev;
888888
u64 next_vaddr, i;
889+
bool is_host_addr;
889890
u32 page_size;
890891

892+
is_host_addr = !hl_is_dram_va(hdev, vaddr);
891893
page_size = phys_pg_pack->page_size;
892894
next_vaddr = vaddr;
893895

@@ -900,9 +902,13 @@ static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
900902
/*
901903
* unmapping on Palladium can be really long, so avoid a CPU
902904
* soft lockup bug by sleeping a little between unmapping pages
905+
*
906+
* In addition, when unmapping host memory we pass through
907+
* the Linux kernel to unpin the pages and that takes a long
908+
* time. Therefore, sleep every 32K pages to avoid soft lockup
903909
*/
904-
if (hdev->pldm)
905-
usleep_range(500, 1000);
910+
if (hdev->pldm || (is_host_addr && (i & 0x7FFF) == 0))
911+
usleep_range(50, 200);
906912
}
907913
}
908914

drivers/misc/habanalabs/common/mmu.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
#include "habanalabs.h"
1111

12-
static bool is_dram_va(struct hl_device *hdev, u64 virt_addr)
12+
bool hl_is_dram_va(struct hl_device *hdev, u64 virt_addr)
1313
{
1414
struct asic_fixed_properties *prop = &hdev->asic_prop;
1515

@@ -156,7 +156,7 @@ int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
156156
if (!hdev->mmu_enable)
157157
return 0;
158158

159-
is_dram_addr = is_dram_va(hdev, virt_addr);
159+
is_dram_addr = hl_is_dram_va(hdev, virt_addr);
160160

161161
if (is_dram_addr)
162162
mmu_prop = &prop->dmmu;
@@ -236,7 +236,7 @@ int hl_mmu_map_page(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
236236
if (!hdev->mmu_enable)
237237
return 0;
238238

239-
is_dram_addr = is_dram_va(hdev, virt_addr);
239+
is_dram_addr = hl_is_dram_va(hdev, virt_addr);
240240

241241
if (is_dram_addr)
242242
mmu_prop = &prop->dmmu;

drivers/misc/habanalabs/common/mmu_v1.c

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -467,8 +467,16 @@ static void hl_mmu_v1_fini(struct hl_device *hdev)
467467
{
468468
/* MMU H/W fini was already done in device hw_fini() */
469469

470-
kvfree(hdev->mmu_priv.dr.mmu_shadow_hop0);
471-
gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool);
470+
if (!ZERO_OR_NULL_PTR(hdev->mmu_priv.hr.mmu_shadow_hop0)) {
471+
kvfree(hdev->mmu_priv.dr.mmu_shadow_hop0);
472+
gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool);
473+
}
474+
475+
/* Make sure that if we arrive here again without init was called we
476+
* won't cause kernel panic. This can happen for example if we fail
477+
* during hard reset code at certain points
478+
*/
479+
hdev->mmu_priv.dr.mmu_shadow_hop0 = NULL;
472480
}
473481

474482
/**

drivers/misc/habanalabs/gaudi/gaudi.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4002,7 +4002,8 @@ static int gaudi_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
40024002
vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP |
40034003
VM_DONTCOPY | VM_NORESERVE;
40044004

4005-
rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, dma_addr, size);
4005+
rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr,
4006+
(dma_addr - HOST_PHYS_BASE), size);
40064007
if (rc)
40074008
dev_err(hdev->dev, "dma_mmap_coherent error %d", rc);
40084009

drivers/misc/habanalabs/goya/goya.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2719,7 +2719,8 @@ static int goya_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
27192719
vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP |
27202720
VM_DONTCOPY | VM_NORESERVE;
27212721

2722-
rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, dma_addr, size);
2722+
rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr,
2723+
(dma_addr - HOST_PHYS_BASE), size);
27232724
if (rc)
27242725
dev_err(hdev->dev, "dma_mmap_coherent error %d", rc);
27252726

0 commit comments

Comments
 (0)