Skip to content

Commit 9488307

Browse files
committed
habanalabs: prevent soft lockup during unmap
When using a deep learning framework such as TensorFlow or PyTorch, there are tens of thousands of host memory mappings. When the user frees all of those mappings at the same time, unmapping and unpinning them can take a long time, which may trigger a soft lockup. To prevent this, we need to yield the CPU core periodically during the unmapping process so that it can do other work. For now, we chose to do so every 32K unmappings (each unmap is a single 4K page). Signed-off-by: Oded Gabbay <[email protected]>
1 parent aa6df65 commit 9488307

File tree

3 files changed

+12
-5
lines changed

3 files changed

+12
-5
lines changed

drivers/misc/habanalabs/common/habanalabs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2182,6 +2182,7 @@ void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu);
21822182
int hl_mmu_va_to_pa(struct hl_ctx *ctx, u64 virt_addr, u64 *phys_addr);
21832183
int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
21842184
struct hl_mmu_hop_info *hops);
2185+
bool hl_is_dram_va(struct hl_device *hdev, u64 virt_addr);
21852186

21862187
int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
21872188
void __iomem *dst, u32 src_offset, u32 size);

drivers/misc/habanalabs/common/memory.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -886,8 +886,10 @@ static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
886886
{
887887
struct hl_device *hdev = ctx->hdev;
888888
u64 next_vaddr, i;
889+
bool is_host_addr;
889890
u32 page_size;
890891

892+
is_host_addr = !hl_is_dram_va(hdev, vaddr);
891893
page_size = phys_pg_pack->page_size;
892894
next_vaddr = vaddr;
893895

@@ -900,9 +902,13 @@ static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
900902
/*
901903
* unmapping on Palladium can be really long, so avoid a CPU
902904
* soft lockup bug by sleeping a little between unmapping pages
905+
*
906+
* In addition, when unmapping host memory we pass through
907+
* the Linux kernel to unpin the pages and that takes a long
908+
* time. Therefore, sleep every 32K pages to avoid soft lockup
903909
*/
904-
if (hdev->pldm)
905-
usleep_range(500, 1000);
910+
if (hdev->pldm || (is_host_addr && (i & 0x7FFF) == 0))
911+
usleep_range(50, 200);
906912
}
907913
}
908914

drivers/misc/habanalabs/common/mmu.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
#include "habanalabs.h"
1111

12-
static bool is_dram_va(struct hl_device *hdev, u64 virt_addr)
12+
bool hl_is_dram_va(struct hl_device *hdev, u64 virt_addr)
1313
{
1414
struct asic_fixed_properties *prop = &hdev->asic_prop;
1515

@@ -156,7 +156,7 @@ int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
156156
if (!hdev->mmu_enable)
157157
return 0;
158158

159-
is_dram_addr = is_dram_va(hdev, virt_addr);
159+
is_dram_addr = hl_is_dram_va(hdev, virt_addr);
160160

161161
if (is_dram_addr)
162162
mmu_prop = &prop->dmmu;
@@ -236,7 +236,7 @@ int hl_mmu_map_page(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
236236
if (!hdev->mmu_enable)
237237
return 0;
238238

239-
is_dram_addr = is_dram_va(hdev, virt_addr);
239+
is_dram_addr = hl_is_dram_va(hdev, virt_addr);
240240

241241
if (is_dram_addr)
242242
mmu_prop = &prop->dmmu;

0 commit comments

Comments
 (0)