Skip to content

Commit 7f4f4ad

Browse files
committed
Merge tag 'drm-habanalabs-next-2023-06-08' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into drm-next
This tag contains additional habanalabs driver changes for v6.5:

- uAPI changes:
  - Return 0 when user queries if there was a h/w or f/w error and no such error happened. Previously we returned an error in such a case.

- New features and improvements:
  - Add PCI health check when we lose connection with the firmware. This can be used to distinguish between PCI link down and firmware getting stuck.
  - Add more info to the error print when a TPC interrupt occurs.
  - Reduce amount of code under mutex in the command submission of signal event.

- Firmware related fixes:
  - Fixes to the handshake protocol during f/w initialization.
  - Display information that the f/w sends us when encountering a DMA error.
  - Do soft-reset using a message sent to firmware instead of writing to MMIO.
  - Prepare generic code to extract f/w version numbers.

- Bug fixes and code cleanups. Notable fixes are:
  - Unsecure certain TPC registers that the user should access.
  - Fix handling of QMAN errors.
  - Fix memory leak when recording errors (to later pass them to the user).
  - Multiple fixes to razwi interrupt handling code.

Signed-off-by: Dave Airlie <[email protected]>
From: Oded Gabbay <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
2 parents c9b685d + e6f49e9 commit 7f4f4ad

File tree

23 files changed

+557
-696
lines changed

23 files changed

+557
-696
lines changed

drivers/accel/habanalabs/common/command_buffer.c

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,6 @@ static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb)
2727
return -EINVAL;
2828
}
2929

30-
if (!hdev->mmu_enable) {
31-
dev_err_ratelimited(hdev->dev,
32-
"Cannot map CB because MMU is disabled\n");
33-
return -EINVAL;
34-
}
35-
3630
if (cb->is_mmu_mapped)
3731
return 0;
3832

drivers/accel/habanalabs/common/command_submission.c

Lines changed: 31 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -280,14 +280,8 @@ bool cs_needs_timeout(struct hl_cs *cs)
280280

281281
static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job)
282282
{
283-
/*
284-
* Patched CB is created for external queues jobs, and for H/W queues
285-
* jobs if the user CB was allocated by driver and MMU is disabled.
286-
*/
287-
return (job->queue_type == QUEUE_TYPE_EXT ||
288-
(job->queue_type == QUEUE_TYPE_HW &&
289-
job->is_kernel_allocated_cb &&
290-
!hdev->mmu_enable));
283+
/* Patched CB is created for external queues jobs */
284+
return (job->queue_type == QUEUE_TYPE_EXT);
291285
}
292286

293287
/*
@@ -363,14 +357,13 @@ static void hl_complete_job(struct hl_device *hdev, struct hl_cs_job *job)
363357
}
364358
}
365359

366-
/* For H/W queue jobs, if a user CB was allocated by driver and MMU is
367-
* enabled, the user CB isn't released in cs_parser() and thus should be
360+
/* For H/W queue jobs, if a user CB was allocated by driver,
361+
* the user CB isn't released in cs_parser() and thus should be
368362
* released here. This is also true for INT queues jobs which were
369363
* allocated by driver.
370364
*/
371-
if ((job->is_kernel_allocated_cb &&
372-
((job->queue_type == QUEUE_TYPE_HW && hdev->mmu_enable) ||
373-
job->queue_type == QUEUE_TYPE_INT))) {
365+
if (job->is_kernel_allocated_cb &&
366+
(job->queue_type == QUEUE_TYPE_HW || job->queue_type == QUEUE_TYPE_INT)) {
374367
atomic_dec(&job->user_cb->cs_cnt);
375368
hl_cb_put(job->user_cb);
376369
}
@@ -804,12 +797,14 @@ static void cs_do_release(struct kref *ref)
804797

805798
static void cs_timedout(struct work_struct *work)
806799
{
800+
struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work);
801+
bool skip_reset_on_timeout, device_reset = false;
807802
struct hl_device *hdev;
808803
u64 event_mask = 0x0;
804+
uint timeout_sec;
809805
int rc;
810-
struct hl_cs *cs = container_of(work, struct hl_cs,
811-
work_tdr.work);
812-
bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false;
806+
807+
skip_reset_on_timeout = cs->skip_reset_on_timeout;
813808

814809
rc = cs_get_unless_zero(cs);
815810
if (!rc)
@@ -840,29 +835,31 @@ static void cs_timedout(struct work_struct *work)
840835
event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT;
841836
}
842837

838+
timeout_sec = jiffies_to_msecs(hdev->timeout_jiffies) / 1000;
839+
843840
switch (cs->type) {
844841
case CS_TYPE_SIGNAL:
845842
dev_err(hdev->dev,
846-
"Signal command submission %llu has not finished in time!\n",
847-
cs->sequence);
843+
"Signal command submission %llu has not finished in %u seconds!\n",
844+
cs->sequence, timeout_sec);
848845
break;
849846

850847
case CS_TYPE_WAIT:
851848
dev_err(hdev->dev,
852-
"Wait command submission %llu has not finished in time!\n",
853-
cs->sequence);
849+
"Wait command submission %llu has not finished in %u seconds!\n",
850+
cs->sequence, timeout_sec);
854851
break;
855852

856853
case CS_TYPE_COLLECTIVE_WAIT:
857854
dev_err(hdev->dev,
858-
"Collective Wait command submission %llu has not finished in time!\n",
859-
cs->sequence);
855+
"Collective Wait command submission %llu has not finished in %u seconds!\n",
856+
cs->sequence, timeout_sec);
860857
break;
861858

862859
default:
863860
dev_err(hdev->dev,
864-
"Command submission %llu has not finished in time!\n",
865-
cs->sequence);
861+
"Command submission %llu has not finished in %u seconds!\n",
862+
cs->sequence, timeout_sec);
866863
break;
867864
}
868865

@@ -1139,11 +1136,10 @@ static void force_complete_cs(struct hl_device *hdev)
11391136
spin_unlock(&hdev->cs_mirror_lock);
11401137
}
11411138

1142-
void hl_abort_waitings_for_completion(struct hl_device *hdev)
1139+
void hl_abort_waiting_for_cs_completions(struct hl_device *hdev)
11431140
{
11441141
force_complete_cs(hdev);
11451142
force_complete_multi_cs(hdev);
1146-
hl_release_pending_user_interrupts(hdev);
11471143
}
11481144

11491145
static void job_wq_completion(struct work_struct *work)
@@ -1948,8 +1944,7 @@ static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev,
19481944
else
19491945
cb_size = hdev->asic_funcs->get_signal_cb_size(hdev);
19501946

1951-
cb = hl_cb_kernel_create(hdev, cb_size,
1952-
q_type == QUEUE_TYPE_HW && hdev->mmu_enable);
1947+
cb = hl_cb_kernel_create(hdev, cb_size, q_type == QUEUE_TYPE_HW);
19531948
if (!cb) {
19541949
atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
19551950
atomic64_inc(&cntr->out_of_mem_drop_cnt);
@@ -2152,7 +2147,7 @@ static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id)
21522147

21532148
hdev->asic_funcs->hw_queues_unlock(hdev);
21542149
rc = -EINVAL;
2155-
goto out;
2150+
goto out_unlock;
21562151
}
21572152

21582153
/*
@@ -2167,15 +2162,21 @@ static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id)
21672162

21682163
/* Release the id and free allocated memory of the handle */
21692164
idr_remove(&mgr->handles, handle_id);
2165+
2166+
/* unlock before calling ctx_put, where we might sleep */
2167+
spin_unlock(&mgr->lock);
21702168
hl_ctx_put(encaps_sig_hdl->ctx);
21712169
kfree(encaps_sig_hdl);
2170+
goto out;
21722171
} else {
21732172
rc = -EINVAL;
21742173
dev_err(hdev->dev, "failed to unreserve signals, cannot find handler\n");
21752174
}
2176-
out:
2175+
2176+
out_unlock:
21772177
spin_unlock(&mgr->lock);
21782178

2179+
out:
21792180
return rc;
21802181
}
21812182

drivers/accel/habanalabs/common/debugfs.c

Lines changed: 24 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -255,9 +255,6 @@ static int vm_show(struct seq_file *s, void *data)
255255
u64 j;
256256
int i;
257257

258-
if (!dev_entry->hdev->mmu_enable)
259-
return 0;
260-
261258
mutex_lock(&dev_entry->ctx_mem_hash_mutex);
262259

263260
list_for_each_entry(ctx, &dev_entry->ctx_mem_hash_list, debugfs_list) {
@@ -436,9 +433,6 @@ static int mmu_show(struct seq_file *s, void *data)
436433
u64 virt_addr = dev_entry->mmu_addr, phys_addr;
437434
int i;
438435

439-
if (!hdev->mmu_enable)
440-
return 0;
441-
442436
if (dev_entry->mmu_asid == HL_KERNEL_ASID_ID)
443437
ctx = hdev->kernel_ctx;
444438
else
@@ -496,9 +490,6 @@ static ssize_t mmu_asid_va_write(struct file *file, const char __user *buf,
496490
char *c;
497491
ssize_t rc;
498492

499-
if (!hdev->mmu_enable)
500-
return count;
501-
502493
if (count > sizeof(kbuf) - 1)
503494
goto err;
504495
if (copy_from_user(kbuf, buf, count))
@@ -535,9 +526,6 @@ static int mmu_ack_error(struct seq_file *s, void *data)
535526
struct hl_device *hdev = dev_entry->hdev;
536527
int rc;
537528

538-
if (!hdev->mmu_enable)
539-
return 0;
540-
541529
if (!dev_entry->mmu_cap_mask) {
542530
dev_err(hdev->dev, "mmu_cap_mask is not set\n");
543531
goto err;
@@ -563,9 +551,6 @@ static ssize_t mmu_ack_error_value_write(struct file *file,
563551
char kbuf[MMU_KBUF_SIZE];
564552
ssize_t rc;
565553

566-
if (!hdev->mmu_enable)
567-
return count;
568-
569554
if (count > sizeof(kbuf) - 1)
570555
goto err;
571556

@@ -661,9 +646,6 @@ static bool hl_is_device_va(struct hl_device *hdev, u64 addr)
661646
{
662647
struct asic_fixed_properties *prop = &hdev->asic_prop;
663648

664-
if (!hdev->mmu_enable)
665-
goto out;
666-
667649
if (prop->dram_supports_virtual_memory &&
668650
(addr >= prop->dmmu.start_addr && addr < prop->dmmu.end_addr))
669651
return true;
@@ -675,7 +657,7 @@ static bool hl_is_device_va(struct hl_device *hdev, u64 addr)
675657
if (addr >= prop->pmmu_huge.start_addr &&
676658
addr < prop->pmmu_huge.end_addr)
677659
return true;
678-
out:
660+
679661
return false;
680662
}
681663

@@ -685,9 +667,6 @@ static bool hl_is_device_internal_memory_va(struct hl_device *hdev, u64 addr,
685667
struct asic_fixed_properties *prop = &hdev->asic_prop;
686668
u64 dram_start_addr, dram_end_addr;
687669

688-
if (!hdev->mmu_enable)
689-
return false;
690-
691670
if (prop->dram_supports_virtual_memory) {
692671
dram_start_addr = prop->dmmu.start_addr;
693672
dram_end_addr = prop->dmmu.end_addr;
@@ -1756,17 +1735,15 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
17561735
}
17571736
}
17581737

1759-
void hl_debugfs_add_device(struct hl_device *hdev)
1738+
int hl_debugfs_device_init(struct hl_device *hdev)
17601739
{
17611740
struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs;
17621741
int count = ARRAY_SIZE(hl_debugfs_list);
17631742

17641743
dev_entry->hdev = hdev;
1765-
dev_entry->entry_arr = kmalloc_array(count,
1766-
sizeof(struct hl_debugfs_entry),
1767-
GFP_KERNEL);
1744+
dev_entry->entry_arr = kmalloc_array(count, sizeof(struct hl_debugfs_entry), GFP_KERNEL);
17681745
if (!dev_entry->entry_arr)
1769-
return;
1746+
return -ENOMEM;
17701747

17711748
dev_entry->data_dma_blob_desc.size = 0;
17721749
dev_entry->data_dma_blob_desc.data = NULL;
@@ -1787,21 +1764,14 @@ void hl_debugfs_add_device(struct hl_device *hdev)
17871764
spin_lock_init(&dev_entry->userptr_spinlock);
17881765
mutex_init(&dev_entry->ctx_mem_hash_mutex);
17891766

1790-
dev_entry->root = debugfs_create_dir(dev_name(hdev->dev),
1791-
hl_debug_root);
1792-
1793-
add_files_to_device(hdev, dev_entry, dev_entry->root);
1794-
if (!hdev->asic_prop.fw_security_enabled)
1795-
add_secured_nodes(dev_entry, dev_entry->root);
1767+
return 0;
17961768
}
17971769

1798-
void hl_debugfs_remove_device(struct hl_device *hdev)
1770+
void hl_debugfs_device_fini(struct hl_device *hdev)
17991771
{
18001772
struct hl_dbg_device_entry *entry = &hdev->hl_debugfs;
18011773
int i;
18021774

1803-
debugfs_remove_recursive(entry->root);
1804-
18051775
mutex_destroy(&entry->ctx_mem_hash_mutex);
18061776
mutex_destroy(&entry->file_mutex);
18071777

@@ -1814,6 +1784,24 @@ void hl_debugfs_remove_device(struct hl_device *hdev)
18141784
kfree(entry->entry_arr);
18151785
}
18161786

1787+
void hl_debugfs_add_device(struct hl_device *hdev)
1788+
{
1789+
struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs;
1790+
1791+
dev_entry->root = debugfs_create_dir(dev_name(hdev->dev), hl_debug_root);
1792+
1793+
add_files_to_device(hdev, dev_entry, dev_entry->root);
1794+
if (!hdev->asic_prop.fw_security_enabled)
1795+
add_secured_nodes(dev_entry, dev_entry->root);
1796+
}
1797+
1798+
void hl_debugfs_remove_device(struct hl_device *hdev)
1799+
{
1800+
struct hl_dbg_device_entry *entry = &hdev->hl_debugfs;
1801+
1802+
debugfs_remove_recursive(entry->root);
1803+
}
1804+
18171805
void hl_debugfs_add_file(struct hl_fpriv *hpriv)
18181806
{
18191807
struct hl_dbg_device_entry *dev_entry = &hpriv->hdev->hl_debugfs;

0 commit comments

Comments
 (0)