Skip to content

Commit b5cddeb

Browse files
sharleycalzolari authored and KobyElbaz committed
accel/habanalabs/gaudi2: add support for logging register accesses from debugfs
Add infrastructure for logging the last configuration register accesses that occur via debugfs read/write operations. At interrupt time, these log entries can be dumped to dmesg, which helps in diagnosing the cause of RAZWI and ADDR_DEC interrupts. The logging is implemented as a ring buffer of access entries, with each entry recording timestamp and access details. To ensure correctness under concurrent access, operations are now protected using spinlocks. Entries are copied under lock and then printed after releasing it, which minimizes time spent in the critical section. Signed-off-by: Sharley Calzolari <[email protected]> Reviewed-by: Koby Elbaz <[email protected]> Signed-off-by: Koby Elbaz <[email protected]>
1 parent 214e26a commit b5cddeb

File tree

3 files changed

+148
-1
lines changed

3 files changed

+148
-1
lines changed

drivers/accel/habanalabs/common/debugfs.c

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -788,6 +788,113 @@ static void hl_access_host_mem(struct hl_device *hdev, u64 addr, u64 *val,
788788
}
789789
}
790790

791+
static void dump_cfg_access_entry(struct hl_device *hdev,
792+
struct hl_debugfs_cfg_access_entry *entry)
793+
{
794+
char *access_type = "";
795+
struct tm tm;
796+
797+
switch (entry->debugfs_type) {
798+
case DEBUGFS_READ32:
799+
access_type = "READ32 from";
800+
break;
801+
case DEBUGFS_WRITE32:
802+
access_type = "WRITE32 to";
803+
break;
804+
case DEBUGFS_READ64:
805+
access_type = "READ64 from";
806+
break;
807+
case DEBUGFS_WRITE64:
808+
access_type = "WRITE64 to";
809+
break;
810+
default:
811+
dev_err(hdev->dev, "Invalid DEBUGFS access type (%u)\n", entry->debugfs_type);
812+
return;
813+
}
814+
815+
time64_to_tm(entry->seconds_since_epoch, 0, &tm);
816+
dev_info(hdev->dev,
817+
"%ld-%02d-%02d %02d:%02d:%02d (UTC): %s %#llx\n", tm.tm_year + 1900, tm.tm_mon + 1,
818+
tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, access_type, entry->addr);
819+
}
820+
821+
void hl_debugfs_cfg_access_history_dump(struct hl_device *hdev)
822+
{
823+
struct hl_debugfs_cfg_access *dbgfs = &hdev->debugfs_cfg_accesses;
824+
u32 i, head, count = 0;
825+
time64_t entry_time, now;
826+
unsigned long flags;
827+
828+
now = ktime_get_real_seconds();
829+
830+
spin_lock_irqsave(&dbgfs->lock, flags);
831+
head = dbgfs->head;
832+
if (head == 0)
833+
i = HL_DBGFS_CFG_ACCESS_HIST_LEN - 1;
834+
else
835+
i = head - 1;
836+
837+
/* Walk back until timeout or invalid entry */
838+
while (dbgfs->cfg_access_list[i].valid) {
839+
entry_time = dbgfs->cfg_access_list[i].seconds_since_epoch;
840+
/* Stop when entry is older than timeout */
841+
if (now - entry_time > HL_DBGFS_CFG_ACCESS_HIST_TIMEOUT_SEC)
842+
break;
843+
844+
/* print single entry under lock */
845+
{
846+
struct hl_debugfs_cfg_access_entry entry = dbgfs->cfg_access_list[i];
847+
/*
848+
* We copy the entry out under lock and then print after
849+
* releasing the lock to minimize time under lock.
850+
*/
851+
spin_unlock_irqrestore(&dbgfs->lock, flags);
852+
dump_cfg_access_entry(hdev, &entry);
853+
spin_lock_irqsave(&dbgfs->lock, flags);
854+
}
855+
856+
/* mark consumed */
857+
dbgfs->cfg_access_list[i].valid = false;
858+
859+
if (i == 0)
860+
i = HL_DBGFS_CFG_ACCESS_HIST_LEN - 1;
861+
else
862+
i--;
863+
count++;
864+
if (count >= HL_DBGFS_CFG_ACCESS_HIST_LEN)
865+
break;
866+
}
867+
spin_unlock_irqrestore(&dbgfs->lock, flags);
868+
}
869+
870+
/*
 * check_if_cfg_access_and_log() - log a debugfs access if it targets the config region.
 * @hdev: pointer to the device structure.
 * @addr: device address being accessed.
 * @access_size: size of the access in bytes.
 * @access_type: the debugfs operation being performed.
 *
 * When the whole access [addr, addr + access_size) lies inside the
 * PCI_REGION_CFG memory region, a timestamped entry is stored at the head of
 * the access ring buffer and the head is advanced (wrapping around at
 * HL_DBGFS_CFG_ACCESS_HIST_LEN). Accesses outside the region are ignored.
 */
static void check_if_cfg_access_and_log(struct hl_device *hdev, u64 addr, size_t access_size,
						enum debugfs_access_type access_type)
{
	struct pci_mem_region *cfg_region = &hdev->pci_mem_region[PCI_REGION_CFG];
	struct hl_debugfs_cfg_access *history = &hdev->debugfs_cfg_accesses;
	struct hl_debugfs_cfg_access_entry *slot;
	unsigned long flags;

	/* Access starts below the config region */
	if (addr < cfg_region->region_base)
		return;

	/* Access is larger than the region; also keeps the subtraction below from wrapping */
	if (cfg_region->region_size < access_size)
		return;

	/* Access ends past the end of the config region */
	if (addr > cfg_region->region_base + cfg_region->region_size - access_size)
		return;

	spin_lock_irqsave(&history->lock, flags);

	slot = &history->cfg_access_list[history->head];
	slot->seconds_since_epoch = ktime_get_real_seconds();
	slot->addr = addr;
	slot->debugfs_type = access_type;
	slot->valid = true;

	/* Advance to the next slot, overwriting the oldest entry once the ring is full */
	history->head = (history->head + 1) % HL_DBGFS_CFG_ACCESS_HIST_LEN;

	spin_unlock_irqrestore(&history->lock, flags);
}
897+
791898
static int hl_access_mem(struct hl_device *hdev, u64 addr, u64 *val,
792899
enum debugfs_access_type acc_type)
793900
{
@@ -805,6 +912,7 @@ static int hl_access_mem(struct hl_device *hdev, u64 addr, u64 *val,
805912
return rc;
806913
}
807914

915+
check_if_cfg_access_and_log(hdev, addr, acc_size, acc_type);
808916
rc = hl_access_dev_mem_by_region(hdev, addr, val, acc_type, &found);
809917
if (rc) {
810918
dev_err(hdev->dev,
@@ -1762,6 +1870,9 @@ int hl_debugfs_device_init(struct hl_device *hdev)
17621870
spin_lock_init(&dev_entry->userptr_spinlock);
17631871
mutex_init(&dev_entry->ctx_mem_hash_mutex);
17641872

1873+
spin_lock_init(&hdev->debugfs_cfg_accesses.lock);
1874+
hdev->debugfs_cfg_accesses.head = 0; /* already zero by alloc but explicit init is fine */
1875+
17651876
return 0;
17661877
}
17671878

drivers/accel/habanalabs/common/habanalabs.h

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,9 @@ struct hl_fpriv;
9090
#define HL_COMMON_USER_CQ_INTERRUPT_ID 0xFFF
9191
#define HL_COMMON_DEC_INTERRUPT_ID 0xFFE
9292

93-
#define HL_STATE_DUMP_HIST_LEN 5
93+
#define HL_STATE_DUMP_HIST_LEN 5
94+
#define HL_DBGFS_CFG_ACCESS_HIST_LEN 20
95+
#define HL_DBGFS_CFG_ACCESS_HIST_TIMEOUT_SEC 2 /* 2s */
9496

9597
/* Default value for device reset trigger , an invalid value */
9698
#define HL_RESET_TRIGGER_DEFAULT 0xFF
@@ -2436,6 +2438,32 @@ struct hl_dbg_device_entry {
24362438
u8 i2c_len;
24372439
};
24382440

2441+
/**
2442+
* struct hl_debugfs_cfg_access_entry - single debugfs config access object, member of
2443+
* hl_debugfs_cfg_access.
2444+
* @seconds_since_epoch: seconds since January 1, 1970, used for time comparisons.
2445+
* @debugfs_type: the debugfs operation requested, can be READ32, WRITE32, READ64 or WRITE64.
2446+
* @addr: the requested address to access.
2447+
* @valid: if set, this entry has valid data for dumping at interrupt time.
2448+
*/
2449+
struct hl_debugfs_cfg_access_entry {
2450+
ktime_t seconds_since_epoch;
2451+
enum debugfs_access_type debugfs_type;
2452+
u64 addr;
2453+
bool valid;
2454+
};
2455+
2456+
/**
2457+
* struct hl_debugfs_cfg_access - saves debugfs config region access requests history.
2458+
* @cfg_access_list: list of objects describing config region access requests.
2459+
* @head: next valid index to add new entry to in cfg_access_list.
2460+
*/
2461+
struct hl_debugfs_cfg_access {
2462+
struct hl_debugfs_cfg_access_entry cfg_access_list[HL_DBGFS_CFG_ACCESS_HIST_LEN];
2463+
u32 head;
2464+
spinlock_t lock; /* protects head and entries */
2465+
};
2466+
24392467
/**
24402468
* struct hl_hw_obj_name_entry - single hw object name, member of
24412469
* hl_state_dump_specs
@@ -3281,6 +3309,7 @@ struct eq_heartbeat_debug_info {
32813309
* @hl_chip_info: ASIC's sensors information.
32823310
* @device_status_description: device status description.
32833311
* @hl_debugfs: device's debugfs manager.
3312+
* @debugfs_cfg_accesses: list of last debugfs config region accesses.
32843313
* @cb_pool: list of pre allocated CBs.
32853314
* @cb_pool_lock: protects the CB pool.
32863315
* @internal_cb_pool_virt_addr: internal command buffer pool virtual address.
@@ -3461,6 +3490,7 @@ struct hl_device {
34613490
struct hwmon_chip_info *hl_chip_info;
34623491

34633492
struct hl_dbg_device_entry hl_debugfs;
3493+
struct hl_debugfs_cfg_access debugfs_cfg_accesses;
34643494

34653495
struct list_head cb_pool;
34663496
spinlock_t cb_pool_lock;
@@ -4110,6 +4140,7 @@ void hl_debugfs_add_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx);
41104140
void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx);
41114141
void hl_debugfs_set_state_dump(struct hl_device *hdev, char *data,
41124142
unsigned long length);
4143+
void hl_debugfs_cfg_access_history_dump(struct hl_device *hdev);
41134144

41144145
#else
41154146

@@ -4185,6 +4216,10 @@ static inline void hl_debugfs_set_state_dump(struct hl_device *hdev,
41854216
{
41864217
}
41874218

4219+
/*
 * Stub used in the #else configuration branch (the controlling condition is
 * not visible in this hunk): there is no access history to dump, so this
 * does nothing.
 */
static inline void hl_debugfs_cfg_access_history_dump(struct hl_device *hdev)
{
}
4222+
41884223
#endif
41894224

41904225
/* Security */

drivers/accel/habanalabs/gaudi2/gaudi2.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10610,6 +10610,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
1061010610
if (event_mask & HL_NOTIFIER_EVENT_GENERAL_HW_ERR)
1061110611
hl_handle_critical_hw_err(hdev, event_type, &event_mask);
1061210612

10613+
hl_debugfs_cfg_access_history_dump(hdev);
1061310614
event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
1061410615
hl_device_cond_reset(hdev, reset_flags, event_mask);
1061510616
}

0 commit comments

Comments
 (0)