Skip to content

Commit b279b1f

Browse files
committed
Merge tag 'misc-habanalabs-fixes-2020-07-10' of git://people.freedesktop.org/~gabbayo/linux into char-misc-linus
This tag contains the following fixes for 5.8-rc4/5: - Prevent the user from using the WREG_BULK command in the PCI DMA channel. The command won't be parsed correctly by the driver and will cause unknown behavior. As the user doesn't need to use that command in that channel, it's better to just prevent it completely. - Change the interface of the clock gating debugfs property from true/false to a bitmask with a bit per engine. This will allow the user to debug the ASIC while disabling the clock gating feature with fine-grained granularity. - Increase message-to-ASIC-CPU timeout to 4s (from 100ms/1s). The ASIC CPU might sometimes respond after a large delay due to slow external interfaces (such as temperature sensors) and that will result in a driver timeout which will lead to ASIC reset. * tag 'misc-habanalabs-fixes-2020-07-10' of git://people.freedesktop.org/~gabbayo/linux: habanalabs: set 4s timeout for message to device CPU habanalabs: set clock gating per engine habanalabs: block WREG_BULK packet on PDMA
2 parents 3e543a4 + 788cacf commit b279b1f

File tree

10 files changed

+147
-93
lines changed

10 files changed

+147
-93
lines changed

Documentation/ABI/testing/debugfs-driver-habanalabs

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,16 @@ Description: Allow the root user to disable/enable in runtime the clock
1616
gating mechanism in Gaudi. Due to how Gaudi is built, the
1717
clock gating needs to be disabled in order to access the
1818
registers of the TPC and MME engines. This is sometimes needed
19-
during debug by the user and hence the user needs this option
19+
during debug by the user and hence the user needs this option.
20+
The user can supply a bitmask value, where each bit represents
21+
a different engine to disable/enable its clock gating feature.
22+
The bitmask is composed of 20 bits:
23+
0 - 7 : DMA channels
24+
8 - 11 : MME engines
25+
12 - 19 : TPC engines
26+
The bit location of a specific engine can be determined
27+
using (1 << GAUDI_ENGINE_ID_*). GAUDI_ENGINE_ID_* values
28+
are defined in uapi habanalabs.h file in enum gaudi_engine_id
2029

2130
What: /sys/kernel/debug/habanalabs/hl<n>/command_buffers
2231
Date: Jan 2019

drivers/misc/habanalabs/debugfs.c

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
3636
pkt.i2c_reg = i2c_reg;
3737

3838
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
39-
HL_DEVICE_TIMEOUT_USEC, (long *) val);
39+
0, (long *) val);
4040

4141
if (rc)
4242
dev_err(hdev->dev, "Failed to read from I2C, error %d\n", rc);
@@ -63,7 +63,7 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
6363
pkt.value = cpu_to_le64(val);
6464

6565
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
66-
HL_DEVICE_TIMEOUT_USEC, NULL);
66+
0, NULL);
6767

6868
if (rc)
6969
dev_err(hdev->dev, "Failed to write to I2C, error %d\n", rc);
@@ -87,7 +87,7 @@ static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state)
8787
pkt.value = cpu_to_le64(state);
8888

8989
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
90-
HL_DEVICE_TIMEOUT_USEC, NULL);
90+
0, NULL);
9191

9292
if (rc)
9393
dev_err(hdev->dev, "Failed to set LED %d, error %d\n", led, rc);
@@ -981,7 +981,7 @@ static ssize_t hl_clk_gate_read(struct file *f, char __user *buf,
981981
if (*ppos)
982982
return 0;
983983

984-
sprintf(tmp_buf, "%d\n", hdev->clock_gating);
984+
sprintf(tmp_buf, "0x%llx\n", hdev->clock_gating_mask);
985985
rc = simple_read_from_buffer(buf, strlen(tmp_buf) + 1, ppos, tmp_buf,
986986
strlen(tmp_buf) + 1);
987987

@@ -993,7 +993,7 @@ static ssize_t hl_clk_gate_write(struct file *f, const char __user *buf,
993993
{
994994
struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
995995
struct hl_device *hdev = entry->hdev;
996-
u32 value;
996+
u64 value;
997997
ssize_t rc;
998998

999999
if (atomic_read(&hdev->in_reset)) {
@@ -1002,19 +1002,12 @@ static ssize_t hl_clk_gate_write(struct file *f, const char __user *buf,
10021002
return 0;
10031003
}
10041004

1005-
rc = kstrtouint_from_user(buf, count, 10, &value);
1005+
rc = kstrtoull_from_user(buf, count, 16, &value);
10061006
if (rc)
10071007
return rc;
10081008

1009-
if (value) {
1010-
hdev->clock_gating = 1;
1011-
if (hdev->asic_funcs->enable_clock_gating)
1012-
hdev->asic_funcs->enable_clock_gating(hdev);
1013-
} else {
1014-
if (hdev->asic_funcs->disable_clock_gating)
1015-
hdev->asic_funcs->disable_clock_gating(hdev);
1016-
hdev->clock_gating = 0;
1017-
}
1009+
hdev->clock_gating_mask = value;
1010+
hdev->asic_funcs->set_clock_gating(hdev);
10181011

10191012
return count;
10201013
}

drivers/misc/habanalabs/device.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -608,7 +608,7 @@ int hl_device_set_debug_mode(struct hl_device *hdev, bool enable)
608608
hdev->in_debug = 0;
609609

610610
if (!hdev->hard_reset_pending)
611-
hdev->asic_funcs->enable_clock_gating(hdev);
611+
hdev->asic_funcs->set_clock_gating(hdev);
612612

613613
goto out;
614614
}

drivers/misc/habanalabs/firmware_if.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
6161
pkt.ctl = cpu_to_le32(opcode << ARMCP_PKT_CTL_OPCODE_SHIFT);
6262

6363
return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt,
64-
sizeof(pkt), HL_DEVICE_TIMEOUT_USEC, NULL);
64+
sizeof(pkt), 0, NULL);
6565
}
6666

6767
int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
@@ -144,7 +144,7 @@ int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
144144
pkt.value = cpu_to_le64(event_type);
145145

146146
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
147-
HL_DEVICE_TIMEOUT_USEC, &result);
147+
0, &result);
148148

149149
if (rc)
150150
dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type);
@@ -183,7 +183,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
183183
ARMCP_PKT_CTL_OPCODE_SHIFT);
184184

185185
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt,
186-
total_pkt_size, HL_DEVICE_TIMEOUT_USEC, &result);
186+
total_pkt_size, 0, &result);
187187

188188
if (rc)
189189
dev_err(hdev->dev, "failed to unmask IRQ array\n");
@@ -204,7 +204,7 @@ int hl_fw_test_cpu_queue(struct hl_device *hdev)
204204
test_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);
205205

206206
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &test_pkt,
207-
sizeof(test_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
207+
sizeof(test_pkt), 0, &result);
208208

209209
if (!rc) {
210210
if (result != ARMCP_PACKET_FENCE_VAL)
@@ -248,7 +248,7 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
248248
hb_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);
249249

250250
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt,
251-
sizeof(hb_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
251+
sizeof(hb_pkt), 0, &result);
252252

253253
if ((rc) || (result != ARMCP_PACKET_FENCE_VAL))
254254
rc = -EIO;

0 commit comments

Comments
 (0)