Skip to content

Commit 938a065

Browse files
amd-yahuilalexdeucher
authored and committed
drm/amdkfd: Provide SMI events watch
When the compute is malfunctioning or performance drops, the system admin will use the SMI (System Management Interface) tool to monitor/diagnose what went wrong. This patch provides an event watch interface for the user space to register devices and subscribe to events they are interested in. After registering, the user can use the anonymous file descriptor's poll function with a wait-time specified and wait for events to happen. Once an event happens, the user can use read() to retrieve information related to the event. VM fault event is done in this patch. v2: - remove UNREGISTER and add event ENABLE/DISABLE - correct kfifo usage - move event message API to kfd_ioctl.h v3: send the event msg in text rather than in binary v4: support multiple clients v5: move events enablement from ioctl to fd write v6: sparse fix Signed-off-by: Amber Lin <[email protected]> Reviewed-by: Felix Kuehling <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent 85e7151 commit 938a065

File tree

9 files changed

+293
-1
lines changed

9 files changed

+293
-1
lines changed

drivers/gpu/drm/amd/amdkfd/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \
5353
$(AMDKFD_PATH)/kfd_int_process_v9.o \
5454
$(AMDKFD_PATH)/kfd_dbgdev.o \
5555
$(AMDKFD_PATH)/kfd_dbgmgr.o \
56+
$(AMDKFD_PATH)/kfd_smi_events.o \
5657
$(AMDKFD_PATH)/kfd_crat.o
5758

5859
ifneq ($(CONFIG_AMD_IOMMU_V2),)

drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "kfd_events.h"
2525
#include "cik_int.h"
2626
#include "amdgpu_amdkfd.h"
27+
#include "kfd_smi_events.h"
2728

2829
static bool cik_event_interrupt_isr(struct kfd_dev *dev,
2930
const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
107108
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
108109
struct kfd_vm_fault_info info;
109110

111+
kfd_smi_event_update_vmfault(dev, pasid);
110112
kfd_process_vm_fault(dev->dqm, pasid);
111113

112114
memset(&info, 0, sizeof(info));

drivers/gpu/drm/amd/amdkfd/kfd_chardev.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
#include "kfd_device_queue_manager.h"
4040
#include "kfd_dbgmgr.h"
4141
#include "amdgpu_amdkfd.h"
42+
#include "kfd_smi_events.h"
4243

4344
static long kfd_ioctl(struct file *, unsigned int, unsigned long);
4445
static int kfd_open(struct inode *, struct file *);
@@ -1740,6 +1741,20 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
17401741
return r;
17411742
}
17421743

1744+
/* Handle requests for watching SMI events */
1745+
static int kfd_ioctl_smi_events(struct file *filep,
1746+
struct kfd_process *p, void *data)
1747+
{
1748+
struct kfd_ioctl_smi_events_args *args = data;
1749+
struct kfd_dev *dev;
1750+
1751+
dev = kfd_device_by_id(args->gpuid);
1752+
if (!dev)
1753+
return -EINVAL;
1754+
1755+
return kfd_smi_event_open(dev, &args->anon_fd);
1756+
}
1757+
17431758
#define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
17441759
[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
17451760
.cmd_drv = 0, .name = #ioctl}
@@ -1835,6 +1850,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
18351850

18361851
AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
18371852
kfd_ioctl_alloc_queue_gws, 0),
1853+
1854+
AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
1855+
kfd_ioctl_smi_events, 0),
18381856
};
18391857

18401858
#define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls)

drivers/gpu/drm/amd/amdkfd/kfd_device.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,6 +635,11 @@ static int kfd_gws_init(struct kfd_dev *kfd)
635635
return ret;
636636
}
637637

638+
/* Prepare the per-device SMI client list and the spinlock that guards it. */
static void kfd_smi_init(struct kfd_dev *dev)
{
	spin_lock_init(&dev->smi_lock);
	INIT_LIST_HEAD(&dev->smi_clients);
}
642+
638643
bool kgd2kfd_device_init(struct kfd_dev *kfd,
639644
struct drm_device *ddev,
640645
const struct kgd2kfd_shared_resources *gpu_resources)
@@ -749,6 +754,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
749754
goto kfd_topology_add_device_error;
750755
}
751756

757+
kfd_smi_init(kfd);
758+
752759
kfd->init_complete = true;
753760
dev_info(kfd_device, "added device %x:%x\n", kfd->pdev->vendor,
754761
kfd->pdev->device);

drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "kfd_events.h"
2525
#include "soc15_int.h"
2626
#include "kfd_device_queue_manager.h"
27+
#include "kfd_smi_events.h"
2728

2829
static bool event_interrupt_isr_v9(struct kfd_dev *dev,
2930
const uint32_t *ih_ring_entry,
@@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
117118
info.prot_read = ring_id & 0x10;
118119
info.prot_write = ring_id & 0x20;
119120

121+
kfd_smi_event_update_vmfault(dev, pasid);
120122
kfd_process_vm_fault(dev->dqm, pasid);
121123
kfd_signal_vm_fault_event(dev, pasid, &info);
122124
}

drivers/gpu/drm/amd/amdkfd/kfd_priv.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,10 @@ struct kfd_dev {
305305

306306
/* Global GWS resource shared between processes */
307307
void *gws;
308+
309+
/* Clients watching SMI events */
310+
struct list_head smi_clients;
311+
spinlock_t smi_lock;
308312
};
309313

310314
enum kfd_mempool {
Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
/*
2+
* Copyright 2020 Advanced Micro Devices, Inc.
3+
*
4+
* Permission is hereby granted, free of charge, to any person obtaining a
5+
* copy of this software and associated documentation files (the "Software"),
6+
* to deal in the Software without restriction, including without limitation
7+
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8+
* and/or sell copies of the Software, and to permit persons to whom the
9+
* Software is furnished to do so, subject to the following conditions:
10+
*
11+
* The above copyright notice and this permission notice shall be included in
12+
* all copies or substantial portions of the Software.
13+
*
14+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17+
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18+
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19+
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20+
* OTHER DEALINGS IN THE SOFTWARE.
21+
*/
22+
23+
#include <linux/poll.h>
24+
#include <linux/wait.h>
25+
#include <linux/anon_inodes.h>
26+
#include <uapi/linux/kfd_ioctl.h>
27+
#include "amdgpu_vm.h"
28+
#include "kfd_priv.h"
29+
#include "kfd_smi_events.h"
30+
31+
struct kfd_smi_client {
32+
struct list_head list;
33+
struct kfifo fifo;
34+
wait_queue_head_t wait_queue;
35+
/* events enabled */
36+
uint64_t events;
37+
struct kfd_dev *dev;
38+
spinlock_t lock;
39+
};
40+
41+
#define MAX_KFIFO_SIZE 1024
42+
43+
static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *);
44+
static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *);
45+
static ssize_t kfd_smi_ev_write(struct file *, const char __user *, size_t,
46+
loff_t *);
47+
static int kfd_smi_ev_release(struct inode *, struct file *);
48+
49+
static const char kfd_smi_name[] = "kfd_smi_ev";
50+
51+
static const struct file_operations kfd_smi_ev_fops = {
52+
.owner = THIS_MODULE,
53+
.poll = kfd_smi_ev_poll,
54+
.read = kfd_smi_ev_read,
55+
.write = kfd_smi_ev_write,
56+
.release = kfd_smi_ev_release
57+
};
58+
59+
static __poll_t kfd_smi_ev_poll(struct file *filep,
60+
struct poll_table_struct *wait)
61+
{
62+
struct kfd_smi_client *client = filep->private_data;
63+
__poll_t mask = 0;
64+
65+
poll_wait(filep, &client->wait_queue, wait);
66+
67+
spin_lock(&client->lock);
68+
if (!kfifo_is_empty(&client->fifo))
69+
mask = EPOLLIN | EPOLLRDNORM;
70+
spin_unlock(&client->lock);
71+
72+
return mask;
73+
}
74+
75+
/*
 * read() handler for an SMI event fd.
 *
 * Copies up to min(size, MAX_KFIFO_SIZE, bytes queued) bytes of event text
 * from the client's fifo to the user buffer.
 *
 * Returns the number of bytes copied, -EAGAIN when nothing could be taken
 * from the fifo, -ENOMEM when the bounce buffer cannot be allocated, or
 * -EFAULT when the copy to user space fails.
 */
static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
			       size_t size, loff_t *offset)
{
	struct kfd_smi_client *client = filep->private_data;
	unsigned char *buf;
	size_t to_copy;
	ssize_t ret;

	/* The fifo never holds more than MAX_KFIFO_SIZE bytes */
	if (size > MAX_KFIFO_SIZE)
		size = MAX_KFIFO_SIZE;

	/*
	 * Bounce through a heap buffer: a MAX_KFIFO_SIZE array on the kernel
	 * stack is too large a frame for a file operation.
	 */
	buf = kmalloc(size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	/* kfifo_to_user can sleep so we can't use spinlock protection around
	 * it. Instead, we kfifo out as spinlocked then copy them to the user.
	 */
	spin_lock(&client->lock);
	to_copy = kfifo_len(&client->fifo);
	if (!to_copy) {
		spin_unlock(&client->lock);
		ret = -EAGAIN;
		goto out_free;
	}
	if (to_copy > size)
		to_copy = size;
	to_copy = kfifo_out(&client->fifo, buf, to_copy);
	spin_unlock(&client->lock);
	if (!to_copy) {
		ret = -EAGAIN;
		goto out_free;
	}

	if (copy_to_user(user, buf, to_copy)) {
		ret = -EFAULT;
		goto out_free;
	}

	ret = to_copy;

out_free:
	kfree(buf);
	return ret;
}
106+
107+
/*
 * write() handler for an SMI event fd.
 *
 * The first sizeof(uint64_t) bytes of the user buffer are taken as the new
 * bitmask of enabled events for this client.
 *
 * Returns sizeof(uint64_t) on success, or -EFAULT when the buffer is shorter
 * than that, fails access_ok(), or cannot be copied from.
 */
static ssize_t kfd_smi_ev_write(struct file *filep, const char __user *user,
				size_t size, loff_t *offset)
{
	struct kfd_smi_client *smi = filep->private_data;
	uint64_t mask;

	if (size < sizeof(mask))
		return -EFAULT;
	if (!access_ok(user, size))
		return -EFAULT;
	if (copy_from_user(&mask, user, sizeof(mask)))
		return -EFAULT;

	/* Paired with READ_ONCE() in the event producer */
	WRITE_ONCE(smi->events, mask);

	return sizeof(mask);
}
122+
123+
static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
124+
{
125+
struct kfd_smi_client *client = filep->private_data;
126+
struct kfd_dev *dev = client->dev;
127+
128+
spin_lock(&dev->smi_lock);
129+
list_del_rcu(&client->list);
130+
spin_unlock(&dev->smi_lock);
131+
132+
synchronize_rcu();
133+
kfifo_free(&client->fifo);
134+
kfree(client);
135+
136+
return 0;
137+
}
138+
139+
/*
 * Queue a VM fault event for every client of @dev that has
 * KFD_SMI_EVENT_VMFAULT enabled.
 *
 * @dev:   device on which the fault was reported
 * @pasid: PASID used to look up the faulting task
 *
 * The message is the text "<event> <pid>:<task name>\n" with event and pid
 * in hex. Nothing is queued when the device has no clients or when the task
 * lookup yields pid 0. A client whose fifo lacks room for the whole message
 * does not receive it.
 */
void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
	struct amdgpu_task_info task_info;
	/* VmFault msg = (hex)uint32_pid(8) + :(1) + task name(16) = 25 */
	/* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43
	 */
	char fifo_in[43];
	struct kfd_smi_client *client;
	unsigned int len;

	if (list_empty(&dev->smi_clients))
		return;

	memset(&task_info, 0, sizeof(task_info));
	amdgpu_vm_get_task_info(adev, pasid, &task_info);
	/* Report VM faults from user applications, not retry from kernel */
	if (!task_info.pid)
		return;

	/*
	 * scnprintf() returns the number of bytes actually stored, so len can
	 * never exceed the buffer handed to kfifo_in() below. snprintf() would
	 * return the untruncated length instead.
	 */
	len = scnprintf(fifo_in, sizeof(fifo_in), "%x %x:%s\n",
			KFD_SMI_EVENT_VMFAULT, task_info.pid,
			task_info.task_name);

	rcu_read_lock();

	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
		if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT))
			continue;
		spin_lock(&client->lock);
		if (kfifo_avail(&client->fifo) >= len) {
			kfifo_in(&client->fifo, fifo_in, len);
			wake_up_all(&client->wait_queue);
		} else {
			pr_debug("smi_event(vmfault): no space left\n");
		}
		spin_unlock(&client->lock);
	}

	rcu_read_unlock();
}
179+
180+
/*
 * Create an SMI event client for @dev and return an anonymous fd for it.
 *
 * @dev: device to watch
 * @fd:  receives the new file descriptor on success
 *
 * The client starts with no events enabled. Returns 0 on success, -ENOMEM
 * when the client cannot be allocated, or the error from kfifo_alloc() or
 * anon_inode_getfd().
 */
int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
{
	struct kfd_smi_client *client;
	int ret;

	client = kzalloc(sizeof(struct kfd_smi_client), GFP_KERNEL);
	if (!client)
		return -ENOMEM;
	INIT_LIST_HEAD(&client->list);

	ret = kfifo_alloc(&client->fifo, MAX_KFIFO_SIZE, GFP_KERNEL);
	if (ret)
		goto err_free_client;

	/*
	 * Finish initialising the client and link it into the device list
	 * BEFORE the fd exists. Once anon_inode_getfd() returns, another
	 * thread of the process can already poll/read/write/close the fd, and
	 * those handlers use the lock, the wait queue, client->dev and the
	 * list linkage.
	 */
	init_waitqueue_head(&client->wait_queue);
	spin_lock_init(&client->lock);
	client->events = 0;
	client->dev = dev;

	spin_lock(&dev->smi_lock);
	list_add_rcu(&client->list, &dev->smi_clients);
	spin_unlock(&dev->smi_lock);

	ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops, (void *)client,
			       O_RDWR);
	if (ret < 0)
		goto err_unlink;
	*fd = ret;

	return 0;

err_unlink:
	spin_lock(&dev->smi_lock);
	list_del_rcu(&client->list);
	spin_unlock(&dev->smi_lock);

	/* An event producer may still be walking the list under RCU */
	synchronize_rcu();

	kfifo_free(&client->fifo);
err_free_client:
	kfree(client);
	return ret;
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
/*
2+
* Copyright 2020 Advanced Micro Devices, Inc.
3+
*
4+
* Permission is hereby granted, free of charge, to any person obtaining a
5+
* copy of this software and associated documentation files (the "Software"),
6+
* to deal in the Software without restriction, including without limitation
7+
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8+
* and/or sell copies of the Software, and to permit persons to whom the
9+
* Software is furnished to do so, subject to the following conditions:
10+
*
11+
* The above copyright notice and this permission notice shall be included in
12+
* all copies or substantial portions of the Software.
13+
*
14+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17+
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18+
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19+
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20+
* OTHER DEALINGS IN THE SOFTWARE.
21+
*/
22+
23+
#ifndef KFD_SMI_EVENTS_H_INCLUDED
24+
#define KFD_SMI_EVENTS_H_INCLUDED
25+
26+
int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
27+
void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
28+
29+
#endif

include/uapi/linux/kfd_ioctl.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,17 @@ struct kfd_ioctl_import_dmabuf_args {
442442
__u32 dmabuf_fd; /* to KFD */
443443
};
444444

445+
/*
446+
* KFD SMI(System Management Interface) events
447+
*/
448+
/* Event type (defined by bitmask) */
449+
#define KFD_SMI_EVENT_VMFAULT 0x0000000000000001
450+
451+
struct kfd_ioctl_smi_events_args {
452+
__u32 gpuid; /* to KFD */
453+
__u32 anon_fd; /* from KFD */
454+
};
455+
445456
/* Register offset inside the remapped mmio page
446457
*/
447458
enum kfd_mmio_remap {
@@ -546,7 +557,10 @@ enum kfd_mmio_remap {
546557
#define AMDKFD_IOC_ALLOC_QUEUE_GWS \
547558
AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
548559

560+
#define AMDKFD_IOC_SMI_EVENTS \
561+
AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
562+
549563
#define AMDKFD_COMMAND_START 0x01
550-
#define AMDKFD_COMMAND_END 0x1F
564+
#define AMDKFD_COMMAND_END 0x20
551565

552566
#endif

0 commit comments

Comments
 (0)