Skip to content

Commit e46738a

Browse files
jokim-amd and alexdeucher
authored and committed
drm/amdkfd: sever xgmi io link if host driver has disable sharing
Host drivers can create partial hives per guest by disabling xgmi sharing between certain peers in the main hive. Typically, these partial hives are fully connected per guest session. In the event that the host makes a mistake by adding a non-shared node to a guest session, have the KFD reflect sharing disabled by severing the IO link. Signed-off-by: Jonathan Kim <[email protected]> Tested-by: James Yao <[email protected]> Reviewed-by: Harish Kasiviswanathan <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent 4618666 commit e46738a

File tree

3 files changed

+22
-0
lines changed

3 files changed

+22
-0
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -801,6 +801,23 @@ int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
801801
return -EINVAL;
802802
}
803803

804+
bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev,
805+
struct amdgpu_device *peer_adev)
806+
{
807+
struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
808+
int i;
809+
810+
/* Sharing should always be enabled for non-SRIOV. */
811+
if (!amdgpu_sriov_vf(adev))
812+
return true;
813+
814+
for (i = 0 ; i < top->num_nodes; ++i)
815+
if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
816+
return !!top->nodes[i].is_sharing_enabled;
817+
818+
return false;
819+
}
820+
804821
/*
805822
* Devices that support extended data require the entire hive to initialize with
806823
* the shared memory buffer flag set.

drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
6666
struct amdgpu_device *peer_adev);
6767
int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
6868
struct amdgpu_device *peer_adev);
69+
bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev,
70+
struct amdgpu_device *peer_adev);
6971
uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
7072
uint64_t addr);
7173
static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,

drivers/gpu/drm/amd/amdkfd/kfd_crat.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "kfd_topology.h"
2929
#include "amdgpu.h"
3030
#include "amdgpu_amdkfd.h"
31+
#include "amdgpu_xgmi.h"
3132

3233
/* GPU Processor ID base for dGPUs for which VCRAT needs to be created.
3334
* GPU processor ID are expressed with Bit[31]=1.
@@ -2329,6 +2330,8 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
23292330
continue;
23302331
if (peer_dev->gpu->kfd->hive_id != kdev->kfd->hive_id)
23312332
continue;
2333+
if (!amdgpu_xgmi_get_is_sharing_enabled(kdev->adev, peer_dev->gpu->adev))
2334+
continue;
23322335
sub_type_hdr = (typeof(sub_type_hdr))(
23332336
(char *)sub_type_hdr +
23342337
sizeof(struct crat_subtype_iolink));

0 commit comments

Comments
 (0)