Skip to content

Commit 53ee324

Browse files
kevinsala authored and ronlieb committed
[Offload] Add device info for shared memory (llvm#167817)
1 parent 7d7dd41 commit 53ee324

File tree

4 files changed

+28
-8
lines changed

4 files changed

+28
-8
lines changed

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3453,6 +3453,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
34533453
// TODO: put them back in constructor
34543454
// readEnvVars();
34553455

3456+
// Retrieve the size of the group memory.
3457+
for (const auto *Pool : AllMemoryPools) {
3458+
if (Pool->isGroup()) {
3459+
if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE,
3460+
MaxBlockSharedMemSize))
3461+
return Err;
3462+
break;
3463+
}
3464+
}
3465+
34563466
return Plugin::success();
34573467
}
34583468

@@ -4327,6 +4337,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
43274337
if (Status == HSA_STATUS_SUCCESS)
43284338
Info.add("Cacheline Size", TmpUInt);
43294339

4340+
Info.add("Max Shared Memory per Work Group", MaxBlockSharedMemSize, "bytes",
4341+
DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE);
4342+
43304343
Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
43314344
if (Status == HSA_STATUS_SUCCESS)
43324345
Info.add("Max Clock Freq", TmpUInt, "MHz",

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -869,6 +869,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
869869
/// Get the unique identifier of the device.
870870
const char *getDeviceUid() const { return DeviceUid.c_str(); }
871871

872+
/// Get the total shared memory per block (in bytes) that can be used in any
873+
/// kernel.
874+
size_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; }
875+
872876
/// Set the context of the device if needed, before calling device-specific
873877
/// functions. Plugins may implement this function as a no-op if not needed.
874878
virtual Error setContext() = 0;
@@ -1461,6 +1465,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
14611465
/// Variable to enable kernel duration tracing.
14621466
BoolEnvar OMPX_KernelDurationTracing;
14631467

1468+
/// The total per-block native shared memory that a kernel may use.
1469+
size_t MaxBlockSharedMemSize = 0;
14641470
private:
14651471
/// Return the kernel environment object for kernel \p Name.
14661472
Expected<KernelEnvironmentTy>
@@ -1579,6 +1585,7 @@ struct KernelRunRecordTy {
15791585
std::unordered_map<std::string, TuningMetadataTy> TuningData;
15801586
/// Internal representation for OMPT device (initialize & finalize)
15811587
std::atomic<bool> OmptInitialized;
1588+
15821589
};
15831590

15841591
/// Class implementing common functionalities of offload plugins. Each plugin

offload/plugins-nextgen/cuda/src/rtl.cpp

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -382,6 +382,12 @@ struct CUDADeviceTy : public GenericDeviceTy {
382382
return Err;
383383
HardwareParallelism = NumMuliprocessors * (MaxThreadsPerSM / WarpSize);
384384

385+
uint32_t MaxSharedMem;
386+
if (auto Err = getDeviceAttr(
387+
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, MaxSharedMem))
388+
return Err;
389+
MaxBlockSharedMemSize = MaxSharedMem;
390+
385391
return Plugin::success();
386392
}
387393

@@ -1092,10 +1098,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
10921098
if (Res == CUDA_SUCCESS)
10931099
Info.add("Total Constant Memory", TmpInt, "bytes");
10941100

1095-
Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
1096-
TmpInt);
1097-
if (Res == CUDA_SUCCESS)
1098-
Info.add("Max Shared Memory per Block", TmpInt, "bytes");
1101+
Info.add("Max Shared Memory per Block", MaxBlockSharedMemSize, "bytes",
1102+
DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE);
10991103

11001104
Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt);
11011105
if (Res == CUDA_SUCCESS)

revert_patches.txt

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -5,10 +5,6 @@ d57230c7 [AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions (#100485)
55
breaks build of ROCmValidationSuite
66
[C2y] Support WG14 N3457, the __COUNTER__ macro (#162662)
77
---
8-
needs more integration offload.
9-
[Offload] Add device info for shared memory (#167817)
10-
---
118
breaks conformance/2.0/relationals/test_relationals relational_select_signed
129
"DAG: Allow select ptr combine for non-0 address spaces (#167909)"
1310
---
14-

0 commit comments

Comments
 (0)