diff --git a/offload/include/OpenMP/Mapping.h b/offload/include/OpenMP/Mapping.h index b9f5c16582931..93c1e56905ae4 100644 --- a/offload/include/OpenMP/Mapping.h +++ b/offload/include/OpenMP/Mapping.h @@ -417,12 +417,42 @@ struct MapperComponentsTy { typedef void (*MapperFuncPtrTy)(void *, void *, void *, int64_t, int64_t, void *); +/// Structure to store information about a single ATTACH map entry. +struct AttachMapInfo { + void *PointerBase; + void *PointeeBegin; + int64_t PointerSize; + int64_t MapType; + map_var_info_t Pointername; + + AttachMapInfo(void *PointerBase, void *PointeeBegin, int64_t Size, + int64_t Type, map_var_info_t Name) + : PointerBase(PointerBase), PointeeBegin(PointeeBegin), PointerSize(Size), + MapType(Type), Pointername(Name) {} +}; + +/// Structure to track ATTACH entries and new allocations across recursive calls +/// (for handling mappers) to targetDataBegin for a given construct. +struct AttachInfoTy { + /// ATTACH map entries for deferred processing. + llvm::SmallVector AttachEntries; + + /// Key: host pointer, Value: allocation size. + llvm::DenseMap NewAllocations; + + AttachInfoTy() = default; + + // Delete copy constructor and copy assignment operator to prevent copying + AttachInfoTy(const AttachInfoTy &) = delete; + AttachInfoTy &operator=(const AttachInfoTy &) = delete; +}; + // Function pointer type for targetData* functions (targetDataBegin, // targetDataEnd and targetDataUpdate). typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **, void **, int64_t *, int64_t *, map_var_info_t *, void **, AsyncInfoTy &, - bool); + AttachInfoTy *, bool); void dumpTargetPointerMappings(const ident_t *Loc, DeviceTy &Device, bool toStdOut = false); @@ -431,20 +461,26 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, + AttachInfoTy *AttachInfo = nullptr, bool FromMapper = false); int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, - bool FromMapper = false); + AttachInfoTy *AttachInfo = nullptr, bool FromMapper = false); int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, + AttachInfoTy *AttachInfo = nullptr, bool FromMapper = false); +// Process deferred ATTACH map entries collected during targetDataBegin. +int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo, + AsyncInfoTy &AsyncInfo); + struct MappingInfoTy { MappingInfoTy(DeviceTy &Device) : Device(Device) {} diff --git a/offload/include/device.h b/offload/include/device.h index f4b10abbaa3fd..226a9c8902354 100644 --- a/offload/include/device.h +++ b/offload/include/device.h @@ -98,6 +98,11 @@ struct DeviceTy { int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, int64_t Size, AsyncInfoTy &AsyncInfo); + // Insert a data fence between previous data operations and the following + // operations if necessary for the device. + int32_t dataFence(AsyncInfoTy &AsyncInfo); + + /// Notify the plugin about a new mapping starting at the host address /// \p HstPtr and \p Size bytes. int32_t notifyDataMapped(void *HstPtr, int64_t Size); diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h index 6971780c7bdb5..9e4bfd2f9cfbe 100644 --- a/offload/include/omptarget.h +++ b/offload/include/omptarget.h @@ -80,6 +80,9 @@ enum tgt_map_type { // the structured region // This is an OpenMP extension for the sake of OpenACC support. OMP_TGT_MAPTYPE_OMPX_HOLD = 0x2000, + // Attach pointer and pointee, after processing all other maps. + // Applicable to map-entering directives. Does not change ref-count. + OMP_TGT_MAPTYPE_ATTACH = 0x4000, // descriptor for non-contiguous target-update OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000, // member of struct, member given by [16 MSBs] - 1 diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp index f88e30ae9e76b..6585286bf4285 100644 --- a/offload/libomptarget/device.cpp +++ b/offload/libomptarget/device.cpp @@ -191,6 +191,10 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, DstPtr, Size, AsyncInfo); } +int32_t DeviceTy::dataFence(AsyncInfoTy &AsyncInfo) { + return RTL->data_fence(RTLDeviceID, AsyncInfo); +} + int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) { DP("Notifying about new mapping: HstPtr=" DPxMOD ", Size=%" PRId64 "\n", DPxPTR(HstPtr), Size); diff --git a/offload/libomptarget/interface.cpp b/offload/libomptarget/interface.cpp index ea354400f2e99..1a65262f9dcda 100644 --- a/offload/libomptarget/interface.cpp +++ b/offload/libomptarget/interface.cpp @@ -165,12 +165,27 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, OMPT_GET_RETURN_ADDRESS);) int Rc = OFFLOAD_SUCCESS; + + // Only allocate AttachInfo for targetDataBegin + AttachInfoTy *AttachInfo = nullptr; + if (TargetDataFunction == targetDataBegin) + AttachInfo = new AttachInfoTy(); + Rc = TargetDataFunction(Loc, *DeviceOrErr, ArgNum, ArgsBase, Args, ArgSizes, - ArgTypes, ArgNames, ArgMappers, AsyncInfo, - false /*FromMapper=*/); + ArgTypes, ArgNames, ArgMappers, AsyncInfo, AttachInfo, + /*FromMapper=*/false); - if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + if (Rc == OFFLOAD_SUCCESS) { + // Process deferred ATTACH entries BEFORE synchronization + if (AttachInfo && !AttachInfo->AttachEntries.empty()) + Rc = processAttachEntries(*DeviceOrErr, *AttachInfo, AsyncInfo); + + if (Rc == OFFLOAD_SUCCESS) + Rc = AsyncInfo.synchronize(); + } + + if (AttachInfo) + delete AttachInfo; handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp index 5b25d955dd320..b5bbc5a409e85 100644 --- a/offload/libomptarget/omptarget.cpp +++ b/offload/libomptarget/omptarget.cpp @@ -293,7 +293,8 @@ void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) { int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg, int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames, void *ArgMapper, AsyncInfoTy &AsyncInfo, - TargetDataFuncPtrTy TargetDataFunction) { + TargetDataFuncPtrTy TargetDataFunction, + AttachInfoTy *AttachInfo = nullptr) { DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper)); // The mapper function fills up Components. @@ -324,17 +325,195 @@ int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg, MapperArgsBase.data(), MapperArgs.data(), MapperArgSizes.data(), MapperArgTypes.data(), MapperArgNames.data(), /*arg_mappers*/ nullptr, - AsyncInfo, /*FromMapper=*/true); + AsyncInfo, AttachInfo, /*FromMapper=*/true); return Rc; } +/// Utility function to perform a pointer attachment operation. +/// +/// For something like: +/// \code +/// int *p; +/// ... +/// #pragma omp target enter data map(to:p[10:10]) +/// \endcode +/// +/// for which the attachment operation gets represented using: +/// \code +/// &p, &p[10], sizeof(p), ATTACH +/// \endcode +/// +/// (Hst|Tgt)PtrAddr represents &p +/// (Hst|Tgt)PteeBase represents &p[0] +/// (Hst|Tgt)PteeBegin represents &p[10] +/// +/// This function first computes the expected TgtPteeBase using: +/// TgtPteeBase = TgtPteeBegin - (HstPteeBegin - HstPteeBase) +/// +/// and then attaches TgtPteeBase to TgtPtrAddr. +/// +/// \p HstPtrSize represents the size of the pointer p. For C/C++, this +/// should be same as "sizeof(void*)" (say 8). +/// +/// However, for Fortran, pointers/allocatables, which are also eligible for +/// "pointer-attachment", may be implemented using descriptors that contain the +/// address of the pointee in the first 8 bytes, but also contain other +/// information such as lower-bound/upper-bound etc in their subsequent fields. +/// +/// For example, for the following: +/// \code +/// integer, allocatable :: x(:) +/// integer, pointer :: p(:) +/// ... +/// p => x(10: 19) +/// ... +/// !$omp target enter data map(to:p(:)) +/// \endcode +/// +/// The map should trigger a pointer-attachment (assuming the pointer-attachment +/// conditions as noted on processAttachEntries are met) between the descriptor +/// for p, and its pointee data. +/// +/// Since only the first 8 bytes of the descriptor contain the address of the +/// pointee, an attachment operation on device descriptors involves: +/// * Setting the first 8 bytes of the device descriptor to point the device +/// address of the pointee. +/// * Copying the remaining information about bounds/offset etc. from the host +/// descriptor to the device descriptor. +/// +/// The function also handles pointer-attachment portion of PTR_AND_OBJ maps, +/// like: +/// \code +/// &p, &p[10], 10 * sizeof(p[10]), PTR_AND_OBJ +/// \endcoe +/// by using "sizeof(void*)" as \p HstPtrSize. +static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo, + void **HstPtrAddr, void *HstPteeBase, + void *HstPteeBegin, void **TgtPtrAddr, + void *TgtPteeBegin, int64_t HstPtrSize, + TargetPointerResultTy &PtrTPR) { + assert(PtrTPR.getEntry() && + "Need a valid pointer entry to perform pointer-attachment"); + + int64_t VoidPtrSize = sizeof(void *); + assert(HstPtrSize >= VoidPtrSize && "PointerSize is too small"); + + uint64_t Delta = (uint64_t)HstPteeBegin - (uint64_t)HstPteeBase; + void *TgtPteeBase = (void *)((uint64_t)TgtPteeBegin - Delta); + + // Add shadow pointer tracking + // TODO: Support shadow-tracking of larger than VoidPtrSize pointers, + // to support restoration of Fortran descriptors. Currently, this check + // would return false, even if the host Fortran descriptor was, and we + // should have done an update of the device descriptor. e.g. + // + // !$omp target enter data map(x(1:100)) ! (1) + // p => x(10: 19) + // !$omp target enter data map(p, p(:)) ! (2) + // p => x(5: 9) + // !$omp target enter data map(attach(always): p(:)) ! (3) + // + // While PtrAddr(&desc_p) and PteeBase(&p(1)) are same for (2) and (3), the + // pointer attachment for (3) needs to update the bounds information + // in the descriptor of p on device. + if (!PtrTPR.getEntry()->addShadowPointer( + ShadowPtrInfoTy{HstPtrAddr, HstPteeBase, TgtPtrAddr, TgtPteeBase})) + return OFFLOAD_SUCCESS; + + DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(TgtPtrAddr), + DPxPTR(TgtPteeBase)); + + // Lambda to handle submitData result and perform final steps. + auto HandleSubmitResult = [&](int SubmitResult) -> int { + if (SubmitResult != OFFLOAD_SUCCESS) { + REPORT("Failed to update pointer on device.\n"); + return OFFLOAD_FAIL; + } + + if (PtrTPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) != + OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return OFFLOAD_SUCCESS; + }; + + bool IsPtrAFortranDescriptor = HstPtrSize > VoidPtrSize; + if (!IsPtrAFortranDescriptor) { + // For "regular" pointers, we can use the VoidPtrLocation from AsyncInfo as + // the buffer space for the submission. + void *&BufferElement = AsyncInfo.getVoidPtrLocation(); + BufferElement = TgtPteeBase; + + // Submit the updated pointer value to device + return HandleSubmitResult(Device.submitData( + TgtPtrAddr, &BufferElement, VoidPtrSize, AsyncInfo, PtrTPR.getEntry())); + } + + // For larger "pointers" (like Fortran's descriptors), we create a dynamic + // buffer, which will be eventually destroyed by AsyncInfo's post-processing + // callback. + char *DataBuffer = new char[HstPtrSize]; + + // For such descriptors, to the first VoidPtrSize bytes, we store the + // pointee's device address. + std::memcpy(DataBuffer, &TgtPteeBase, sizeof(void *)); + + // And to the remaining bytes, we copy the remaining contents of the host + // descriptor after the initial VoidPtrSize bytes. + uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize; + void *HstDescriptorFieldsAddr = (char *)HstPtrAddr + VoidPtrSize; + std::memcpy(DataBuffer + VoidPtrSize, HstDescriptorFieldsAddr, + HstDescriptorFieldsSize); + + DP("Updating %" PRId64 " bytes of descriptor (" DPxMOD ") (pointer + %" PRId64 + " additional bytes from host descriptor " DPxMOD ")\n", + HstPtrSize, DPxPTR(TgtPtrAddr), HstDescriptorFieldsSize, + DPxPTR(HstDescriptorFieldsAddr)); + + // Submit the entire buffer to device + // FIXME: When handling ATTACH map-type, pointer attachment needs to happen + // after the other mapping operations are done, to avoid possibility of + // pending transfers clobbering the attachment, for example: + // + // int *p = ...; + // int **pp = &p; + // map(to: pp[0], p[0]) + // + // Which would be represented by: + // &pp[0], &pp[0], sizeof(pp[0]), TO (1) + // &p[0], &p[0], sizeof(p[0]), TO (2) + // + // &pp, &pp[0], sizeof(pp), ATTACH (3) + // &p, &p[0], sizeof(p), ATTACH (4) + // + // (4) and (1) are both trying to modify the device memory corresponding to + // &p. We need to ensure that (4) happens last. + // + // One possible solution to this could be to insert a "device barrier" before + // the first ATTACH submitData call, so that every subsequent submitData waits + // for any prior operations to finish. Like: + // Device.submitData(..., /*InOrder=*/IsFirstAttachEntry) + // Where the boolean InOrder being true means that this submission should + // wait for prior memory submissions to finish. + int SubmitResult = Device.submitData(TgtPtrAddr, DataBuffer, HstPtrSize, + AsyncInfo, PtrTPR.getEntry()); + + AsyncInfo.addPostProcessingFunction([DataBuffer]() -> int { + delete[] DataBuffer; + return OFFLOAD_SUCCESS; + }); + return HandleSubmitResult(SubmitResult); +} + /// Internal function to do the mapping and transfer the data to the device int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, - bool FromMapper) { + AttachInfoTy *AttachInfo, bool FromMapper) { + assert(AttachInfo && "AttachInfo must be available for targetDataBegin for " + "handling ATTACH map-types."); // process each input. for (int32_t I = 0; I < ArgNum; ++I) { // Ignore private variables and arrays - there is no mapping for them. @@ -352,7 +531,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I]; int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I], ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo, - targetDataBegin); + targetDataBegin, AttachInfo); if (Rc != OFFLOAD_SUCCESS) { REPORT("Call to targetDataBegin via targetDataMapper for custom mapper" @@ -369,6 +548,18 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, int64_t DataSize = ArgSizes[I]; map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I]; + // ATTACH map-types are supposed to be handled after all mapping for the + // construct is done. Defer their processing. + if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH) { + AttachInfo->AttachEntries.emplace_back( + /*PointerBase=*/HstPtrBase, /*PointeeBegin=*/HstPtrBegin, + /*PointerSize=*/DataSize, /*MapType=*/ArgTypes[I], + /*PointeeName=*/HstPtrName); + + DP("Deferring ATTACH map-type processing for argument %d\n", I); + continue; + } + // Adjust for proper alignment if this is a combined entry (for structs). // Look at the next argument - if that is MEMBER_OF this one, then this one // is a combined entry. @@ -434,6 +625,11 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, : "device failure or illegal mapping"); return OFFLOAD_FAIL; } + + // Track new allocation, for eventual use in attachment decision-making. + if (PointerTpr.Flags.IsNewEntry && !IsHostPtr) + AttachInfo->NewAllocations[HstPtrBase] = sizeof(void *); + DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new" "\n", sizeof(void *), DPxPTR(PointerTgtPtrBegin), @@ -464,6 +660,11 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, : "device failure or illegal mapping"); return OFFLOAD_FAIL; } + + // Track new allocation, for eventual use in attachment decision-making. + if (TPR.Flags.IsNewEntry && !IsHostPtr && TgtPtrBegin) + AttachInfo->NewAllocations[HstPtrBegin] = DataSize; + DP("There are %" PRId64 " bytes allocated at target address " DPxMOD " - is%s new\n", DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not")); @@ -476,30 +677,12 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, } if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) { - - uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; - void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); - - if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfoTy{ - (void **)PointerHstPtrBegin, HstPtrBase, - (void **)PointerTgtPtrBegin, ExpectedTgtPtrBase})) { - DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", - DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); - - void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation(); - TgtPtrBase = ExpectedTgtPtrBase; - - int Ret = - Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *), - AsyncInfo, PointerTpr.getEntry()); - if (Ret != OFFLOAD_SUCCESS) { - REPORT("Copying data to device failed.\n"); - return OFFLOAD_FAIL; - } - if (PointerTpr.getEntry()->addEventIfNecessary(Device, AsyncInfo) != - OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - } + int Ret = performPointerAttachment( + Device, AsyncInfo, (void **)PointerHstPtrBegin, HstPtrBase, + HstPtrBegin, (void **)PointerTgtPtrBegin, TgtPtrBegin, sizeof(void *), + PointerTpr); + if (Ret != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; } // Check if variable can be used on the device: @@ -515,6 +698,146 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, return OFFLOAD_SUCCESS; } +/// Process deferred ATTACH map entries collected during targetDataBegin. +/// +/// From OpenMP's perspective, when mapping something that has a base pointer, +/// such as: +/// \code +/// int *p; +/// #pragma omp enter target data map(to: p[10:20]) +/// \endcode +/// +/// a pointer-attachment between p and &p[10] should occur if both p and +/// p[10] are present on the device after doing all allocations for all maps +/// on the construct, and one of the following is true: +/// +/// * The pointer p was newly allocated while handling the construct +/// * The pointee p[10:20] was newly allocated while handling the construct +/// * attach(always) map-type modifier was specified (OpenMP 6.1) +/// +/// That's why we collect all attach entries and new memory allocations during +/// targetDataBegin, and use that information to make the decision of whether +/// to perform a pointer-attachment or not here, after maps have been handled. +int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo, + AsyncInfoTy &AsyncInfo) { + // Report all tracked allocations from both main loop and ATTACH processing + if (!AttachInfo.NewAllocations.empty()) { + DP("Tracked %u total new allocations:\n", + (unsigned)AttachInfo.NewAllocations.size()); + for (const auto &Alloc : AttachInfo.NewAllocations) { + DP(" Host ptr: " DPxMOD ", Size: %" PRId64 " bytes\n", + DPxPTR(Alloc.first), Alloc.second); + } + } + + if (AttachInfo.AttachEntries.empty()) + return OFFLOAD_SUCCESS; + + DP("Processing %zu deferred ATTACH map entries\n", + AttachInfo.AttachEntries.size()); + + for (size_t EntryIdx = 0; EntryIdx < AttachInfo.AttachEntries.size(); + ++EntryIdx) { + const auto &AttachEntry = AttachInfo.AttachEntries[EntryIdx]; + + void **HstPtr = (void **)AttachEntry.PointerBase; + + void *HstPteeBase = *HstPtr; + void *HstPteeBegin = AttachEntry.PointeeBegin; + + int64_t PtrSize = AttachEntry.PointerSize; + int64_t MapType = AttachEntry.MapType; + + DP("Processing ATTACH entry %zu: HstPtr=" DPxMOD ", HstPteeBegin=" DPxMOD + ", Size=%" PRId64 ", Type=0x%" PRIx64 "\n", + EntryIdx, DPxPTR(HstPtr), DPxPTR(HstPteeBegin), PtrSize, MapType); + + const bool IsAttachAlways = MapType & OMP_TGT_MAPTYPE_ALWAYS; + + // Lambda to check if a pointer was newly allocated + auto WasNewlyAllocated = [&](void *Ptr, const char *PtrName) { + bool IsNewlyAllocated = + llvm::any_of(AttachInfo.NewAllocations, [&](const auto &Alloc) { + void *AllocPtr = Alloc.first; + int64_t AllocSize = Alloc.second; + return Ptr >= AllocPtr && + Ptr < (void *)((char *)AllocPtr + AllocSize); + }); + DP("ATTACH entry %zu: %s pointer " DPxMOD " was newly allocated: %s\n", + EntryIdx, PtrName, DPxPTR(Ptr), IsNewlyAllocated ? "yes" : "no"); + return IsNewlyAllocated; + }; + + // Only process ATTACH if base/begin was newly allocated OR ALWAYS flag is + // set + if (!IsAttachAlways && !WasNewlyAllocated(HstPtr, "pointer") && + !WasNewlyAllocated(HstPteeBegin, "pointee")) { + DP("Skipping ATTACH entry %zu: neither pointer nor pointee was newly " + "allocated and no ALWAYS flag\n", + EntryIdx); + continue; + } + + DP("Processing ATTACH entry %zu: Always=%s\n", EntryIdx, + IsAttachAlways ? "yes" : "no"); + + // Lambda to perform target pointer lookup and validation + auto LookupTargetPointer = + [&](void *Ptr, int64_t Size, + const char *PtrType) -> std::optional { + // ATTACH map-type does not change ref-count, or do any allocation + // We just need to do a lookup for the pointer/pointee. + TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin( + Ptr, Size, /*UpdateRefCount=*/false, + /*UseHoldRefCount=*/false, /*MustContain=*/true); + + DP("ATTACH entry %zu: %s lookup - HstPtr=" DPxMOD ", TgtPtr=" DPxMOD + ", IsPresent=%s, IsHostPtr=%s\n", + EntryIdx, PtrType, DPxPTR(Ptr), DPxPTR(TPR.TargetPointer), + TPR.isPresent() ? "yes" : "no", + TPR.Flags.IsHostPointer ? "yes" : "no"); + + if (!TPR.isPresent()) { + DP("Skipping ATTACH entry %zu: %s not present on device\n", EntryIdx, + PtrType); + return std::nullopt; + } + if (TPR.Flags.IsHostPointer) { + DP("Skipping ATTACH entry %zu: device version of the %s is a host " + "pointer.\n", + EntryIdx, PtrType); + return std::nullopt; + } + + return TPR; + }; + + // Get device version of the pointer (e.g., &p) + auto PtrTPROpt = LookupTargetPointer(HstPtr, PtrSize, "pointer"); + if (!PtrTPROpt) + continue; + TargetPointerResultTy &PtrTPR = *PtrTPROpt; + void **TgtPtrBase = (void **)PtrTPR.TargetPointer; + + // Get device version of the pointee (e.g., &p[10]) + auto PteeTPROpt = LookupTargetPointer(HstPteeBegin, 0, "pointee"); + if (!PteeTPROpt) + continue; + void *TgtPteeBegin = PteeTPROpt->TargetPointer; + + // Update the device pointer to point to device pointee. + int Ret = performPointerAttachment(Device, AsyncInfo, HstPtr, HstPteeBase, + HstPteeBegin, TgtPtrBase, TgtPteeBegin, + PtrSize, PtrTPR); + if (Ret != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + DP("ATTACH entry %zu processed successfully\n", EntryIdx); + } + + return OFFLOAD_SUCCESS; +} + namespace { /// This structure contains information to deallocate a target pointer, aka. /// used to fix up the shadow map and potentially delete the entry from the @@ -624,7 +947,8 @@ postProcessingTargetDataEnd(DeviceTy *Device, int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, - void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) { + void **ArgMappers, AsyncInfoTy &AsyncInfo, + AttachInfoTy *AttachInfo, bool FromMapper) { int Ret = OFFLOAD_SUCCESS; auto *PostProcessingPtrs = new SmallVector(); // process each input. @@ -635,6 +959,14 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)) continue; + // Ignore ATTACH entries - they should only be honored on map-entering + // directives. They may be encountered here while handling the "end" part of + // "#pragma omp target". + if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH) { + DP("Ignoring ATTACH entry %d in targetDataEnd\n", I); + continue; + } + if (ArgMappers && ArgMappers[I]) { // Instead of executing the regular path of targetDataEnd, call the // targetDataMapper variant which will call targetDataEnd again @@ -900,7 +1232,8 @@ static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig, int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, - void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) { + void **ArgMappers, AsyncInfoTy &AsyncInfo, + AttachInfoTy *AttachInfo, bool FromMapper) { // process each input. for (int32_t I = 0; I < ArgNum; ++I) { if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || @@ -1213,13 +1546,27 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, if (!DeviceOrErr) FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); + // Create AttachInfo for tracking any ATTACH entries, or new-allocations + // when handling the "begin" mapping for a target constructs. + AttachInfoTy AttachInfo; + int Ret = targetDataBegin(Loc, *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes, - ArgTypes, ArgNames, ArgMappers, AsyncInfo); + ArgTypes, ArgNames, ArgMappers, AsyncInfo, + &AttachInfo, false /*FromMapper=*/); if (Ret != OFFLOAD_SUCCESS) { REPORT("Call to targetDataBegin failed, abort target.\n"); return OFFLOAD_FAIL; } + // Process collected ATTACH entries + if (!AttachInfo.AttachEntries.empty()) { + Ret = processAttachEntries(*DeviceOrErr, AttachInfo, AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Failed to process ATTACH entries.\n"); + return OFFLOAD_FAIL; + } + } + // List of (first-)private arrays allocated for this target region SmallVector TgtArgsPositions(ArgNum, -1); diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index f8db9bf0ae739..d436fa8cc685b 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2538,6 +2538,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { getAgent(), (uint64_t)Size); } + Error dataFence(__tgt_async_info *Async) override { + return Plugin::success(); + } + /// Initialize the async info for interoperability purposes. Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { // TODO: Implement this function. diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 8c17a2ee07047..e4ea79542609d 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -891,6 +891,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy { virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + /// Instert a data fence between previous data operations and the following + /// operations if necessary for the device + virtual Error dataFence(__tgt_async_info *AsyncInfo) = 0; + /// Exchange data between devices (device to device transfer). Calling this /// function is only valid if GenericPlugin::isDataExchangable() passing the /// two devices returns true. @@ -1355,6 +1359,10 @@ struct GenericPluginTy { int DstDeviceId, void *DstPtr, int64_t Size, __tgt_async_info *AsyncInfo); + /// Places a fence between previous data movements and following data movements + /// if necessary on the device + int32_t data_fence(int32_t DeviceId, __tgt_async_info *AsyncInfo); + /// Begin executing a kernel on the given device. int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, KernelArgsTy *KernelArgs, diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 81b9d423e13d8..761068e3f302f 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -2228,3 +2228,14 @@ int32_t GenericPluginTy::get_function(__tgt_device_binary Binary, *KernelPtr = &Kernel; return OFFLOAD_SUCCESS; } + +int32_t GenericPluginTy::data_fence(int32_t DeviceId, __tgt_async_info *AsyncInfo ) { + auto Err = getDevice(DeviceId).dataFence(AsyncInfo); + if (Err) { + REPORT("Failure to place data fence on device %d: %s\n", + DeviceId, toString(std::move(Err)).data()); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} \ No newline at end of file diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index 5a391a4d36006..e396ee8117d0a 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -858,6 +858,10 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::success(); } + Error dataFence(__tgt_async_info *Async) override { + return Plugin::success(); + } + /// Initialize the device info for interoperability purposes. Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override { assert(Context && "Context is null"); diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index d950572265b4c..58b1e69206722 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -295,6 +295,10 @@ struct GenELF64DeviceTy : public GenericDeviceTy { "dataExchangeImpl not supported"); } + Error dataFence(__tgt_async_info *Async) override { + return Plugin::success(); + } + /// All functions are already synchronous. No need to do anything on this /// synchronization function. Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {