Commit c7269b5

Merge pull request #48191 from makortel/removeScopedContextTask
Remove `cms::cuda::ScopedContextTask` as unused
2 parents 87d00d7 + ae04529 · commit c7269b5

File tree

11 files changed: +13 −374 lines


HeterogeneousCore/CUDACore/README.md

Lines changed: 0 additions & 152 deletions
````diff
@@ -302,85 +302,6 @@ void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) {
 [Complete example](../CUDATest/plugins/TestCUDAProducerGPUEW.cc)
 
 
-### Producer with CUDA input and output, and internal chain of CPU and GPU tasks (with ExternalWork)
-
-```cpp
-class ProducerInputOutputCUDA: public edm::stream::EDProducer<ExternalWork> {
-public:
-  ...
-  void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
-  void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override;
-  ...
-private:
-  void addMoreWork(cms::cuda::ScopedContextTask& ctx);
-
-  ...
-  ProducerInputGPUAlgo gpuAlgo_;
-  edm::EDGetTokenT<cms::cuda::Product<InputData>> inputToken_;
-  edm::EDPutTokenT<cms::cuda::Product<OutputData>> outputToken_;
-};
-...
-void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  cms::cuda::Product<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
-
-  // Set the current device to the same that was used to produce
-  // InputData, and also use the same CUDA stream
-  cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_};
-
-  // Grab the real input data. Checks that the input data is on the
-  // current device. If the input data was produced in a different CUDA
-  // stream than the cms::cuda::ScopedContextAcquire holds, create an inter-stream
-  // synchronization point with a CUDA event and cudaStreamWaitEvent()
-  auto const& inputData = ctx.get(inputDataWrapped);
-
-  // Queues asynchronous data transfers and kernels to the CUDA stream
-  // returned by cms::cuda::ScopedContextAcquire::stream()
-  gpuAlgo.makeAsync(inputData, ctx.stream());
-
-  // Push a functor on top of "a stack of tasks" to be run as the next
-  // task after the work queued above, before produce(). In this case ctx
-  // is a context constructed by the calling TBB task, and therefore the
-  // current device and CUDA stream have already been set up. The ctx
-  // internally holds the WaitingTaskWithArenaHolder for the next task.
-  ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
-    addMoreWork(ctx);
-  });
-
-  // Destructor of ctx queues a callback to the CUDA stream notifying
-  // waitingTaskHolder when the queued asynchronous work has finished,
-  // and saves the device and CUDA stream to ctxState_
-}
-
-// Called after the asynchronous work queued in acquire() has finished
-void ProducerInputOutputCUDA::addMoreWork(cms::cuda::ScopedContextTask& ctx) {
-  // Current device and CUDA stream have already been set
-
-  // Queues more asynchronous data transfers and kernels to the CUDA
-  // stream returned by cms::cuda::ScopedContextTask::stream()
-  gpuAlgo.makeMoreAsync(ctx.stream());
-
-  // Destructor of ctx queues a callback to the CUDA stream notifying
-  // waitingTaskHolder when the queued asynchronous work has finished
-}
-
-// Called after the asynchronous work queued in addMoreWork() has finished
-void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) {
-  // Sets again the current device, uses the CUDA stream created in acquire()
-  cms::cuda::ScopedContextProduce ctx{ctxState_};
-
-  // Now getResult() returns data in GPU memory that is passed to the
-  // constructor of OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the
-  // OutputData into cms::cuda::Product<OutputData>. cms::cuda::Product<T> also stores
-  // the current device and the CUDA stream since those will be needed
-  // on the consumer side.
-  ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult());
-}
-```
-
-[Complete example](../CUDATest/plugins/TestCUDAProducerGPUEWTask.cc)
-
 ### Producer with CUDA input and output (without ExternalWork)
 
 If the producer does not need to transfer anything back to CPU (like
````
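The comments in the removed example mention the inter-stream synchronization that `ctx.get()` sets up when the input was produced on a different CUDA stream. As a point of reference, here is a minimal standalone sketch of that pattern using the plain CUDA runtime API; the function name and structure are illustrative, not CMSSW code:

```cpp
#include <cuda_runtime.h>

// Make consumerStream wait for all work already queued on
// producerStream, without blocking the host. This is the
// cudaEventRecord()/cudaStreamWaitEvent() pattern the removed comment
// describes; cms::cuda::ScopedContext hides it behind ctx.get().
void syncStreams(cudaStream_t producerStream, cudaStream_t consumerStream) {
  cudaEvent_t event;
  // Timing information is not needed, so disable it for lower overhead.
  cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
  // Mark the point in producerStream that the consumer must wait for.
  cudaEventRecord(event, producerStream);
  // Work queued on consumerStream after this call waits for the event.
  cudaStreamWaitEvent(consumerStream, event, 0);
  // Safe to destroy immediately; the runtime releases the event
  // asynchronously once it has completed.
  cudaEventDestroy(event);
}
```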
````diff
@@ -791,79 +712,6 @@ The `cms::cuda::ScopedContextAcquire` saves its state to the `ctxState_` in
 the destructor, and `cms::cuda::ScopedContextProduce` then restores the
 context.
 
-#### Module-internal chain of CPU and GPU tasks
-
-Technically `ExternalWork` works such that the framework calls
-`acquire()` with an `edm::WaitingTaskWithArenaHolder` that holds an
-`edm::WaitingTask` (which inherits from `tbb::task`) for calling
-`produce()`, with `std::shared_ptr`-like semantics: spawn the task when
-the reference count hits `0`. It is also possible to create a longer
-chain of such tasks, alternating between CPU and GPU work. This
-mechanism can also be used to re-run (part of) the GPU work.
-
-The "next tasks" to run are essentially structured as a stack, such
-that
-- `cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextTask::pushNextTask()`
-  pushes a new functor on top of the stack
-- Completion of both the asynchronous work and the queueing function
-  pops the top task of the stack and enqueues it (so that TBB
-  eventually runs the task)
-  * Technically the task becomes eligible to run when all copies of
-    the `edm::WaitingTaskWithArenaHolder` of acquire() (or of the
-    "previous" function) have either been destructed or have had their
-    `doneWaiting()` called
-  * The code calling `acquire()` or the functor holds one copy of the
-    `edm::WaitingTaskWithArenaHolder`, so it is guaranteed that the
-    next function will not run before the earlier one has finished
-
-Below is an example of how to push a functor on top of the stack of
-tasks to run next (following the example of the previous section):
-```cpp
-void FooProducerCUDA::acquire(...) {
-  ...
-  ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
-    ...
-  });
-  ...
-}
-```
-
-In this case the `ctx` argument to the function is a
-`cms::cuda::ScopedContextTask` object constructed by the TBB task calling the
-user-given function. It follows that the current device and CUDA
-stream have already been set up. `pushNextTask()` can be called
-many times. On each invocation `pushNextTask()` pushes a new task
-on top of the stack (i.e. in front of the chain). It follows that in
-```cpp
-void FooProducerCUDA::acquire(...) {
-  ...
-  ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
-    ... // function 1
-  });
-  ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
-    ... // function 2
-  });
-  ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
-    ... // function 3
-  });
-  ...
-}
-```
-the functions will be run in the order 3, 2, 1.
-
-**Note** that the `CUDAService` is **not** available (nor is any other
-service) in these intermediate tasks. In the near future memory
-allocations etc. will be made possible by taking them out of the
-`CUDAService`.
-
-`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextTask` also have a
-more generic member function, `replaceWaitingTaskHolder()`, that can
-be used to simply replace the currently-held
-`edm::WaitingTaskWithArenaHolder` (which will get notified by the
-callback function) with anything. In this case the caller is
-responsible for creating the task(s) and setting up the chain of them.
-
 #### Transferring GPU data to CPU
````
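The 3, 2, 1 run order described in the removed text is plain LIFO execution of a stack of continuations. A self-contained C++ sketch of just that semantics, with TBB, the framework, and CUDA stripped away (all names here are hypothetical):

```cpp
#include <functional>
#include <iostream>
#include <stack>

// "Next tasks" kept as a stack of functors: pushNextTask() pushes on
// top, and when the current task's asynchronous work completes the top
// is popped and run, so later pushes run first.
class TaskChain {
public:
  void pushNextTask(std::function<void()> f) { tasks_.push(std::move(f)); }

  // Stand-in for "asynchronous work finished": drain the stack LIFO.
  void runAll() {
    while (!tasks_.empty()) {
      auto f = std::move(tasks_.top());
      tasks_.pop();
      f();
    }
  }

private:
  std::stack<std::function<void()>> tasks_;
};

int main() {
  TaskChain chain;
  chain.pushNextTask([] { std::cout << "function 1\n"; });
  chain.pushNextTask([] { std::cout << "function 2\n"; });
  chain.pushNextTask([] { std::cout << "function 3\n"; });
  chain.runAll();  // prints: function 3, function 2, function 1
}
```

In the real mechanism the pop happens through TBB reference counting rather than a loop, but the stack discipline is what produces the inverted order.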

HeterogeneousCore/CUDACore/interface/ContextState.h

Lines changed: 0 additions & 1 deletion
```diff
@@ -25,7 +25,6 @@ namespace cms {
   private:
     friend class ScopedContextAcquire;
     friend class ScopedContextProduce;
-    friend class ScopedContextTask;
 
     void set(int device, SharedStreamPtr stream) {
       throwIfStream();
```

HeterogeneousCore/CUDACore/interface/ScopedContext.h

Lines changed: 0 additions & 43 deletions
```diff
@@ -183,36 +183,6 @@ namespace cms {
       SharedEventPtr event_ = getEventCache().get();
     };
 
-    /**
-     * The aim of this class is to do the necessary per-task "initialization" for tasks created in ExternalWork acquire():
-     * - setting the current device
-     * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
-     * and to enforce that those get done in a proper way in RAII fashion.
-     */
-    class ScopedContextTask : public impl::ScopedContextBase {
-    public:
-      /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
-      explicit ScopedContextTask(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
-          : ScopedContextBase(state->device(), state->streamPtr()),  // don't move, state is re-used afterwards
-            holderHelper_{std::move(waitingTaskHolder)},
-            contextState_{state} {}
-
-      ~ScopedContextTask();
-
-      template <typename F>
-      void pushNextTask(F&& f) {
-        holderHelper_.pushNextTask(std::forward<F>(f), contextState_);
-      }
-
-      void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-        holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
-      }
-
-    private:
-      impl::ScopedContextHolderHelper holderHelper_;
-      ContextState const* contextState_;
-    };
-
     /**
      * The aim of this class is to do necessary per-event "initialization" in analyze()
      * - setting the current device
@@ -224,19 +194,6 @@ namespace cms {
       /// Constructor to (possibly) re-use a CUDA stream
       explicit ScopedContextAnalyze(const ProductBase& data) : ScopedContextGetterBase(data) {}
     };
-
-    namespace impl {
-      template <typename F>
-      void ScopedContextHolderHelper::pushNextTask(F&& f, ContextState const* state) {
-        auto group = waitingTaskHolder_.group();
-        replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{
-            *group,
-            edm::make_waiting_task_with_holder(std::move(waitingTaskHolder_),
-                                               [state, func = std::forward<F>(f)](edm::WaitingTaskWithArenaHolder h) {
-                                                 func(ScopedContextTask{state, std::move(h)});
-                                               })});
-      }
-    }  // namespace impl
   }  // namespace cuda
 }  // namespace cms
```
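The removed `ScopedContextHolderHelper::pushNextTask()` chains continuations by wrapping the currently-held waiting task inside a new one and swapping it in via `replaceWaitingTaskHolder()`. Below is a deliberately simplified, synchronous sketch of that wrapping trick, with `std::function` standing in for `edm::WaitingTaskWithArenaHolder` (all names hypothetical):

```cpp
#include <functional>
#include <iostream>
#include <utility>

// holder_ is the continuation to notify when this task's work is done.
// pushNextTask(f) replaces it with "run f, then notify the previous
// continuation", mirroring what make_waiting_task_with_holder achieves
// asynchronously in the removed code.
class HolderHelper {
public:
  explicit HolderHelper(std::function<void()> holder) : holder_(std::move(holder)) {}

  template <typename F>
  void pushNextTask(F&& f) {
    holder_ = [func = std::forward<F>(f), next = std::move(holder_)]() mutable {
      func();  // the newly pushed work runs first...
      next();  // ...then the previously-held continuation is notified
    };
  }

  void notifyDone() { holder_(); }

private:
  std::function<void()> holder_;
};

int main() {
  HolderHelper h([] { std::cout << "produce()\n"; });
  h.pushNextTask([] { std::cout << "task 1\n"; });
  h.pushNextTask([] { std::cout << "task 2\n"; });
  h.notifyDone();  // prints: task 2, task 1, produce()
}
```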

HeterogeneousCore/CUDACore/src/ScopedContext.cc

Lines changed: 0 additions & 4 deletions
```diff
@@ -89,8 +89,4 @@ namespace cms::cuda {
     // elsewhere as well.
     cudaEventRecord(event_.get(), stream());
   }
-
-  ////////////////////
-
-  ScopedContextTask::~ScopedContextTask() { holderHelper_.enqueueCallback(device(), stream()); }
 }  // namespace cms::cuda
```
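The removed destructor delegated to `enqueueCallback()`, i.e. it notified the waiting task from a host callback queued on the CUDA stream behind the asynchronous work. A minimal sketch of that notify-on-completion pattern with the raw runtime API (illustrative only; the real helper also handles device selection and error propagation):

```cpp
#include <cuda_runtime.h>
#include <cstdio>

// Host function run by the CUDA runtime once all work previously queued
// on the stream has completed; userData carries the "holder" to notify.
static void CUDART_CB notifyWaiters(void* userData) {
  auto* msg = static_cast<const char*>(userData);
  std::printf("%s\n", msg);  // stand-in for holder.doneWaiting(...)
}

// Queue the notification behind everything already on the stream.
void enqueueCallback(cudaStream_t stream) {
  static const char msg[] = "asynchronous work finished";
  cudaLaunchHostFunc(stream, notifyWaiters, const_cast<char*>(msg));
}
```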

HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc

Lines changed: 0 additions & 136 deletions
This file was deleted.

HeterogeneousCore/CUDATest/python/prod6CPU_cfi.py

Lines changed: 0 additions & 4 deletions
This file was deleted.

HeterogeneousCore/CUDATest/python/prod6CUDA_cfi.py

Lines changed: 0 additions & 4 deletions
This file was deleted.

HeterogeneousCore/CUDATest/python/prod6FromCUDA_cfi.py

Lines changed: 0 additions & 4 deletions
This file was deleted.
