@@ -92,20 +92,6 @@ asynchronous operations that do not need to wait until the server has completed
 them. If an operation requires more data than the fixed size buffer, we simply
 send multiple packets back and forth in a streaming fashion.

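As an aside, the streaming behavior can be pictured with a small host-side sketch. The ``Packet`` struct and ``send_packet`` callback below are hypothetical stand-ins for the shared buffer slot and a single transfer; they are not part of the library's interface.

.. code-block:: c++

  #include <algorithm>
  #include <cstdint>
  #include <cstring>
  #include <functional>
  #include <string>

  // Hypothetical fixed-size packet standing in for one shared buffer slot.
  struct Packet {
    uint64_t size;
    char data[56];
  };

  // Splits a payload that is larger than a single packet into a sequence of
  // fixed-size packets, mirroring the streaming behavior described above.
  void stream_send(const std::string &payload,
                   const std::function<void(const Packet &)> &send_packet) {
    uint64_t offset = 0;
    while (offset < payload.size()) {
      Packet packet{};
      packet.size =
          std::min<uint64_t>(payload.size() - offset, sizeof(packet.data));
      std::memcpy(packet.data, payload.data() + offset, packet.size);
      send_packet(packet);
      offset += packet.size;
    }
  }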
-Server Library
---------------
-
-The RPC server's basic functionality is provided by the LLVM C library. A static
-library called ``libllvmlibc_rpc_server.a`` includes handling for the basic
-operations, such as printing or exiting. This has a small API that handles
-setting up the unified buffer and an interface to check the opcodes.
-
-Some operations are too divergent to provide generic implementations for, such
-as allocating device accessible memory. For these cases, we provide a callback
-registration scheme to add a custom handler for any given opcode through the
-port API. More information can be found in the installed header
-``<install>/include/llvmlibc_rpc_server.h``.
-
 Client Example
 --------------

@@ -183,7 +169,7 @@ CUDA Server Example

 The following code shows an example of using the exported RPC interface along
 with the C library to manually configure a working server using the CUDA
-language. Other runtimes can use the presence of the ``__llvm_libc_rpc_client``
+language. Other runtimes can use the presence of the ``__llvm_rpc_client``
 in the GPU executable as an indicator for whether or not the server can be
 checked. These details should ideally be handled by the GPU language runtime,
 but the following example shows how it can be used by a standard user.
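As a sketch of that presence check, a runtime using the CUDA driver API can ask a loaded module for the client symbol and only start a server when it is found. The ``image_needs_rpc_server`` helper is hypothetical; the symbol name comes from the paragraph above.

.. code-block:: c++

  #include <cuda.h>

  // Returns true if the loaded GPU module contains the RPC client symbol,
  // meaning the runtime should set up a buffer and poll an RPC server for it.
  bool image_needs_rpc_server(CUmodule module) {
    CUdeviceptr client_addr = 0;
    size_t client_size = 0;
    // cuModuleGetGlobal fails (CUDA_ERROR_NOT_FOUND) when the symbol is absent.
    return cuModuleGetGlobal(&client_addr, &client_size, module,
                             "__llvm_rpc_client") == CUDA_SUCCESS;
  }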
@@ -196,53 +182,16 @@ but the following example shows how it can be used by a standard user.
   #include <cstdlib>
   #include <cuda_runtime.h>

-  #include <llvmlibc_rpc_server.h>
+  #include <shared/rpc.h>
+  #include <shared/rpc_opcodes.h>

   [[noreturn]] void handle_error(cudaError_t err) {
     fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
     exit(EXIT_FAILURE);
   }

-  [[noreturn]] void handle_error(rpc_status_t err) {
-    fprintf(stderr, "RPC error: %d\n", err);
-    exit(EXIT_FAILURE);
-  }
-
-  // The handle to the RPC client provided by the C library.
-  extern "C" __device__ void *__llvm_libc_rpc_client;
-
-  __global__ void get_client_ptr(void **ptr) { *ptr = __llvm_libc_rpc_client; }
-
-  // Obtain the RPC client's handle from the device. The CUDA language cannot look
-  // up the symbol directly like the driver API, so we launch a kernel to read it.
-  void *get_rpc_client() {
-    void *rpc_client = nullptr;
-    void **rpc_client_d = nullptr;
-
-    if (cudaError_t err = cudaMalloc(&rpc_client_d, sizeof(void *)))
-      handle_error(err);
-    get_client_ptr<<<1, 1>>>(rpc_client_d);
-    if (cudaError_t err = cudaDeviceSynchronize())
-      handle_error(err);
-    if (cudaError_t err = cudaMemcpy(&rpc_client, rpc_client_d, sizeof(void *),
-                                     cudaMemcpyDeviceToHost))
-      handle_error(err);
-    return rpc_client;
-  }
-
-  // Routines to allocate mapped memory that both the host and the device can
-  // access asychonrously to communicate with each other.
-  void *alloc_host(size_t size, void *) {
-    void *sharable_ptr;
-    if (cudaError_t err = cudaMallocHost(&sharable_ptr, sizeof(void *)))
-      handle_error(err);
-    return sharable_ptr;
-  };
-
-  void free_host(void *ptr, void *) {
-    if (cudaError_t err = cudaFreeHost(ptr))
-      handle_error(err);
-  }
+  // Routes the library symbol into the CUDA runtime interface.
+  [[gnu::weak]] __device__ rpc::Client client asm("__llvm_rpc_client");

   // The device-side overload of the standard C function to call.
   extern "C" __device__ int puts(const char *);
@@ -251,18 +200,23 @@ but the following example shows how it can be used by a standard user.
   __global__ void hello() { puts("Hello world!"); }

   int main() {
-    // Initialize the RPC server to run on the given device.
-    rpc_device_t device;
-    if (rpc_status_t err =
-            rpc_server_init(&device, RPC_MAXIMUM_PORT_COUNT,
-                            /*warp_size=*/32, alloc_host, /*data=*/nullptr))
+    void *rpc_client = nullptr;
+    if (cudaError_t err = cudaGetSymbolAddress(&rpc_client, client))
+      handle_error(err);
+
+    // Initialize the RPC client and server interface.
+    uint32_t warp_size = 32;
+    void *rpc_buffer = nullptr;
+    if (cudaError_t err = cudaMallocHost(
+            &rpc_buffer,
+            rpc::Server::allocation_size(warp_size, rpc::MAX_PORT_COUNT)))
       handle_error(err);
+    rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
+    rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);

-    // Initialize the RPC client by copying the buffer to the device's handle.
-    void *rpc_client = get_rpc_client();
-    if (cudaError_t err =
-            cudaMemcpy(rpc_client, rpc_get_client_buffer(device),
-                       rpc_get_client_size(), cudaMemcpyHostToDevice))
+    // Initialize the client on the device so it can communicate with the server.
+    if (cudaError_t err = cudaMemcpy(rpc_client, &client, sizeof(rpc::Client),
+                                     cudaMemcpyHostToDevice))
       handle_error(err);

     cudaStream_t stream;
@@ -274,28 +228,25 @@ but the following example shows how it can be used by a standard user.

     // While the kernel is executing, check the RPC server for work to do.
     // Requires non-blocking CUDA kernels but avoids a separate thread.
-    while (cudaStreamQuery(stream) == cudaErrorNotReady)
-      if (rpc_status_t err = rpc_handle_server(device))
-        handle_error(err);
-
-    // Shut down the server running on the given device.
-    if (rpc_status_t err =
-            rpc_server_shutdown(device, free_host, /*data=*/nullptr))
-      handle_error(err);
-
-    return EXIT_SUCCESS;
+    do {
+      auto port = server.try_open(warp_size, /*index=*/0);
+      // From libllvmlibc_rpc_server.a in the installation.
+      if (port) {
+        handle_libc_opcodes(*port, warp_size);
+        // Close the port so it can be reused for the next operation.
+        port->close();
+      }
+    } while (cudaStreamQuery(stream) == cudaErrorNotReady);
   }

 The above code must be compiled in CUDA's relocatable device code mode and with
 the advanced offloading driver to link in the library. Currently this can be
 done with the following invocation. Using LTO avoids the overhead normally
-associated with relocatable device code linking.
+associated with relocatable device code linking. The C library for GPUs is
+linked in by forwarding the static library to the device-side link job.

 .. code-block:: sh

-  $> clang++ -x cuda rpc.cpp --offload-arch=native -fgpu-rdc -lcudart -lcgpu-nvptx \
+  $> clang++ -x cuda rpc.cpp --offload-arch=native -fgpu-rdc -lcudart \
      -I<install-path>/include -L<install-path>/lib -lllvmlibc_rpc_server \
-     -O3 -foffload-lto -o hello
+     -Xoffload-linker -lc -O3 -foffload-lto -o hello
   $> ./hello
   Hello world!

@@ -304,4 +255,5 @@ Extensions

 The opcode is a 32-bit integer that must be unique to the requested operation.
 All opcodes used by ``libc`` internally have the character ``c`` in the most
-significant byte.
+significant byte. Any other opcode is available for use outside of the ``libc``
+implementation.
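As a rough sketch of wiring in a user-defined opcode, the host polling loop from the CUDA example above can branch on the opcode before deferring to the ``libc`` handler. The ``USER_PING`` value is made up for illustration, and the ``get_opcode`` and ``close`` calls are assumed to match the port interface in ``shared/rpc.h``; consult that header for the exact API.

.. code-block:: c++

  // A user-defined opcode. Any value without 'c' (0x63) in the most significant
  // byte stays clear of the range reserved by `libc`.
  constexpr uint32_t USER_PING = 0x01;

  // Drop-in body for the polling loop in the CUDA server example above.
  auto port = server.try_open(warp_size, /*index=*/0);
  if (port) {
    if (port->get_opcode() == USER_PING) {
      // Service the custom operation here using the port's send / recv calls.
    } else {
      // Defer any opcode owned by `libc` to the prebuilt handlers.
      handle_libc_opcodes(*port, warp_size);
    }
    port->close();
  }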