diff --git a/.github/workflows/unit-test-on-pull-request.yml b/.github/workflows/unit-test-on-pull-request.yml index a5d6b067f..70b5c608f 100644 --- a/.github/workflows/unit-test-on-pull-request.yml +++ b/.github/workflows/unit-test-on-pull-request.yml @@ -246,15 +246,27 @@ jobs: sudo go test ./interpreter/... -v -run "TestIntegration/(node-local-nightly|node-latest)" distro-qemu-tests: - name: Full distro QEMU tests (kernel ${{ matrix.kernel }}) + name: Distro QEMU tests (${{ matrix.kernel }} ${{ matrix.target_arch }}) runs-on: ubuntu-24.04 timeout-minutes: 15 strategy: matrix: - kernel: - #- 5.10.217 # 5.10 doesn't have bpf cookies - - 5.15.159 - - 6.8.10 # Post-6.6, supports multi-uprobe + include: + - { target_arch: amd64, kernel: 5.4.276 } + - { target_arch: amd64, kernel: 5.10.217 } + - { target_arch: amd64, kernel: 5.15.159 } + - { target_arch: amd64, kernel: 6.1.91 } + - { target_arch: amd64, kernel: 6.6.31 } + - { target_arch: amd64, kernel: 6.8.10 } + - { target_arch: amd64, kernel: 6.9.1 } + - { target_arch: amd64, kernel: 6.12.16 } + - { target_arch: amd64, kernel: 6.16 } + + # ARM64 (NOTE: older ARM64 kernels are not available in Cilium repos) + - { target_arch: arm64, kernel: 6.6.31 } + - { target_arch: arm64, kernel: 6.8.4 } + - { target_arch: arm64, kernel: 6.9.1 } + - { target_arch: arm64, kernel: 6.12.16 } steps: - name: Clone code uses: actions/checkout@v4 @@ -263,15 +275,32 @@ jobs: with: go-version-file: go.mod cache-dependency-path: go.sum + - name: Set up environment + uses: ./.github/workflows/env - name: Install dependencies run: | sudo apt-get update -y - sudo apt-get install -y qemu-system-x86 debootstrap systemtap-sdt-dev + case "${{ matrix.target_arch }}" in + amd64) sudo apt-get -y install qemu-system-x86;; + arm64) sudo apt-get -y install qemu-system-arm;; + *) echo >&2 "bug: bad arch selected"; exit 1;; + esac + sudo apt-get install -y debootstrap systemtap-sdt-dev - name: Download kernel run: | cd test/distro-qemu + case "${{ 
matrix.target_arch }}" in + amd64) export QEMU_ARCH=x86_64;; + arm64) export QEMU_ARCH=aarch64;; + *) echo >&2 "bug: bad arch selected"; exit 1;; + esac ./download-kernel.sh ${{ matrix.kernel }} - - name: Run RTLD tests in QEMU + - name: Run Full Distro tests in QEMU run: | cd test/distro-qemu + case "${{ matrix.target_arch }}" in + amd64) export QEMU_ARCH=x86_64;; + arm64) export QEMU_ARCH=aarch64;; + *) echo >&2 "bug: bad arch selected"; exit 1;; + esac ./build-and-run.sh ${{ matrix.kernel }} diff --git a/interpreter/gpu/cuda.go b/interpreter/gpu/cuda.go index a23f491ba..dac6301c6 100644 --- a/interpreter/gpu/cuda.go +++ b/interpreter/gpu/cuda.go @@ -50,7 +50,6 @@ type data struct { type Instance struct { interpreter.InstanceStubs path string - link interpreter.LinkCloser pid libpf.PID } @@ -68,9 +67,9 @@ func Loader(ebpf interpreter.EbpfHandler, info *interpreter.LoaderInfo) (interpr if err != nil { return nil, err } + // We use the existence of the .note.stapsdt section to determine if this is a - // process that has libparcagpucupti.so loaded. Its cheaper and more reliable than loading - // the symbol table. + // process that has libparcagpucupti.so loaded. probes, err := ef.ParseUSDTProbes() if err != nil { return nil, err @@ -96,7 +95,6 @@ func Loader(ebpf interpreter.EbpfHandler, info *interpreter.LoaderInfo) (interpr return nil, nil } - func (d *data) Attach(ebpf interpreter.EbpfHandler, pid libpf.PID, _ libpf.Address, _ remotememory.RemoteMemory) (interpreter.Instance, error) { // Maps usdt probe name to ebpf program name. 
@@ -115,12 +113,19 @@ func (d *data) Attach(ebpf interpreter.EbpfHandler, pid libpf.PID, _ libpf.Addre progNames[i] = "usdt_parcagpu_cuda_kernel" } } - lc, err := ebpf.AttachUSDTProbes(pid, d.path, "cuda_probe", d.probes, cookies, progNames, true) - if err != nil { - return nil, err + + var lc interpreter.LinkCloser + if d.link == nil { + var err error + lc, err = ebpf.AttachUSDTProbes(pid, d.path, "cuda_probe", d.probes, cookies, progNames) + if err != nil { + return nil, err + } + log.Debugf("[cuda] parcagpu USDT probes attached for %s", d.path) + d.link = lc + } else { + log.Debugf("[cuda] parcagpu USDT probes already attached for %s", d.path) } - log.Debugf("[cuda] parcagpu USDT probes attached for %s", d.path) - d.link = lc // Create and register fixer for this PID fixer := &gpuTraceFixer{ @@ -129,24 +134,14 @@ func (d *data) Attach(ebpf interpreter.EbpfHandler, pid libpf.PID, _ libpf.Addre } gpuFixers.Store(pid, fixer) - return &Instance{ - link: lc, path: d.path, pid: pid, }, nil } -// Detach removes the fixer for this PID and closes the link if needed. 
func (i *Instance) Detach(_ interpreter.EbpfHandler, _ libpf.PID) error { gpuFixers.Delete(i.pid) - - if i.link != nil { - log.Debugf("[cuda] parcagpu USDT probes closed for %s", i.path) - if err := i.link.Detach(); err != nil { - return err - } - } return nil } diff --git a/interpreter/instancestubs.go b/interpreter/instancestubs.go index fd89ebd2d..0aa4f7f03 100644 --- a/interpreter/instancestubs.go +++ b/interpreter/instancestubs.go @@ -73,7 +73,7 @@ func (m *EbpfHandlerStubs) DeleteProcData(libpf.InterpreterType, libpf.PID) erro } func (mockup *EbpfHandlerStubs) AttachUSDTProbes(libpf.PID, string, string, []pfelf.USDTProbe, - []uint64, []string, bool) (LinkCloser, error) { + []uint64, []string) (LinkCloser, error) { return nil, nil } diff --git a/interpreter/rtld/rtld.go b/interpreter/rtld/rtld.go index c8455c975..308d035e9 100644 --- a/interpreter/rtld/rtld.go +++ b/interpreter/rtld/rtld.go @@ -22,7 +22,6 @@ type data struct { // instance represents a per-PID instance of the dlopen interpreter type instance struct { interpreter.InstanceStubs - lc interpreter.LinkCloser } // Loader detects if the ELF file contains the dlopen symbol in its dynamic symbol table @@ -37,7 +36,6 @@ func Loader(ebpf interpreter.EbpfHandler, info *interpreter.LoaderInfo) (interpr // Look for the dlopen symbol in the dynamic symbol table sym, err := ef.LookupSymbol("dlopen") if err != nil || sym == nil { - // No dlopen symbol found, this library doesn't support dynamic loading return nil, nil } @@ -52,26 +50,21 @@ func Loader(ebpf interpreter.EbpfHandler, info *interpreter.LoaderInfo) (interpr // Attach attaches the uprobe to the dlopen function func (d *data) Attach(ebpf interpreter.EbpfHandler, pid libpf.PID, bias libpf.Address, _ remotememory.RemoteMemory) (interpreter.Instance, error) { - // Attach uprobe to dlopen using the address stored during Loader - lc, err := ebpf.AttachUprobe(pid, d.path, d.address, "uprobe_dlopen") - if err != nil { - return nil, fmt.Errorf("failed to 
attach uprobe to dlopen: %w", err) + var lc interpreter.LinkCloser + if d.lc == nil { + // Attach uprobe to dlopen using the address stored during Loader + var err error + lc, err = ebpf.AttachUprobe(pid, d.path, d.address, "uprobe_dlopen") + if err != nil { + return nil, fmt.Errorf("failed to attach uprobe to dlopen: %w", err) + } + d.lc = lc } log.Debugf("[dlopen] Attached uprobe to dlopen for PID %d on %s at 0x%x", pid, d.path, d.address) - d.lc = lc - return &instance{lc: lc}, nil -} - -// Detach removes the uprobe -func (i *instance) Detach(_ interpreter.EbpfHandler, pid libpf.PID) error { - log.Debugf("[dlopen] Detach called for PID %d", pid) - if i.lc != nil { - return i.lc.Detach() - } - return nil + return &instance{}, nil } // Unload cleans up the uprobe link @@ -80,7 +73,6 @@ func (d *data) Unload(_ interpreter.EbpfHandler) { if err := d.lc.Unload(); err != nil { log.Errorf("[dlopen] Failed to unload uprobe link: %v", err) } - d.lc = nil } log.Debugf("[dlopen] Unloaded uprobe for %s", d.path) } diff --git a/interpreter/rtld/rtld_test.go b/interpreter/rtld/rtld_test.go index 67f576175..d11c46c43 100644 --- a/interpreter/rtld/rtld_test.go +++ b/interpreter/rtld/rtld_test.go @@ -1,8 +1,6 @@ // Copyright The OpenTelemetry Authors // SPDX-License-Identifier: Apache-2.0 -//go:build amd64 && !integration - package rtld_test import ( @@ -15,6 +13,7 @@ import ( "github.com/coreos/pkg/dlopen" log "github.com/sirupsen/logrus" "github.com/stretchr/testify/require" + "go.opentelemetry.io/ebpf-profiler/libpf" "go.opentelemetry.io/ebpf-profiler/metrics" "go.opentelemetry.io/ebpf-profiler/support" "go.opentelemetry.io/ebpf-profiler/testutils" @@ -23,22 +22,32 @@ import ( "go.opentelemetry.io/ebpf-profiler/util" ) -func TestIntegration(t *testing.T) { +func test(t *testing.T) { if !testutils.IsRoot() { t.Skip("This test requires root privileges") } + // Enable debug logging for CI debugging + if os.Getenv("DEBUG_TEST") != "" { + log.SetLevel(log.DebugLevel) + } + // 
Create a context for the tracer ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() // Start the tracer with all tracers enabled traceCh, trc := testutils.StartTracer(ctx, t, - tracertypes.AllTracers(), + tracertypes.IncludedTracers(0), &testutils.MockReporter{}, false) defer trc.Close() + trc.StartPIDEventProcessor(ctx) + + // tickle this process to speed things up + trc.ForceProcessPID(libpf.PID(uint32(os.Getpid()))) + // Consume traces to prevent blocking go func() { for { @@ -73,70 +82,20 @@ func TestIntegration(t *testing.T) { // Check that the metric was incremented return finalCount > initialCount - }, 10*time.Second, 50*time.Millisecond) + }, 10*time.Second, 100*time.Millisecond) } -func TestIntegrationSingleShot(t *testing.T) { - if !testutils.IsRoot() { - t.Skip("This test requires root privileges") - } - - // Enable debug logging for CI debugging - if os.Getenv("DEBUG_TEST") != "" { - log.SetLevel(log.DebugLevel) - } +func TestIntegration(t *testing.T) { + test(t) +} - // Override HasMultiUprobeSupport to force single-shot mode +func TestIntegrationSingleShot(t *testing.T) { + // Override HasMultiUprobeSupport to force single-shot mode on newer kernels. multiUProbeOverride := false util.SetTestOnlyMultiUprobeSupport(&multiUProbeOverride) defer util.SetTestOnlyMultiUprobeSupport(nil) - // Create a context for the tracer - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - // Start the tracer with all tracers enabled - traceCh, trc := testutils.StartTracer(ctx, t, - tracertypes.AllTracers(), - &testutils.MockReporter{}, - false) - defer trc.Close() - - // Consume traces to prevent blocking - go func() { - for { - select { - case <-ctx.Done(): - return - case <-traceCh: - // Discard traces - } - } - }() - - // retry a few times to get the metric, our process has to be detected and - // the dlopen uprobe has to attach. 
- require.Eventually(t, func() bool { - // Get the initial metric value - initialCount := getEBPFMetricValue(trc, metrics.IDDlopenUprobeHits) - //t.Logf("Initial dlopen uprobe metric count: %d", initialCount) - - // Use dlopen to load a shared library - // libm is a standard math library that's always present - lib, err := dlopen.GetHandle([]string{ - "/lib/x86_64-linux-gnu/libm.so.6", - "libm.so.6", - }) - require.NoError(t, err, "Failed to open libm.so.6") - defer lib.Close() - - // Get the metrics after dlopen - finalCount := getEBPFMetricValue(trc, metrics.IDDlopenUprobeHits) - //t.Logf("Final dlopen uprobe metric count: %d", finalCount) - - // Check that the metric was incremented - return finalCount > initialCount - }, 10*time.Second, 50*time.Millisecond) + test(t) } func getEBPFMetricValue(trc *tracer.Tracer, metricID metrics.MetricID) uint64 { diff --git a/interpreter/types.go b/interpreter/types.go index 0a4e77687..ef1698169 100644 --- a/interpreter/types.go +++ b/interpreter/types.go @@ -117,17 +117,13 @@ type EbpfHandler interface { // AttachUSDTProbes attaches an eBPF program to USDT probes in the specified binary. // // Parameters: - // - pid: The process ID. Required for older kernels (pre-6.6) that cannot attach to shared - // libraries without a PID. On newer kernels with multi-uprobe support, this is ignored - // when probeAll is true. + // - pid: The process ID. Required for getting path to exe via procfs. // - path: Full path to the binary containing the USDT probes. // - multiProgName: Name of eBPF program to use for multi-uprobe attachment (newer kernels). // - probes: The USDT probe definitions to attach to. // - cookies: Optional cookies to pass to the eBPF program (one per probe, or nil). // - singleProgNames: eBPF program names for single-shot attachment (older kernels, one // per probe). - // - probeAll: If true and the kernel supports it, attach to all processes using this - // binary. If false, only attach to the specified pid. 
// // Returns: // - LinkCloser: A handle to the attached probes. The caller must: @@ -136,14 +132,13 @@ type EbpfHandler interface { // 2. Call LinkCloser.Detach() from Instance.Detach() to detach from the specific PID // 3. Call LinkCloser.Unload() from Data.Unload() to fully clean up the eBPF program AttachUSDTProbes(pid libpf.PID, path, multiProgName string, probes []pfelf.USDTProbe, - cookies []uint64, singleProgNames []string, probeAll bool) (LinkCloser, error) + cookies []uint64, singleProgNames []string) (LinkCloser, error) // AttachUprobe attaches an eBPF uprobe to a function at a specific offset in a binary AttachUprobe(pid libpf.PID, path string, offset uint64, progName string) (LinkCloser, error) } type LinkCloser interface { - Detach() error Unload() error } diff --git a/processmanager/ebpf/ebpf.go b/processmanager/ebpf/ebpf.go index 1154e6d2e..49da323b1 100644 --- a/processmanager/ebpf/ebpf.go +++ b/processmanager/ebpf/ebpf.go @@ -143,8 +143,9 @@ func LoadMaps(ctx context.Context, maps map[string]*cebpf.Map, } type linkCloser struct { - detachLink []link.Link - unloadLink link.Link + unloadLink []link.Link + unloadSpecIDs []uint32 // spec IDs to delete when unload happens + specMap *cebpf.Map // reference to the spec map for cleanup } // populateUSDTSpecMaps parses USDT probe arguments and populates the BPF spec maps. 
@@ -176,7 +177,7 @@ func populateUSDTSpecMaps(probes []pfelf.USDTProbe, specMap *cebpf.Map, startSpe specIDs[i] = specID // Store the spec in the map - if err := specMap.Put(&specID, pfelf.USDTSpecToBytes(spec)); err != nil { + if err := specMap.Put(unsafe.Pointer(&specID), pfelf.USDTSpecToBytes(spec)); err != nil { return nil, fmt.Errorf("failed to store USDT spec for %s:%s: %w", probe.Provider, probe.Name, err) } @@ -185,34 +186,40 @@ func populateUSDTSpecMaps(probes []pfelf.USDTProbe, specMap *cebpf.Map, startSpe return specIDs, nil } -func (lc *linkCloser) Detach() error { +func (lc *linkCloser) Unload() error { var errs []error - if lc.detachLink != nil { - for _, l := range lc.detachLink { + if lc.unloadLink != nil { + for _, l := range lc.unloadLink { if err := l.Close(); err != nil { errs = append(errs, err) } } } - return errors.Join(errs...) -} - -func (lc *linkCloser) Unload() error { - if lc.unloadLink != nil { - return lc.unloadLink.Close() + // Clean up spec IDs associated with unload + if lc.specMap != nil && len(lc.unloadSpecIDs) > 0 { + for _, specID := range lc.unloadSpecIDs { + if specID != 0 { + if err := lc.specMap.Delete(unsafe.Pointer(&specID)); err != nil { + log.Warnf("Failed to delete spec ID %d from map: %v", specID, err) + errs = append(errs, err) + } else { + log.Debugf("Deleted spec ID %d from map during unload", specID) + } + } + } } - return nil + return errors.Join(errs...) } // AttachUSDTProbes allows interpreters to attach to usdt probes. 
func (impl *ebpfMapsImpl) AttachUSDTProbes(pid libpf.PID, path, multiProgName string, - probes []pfelf.USDTProbe, cookies []uint64, singleProgNames []string, - probeAll bool) (interpreter.LinkCloser, error) { - containerPath := fmt.Sprintf("/proc/%d/root/%s", pid, path) + probes []pfelf.USDTProbe, cookies []uint64, singleProgNames []string) (interpreter.LinkCloser, error) { + useMulti := util.HasMultiUprobeSupport() + if !useMulti && len(probes) > 1 && multiProgName != "" && len(singleProgNames) == 0 { + return nil, errors.New("uprobe multi attach requires kernel support (kernel 6.6+)") + } - // TODO: This will crack open the exe with debug.elf and read symbols, we should - // contribute a PR to cilium to allow it to delegate to pfelf instead. This will - // also allow us to avoid the proc/pid/root stuff. + containerPath := fmt.Sprintf("/proc/%d/root/%s", pid, path) exe, err := link.OpenExecutable(containerPath) if err != nil { // The upstack code will swallow file not found errors so drop a crumb. 
@@ -226,30 +233,19 @@ func (impl *ebpfMapsImpl) AttachUSDTProbes(pid libpf.PID, path, multiProgName st // Parse USDT arguments and populate spec maps using the helper var specIDs []uint32 - if impl.usdtSpecsMap != nil { - // Get the starting spec ID and update nextSpecID under lock - startSpecID := func() uint32 { - impl.specIDLock.Lock() - defer impl.specIDLock.Unlock() - specID := impl.nextSpecID - impl.nextSpecID += uint32(len(probes)) - return specID - }() - - // Populate USDT spec maps directly - var err error - specIDs, err = populateUSDTSpecMaps(probes, impl.usdtSpecsMap, startSpecID) - if err != nil { - return nil, fmt.Errorf("failed to populate USDT spec maps: %w", err) - } + // Get the starting spec ID and update nextSpecID under lock + startSpecID := func() uint32 { + impl.specIDLock.Lock() + defer impl.specIDLock.Unlock() + specID := impl.nextSpecID + impl.nextSpecID += uint32(len(probes)) + return specID + }() - // Log successful spec population - for i, specID := range specIDs { - if specID != 0 { - log.Debugf("Stored USDT spec %d for %s:%s", - specID, probes[i].Provider, probes[i].Name) - } - } + // Populate USDT spec maps directly + specIDs, err = populateUSDTSpecMaps(probes, impl.usdtSpecsMap, startSpecID) + if err != nil { + return nil, fmt.Errorf("failed to populate USDT spec maps: %w", err) } names := make([]string, 0, len(probes)) @@ -284,8 +280,6 @@ func (impl *ebpfMapsImpl) AttachUSDTProbes(pid libpf.PID, path, multiProgName st } } - useMulti := util.HasMultiUprobeSupport() - // If multiProgName is empty or multi-probe not supported, use individual programs (one per probe) if multiProgName == "" || !useMulti { if singleProgNames == nil { @@ -326,7 +320,6 @@ func (impl *ebpfMapsImpl) AttachUSDTProbes(pid libpf.PID, path, multiProgName st uprobeOpts := &link.UprobeOptions{ Address: probe.Location, RefCtrOffset: probe.SemaphoreOffset, - PID: int(pid), } // Set cookie if provided @@ -348,7 +341,11 @@ func (impl *ebpfMapsImpl) AttachUSDTProbes(pid 
libpf.PID, path, multiProgName st } log.Infof("Attached %d individual probes to %s in PID %d", len(links), path, pid) - return &linkCloser{detachLink: links}, nil + return &linkCloser{ + unloadLink: links, + unloadSpecIDs: specIDs, + specMap: impl.usdtSpecsMap, + }, nil } prog := impl.userProgs[multiProgName] @@ -359,47 +356,10 @@ func (impl *ebpfMapsImpl) AttachUSDTProbes(pid libpf.PID, path, multiProgName st prog = impl.userProgs[multiProgName] } - if !useMulti && len(probes) > 1 { - return nil, errors.New("attaching multiple probes requires multi support (kernel 6.6+)") - } - - // Determine PID for attachment - attachPID := int(pid) - if probeAll { - attachPID = 0 // 0 means all processes - } - - // Single probe with single program - use single uprobe - if len(probes) == 1 { - uprobeOpts := &link.UprobeOptions{ - Address: probes[0].Location, - RefCtrOffset: probes[0].SemaphoreOffset, - PID: attachPID, - } - if finalCookies != nil && len(finalCookies) > 0 { - uprobeOpts.Cookie = finalCookies[0] - } - - l, err := exe.Uprobe(probes[0].Name, prog, uprobeOpts) - if err != nil { - return nil, fmt.Errorf("failed to attach USDT probe %s at location 0x%x: %w", - probes[0].Name, probes[0].Location, err) - } - log.Infof("Attached probe %s to usdt %s in PID %d", multiProgName, path, pid) - return &linkCloser{unloadLink: l}, nil - } - - // Multiple probes - use UprobeMulti - var probePid uint32 - if attachPID != 0 { - probePid = uint32(attachPID) - } - lnk, err := exe.UprobeMulti(names, prog, &link.UprobeMultiOptions{ Addresses: addresses, RefCtrOffsets: offsets, Cookies: finalCookies, - PID: probePid, }) if err != nil { return nil, fmt.Errorf("failed to attach USDT probes with UprobeMulti to %s: %s %w", @@ -407,7 +367,11 @@ func (impl *ebpfMapsImpl) AttachUSDTProbes(pid libpf.PID, path, multiProgName st } log.Infof("Attached probe %s to usdt %s in PID %d", multiProgName, path, pid) - return &linkCloser{unloadLink: lnk}, nil + return &linkCloser{ + unloadLink: 
[]link.Link{lnk}, + unloadSpecIDs: specIDs, + specMap: impl.usdtSpecsMap, + }, nil } // loadProgram loads an eBPF program from progSpec and populates the related maps. @@ -423,7 +387,6 @@ func (impl *ebpfMapsImpl) loadUSDTProgram(progName string, useMulti bool) error defer restoreRlimit() if useMulti { - log.Infof("Loading USDT multi-probe program %s", progName) progSpec.AttachType = cebpf.AttachTraceUprobeMulti } @@ -487,15 +450,13 @@ func (impl *ebpfMapsImpl) AttachUprobe(pid libpf.PID, path string, offset uint64 // Attach the uprobe lnk, err := exe.Uprobe("", prog, &link.UprobeOptions{ Address: offset, - PID: int(pid), }) if err != nil { return nil, fmt.Errorf("failed to attach uprobe to %s at offset 0x%x: %w", path, offset, err) } - log.Infof("Attached uprobe %s to %s at offset 0x%x in PID %d", progName, path, offset, pid) - return &linkCloser{detachLink: []link.Link{lnk}}, nil + return &linkCloser{unloadLink: []link.Link{lnk}}, nil } func (impl *ebpfMapsImpl) CoredumpTest() bool { diff --git a/processmanager/execinfomanager/manager.go b/processmanager/execinfomanager/manager.go index 4cc4a3169..ec7681ff5 100644 --- a/processmanager/execinfomanager/manager.go +++ b/processmanager/execinfomanager/manager.go @@ -143,7 +143,12 @@ func NewExecutableInfoManager( interpreterLoaders = append(interpreterLoaders, oomwatcher.Loader, rtld.Loader) if includeTracers.Has(types.CUDATracer) { - interpreterLoaders = append(interpreterLoaders, gpu.Loader) + // USDT support requires cookies + if util.HasBpfGetAttachCookie() { + interpreterLoaders = append(interpreterLoaders, gpu.Loader) + } else { + log.Warn("CUDA USDT tracing is not supported on this kernel (missing bpf_get_attach_cookie)") + } } deferredFileIDs, err := lru.NewSynced[host.FileID, libpf.Void](deferredFileIDSize, diff --git a/support/ebpf/tracer.ebpf.amd64 b/support/ebpf/tracer.ebpf.amd64 index 9a1b94512..1422c095f 100644 Binary files a/support/ebpf/tracer.ebpf.amd64 and b/support/ebpf/tracer.ebpf.amd64 differ 
diff --git a/support/ebpf/tracer.ebpf.arm64 b/support/ebpf/tracer.ebpf.arm64 index bfabc6860..b0911e1a1 100644 Binary files a/support/ebpf/tracer.ebpf.arm64 and b/support/ebpf/tracer.ebpf.arm64 differ diff --git a/support/ebpf/usdt.ebpf.c b/support/ebpf/usdt.ebpf.c index 47bf928a6..616161a6a 100644 --- a/support/ebpf/usdt.ebpf.c +++ b/support/ebpf/usdt.ebpf.c @@ -8,13 +8,9 @@ #define BPF_USDT_MAX_SPEC_CNT 256 #endif -#ifndef BPF_USDT_MAX_IP_CNT - #define BPF_USDT_MAX_IP_CNT (4 * BPF_USDT_MAX_SPEC_CNT) -#endif - // USDT specification maps (libbpf-compatible) bpf_map_def SEC("maps") __bpf_usdt_specs = { - .type = BPF_MAP_TYPE_ARRAY, + .type = BPF_MAP_TYPE_HASH, .key_size = sizeof(u32), .value_size = sizeof(struct bpf_usdt_spec), .max_entries = BPF_USDT_MAX_SPEC_CNT, diff --git a/support/usdt/test/usdt_integration_test.go b/support/usdt/test/usdt_integration_test.go index 72792800e..7fb87c0d7 100644 --- a/support/usdt/test/usdt_integration_test.go +++ b/support/usdt/test/usdt_integration_test.go @@ -21,6 +21,7 @@ import ( "go.opentelemetry.io/ebpf-profiler/reporter" "go.opentelemetry.io/ebpf-profiler/tracer" tracertypes "go.opentelemetry.io/ebpf-profiler/tracer/types" + "go.opentelemetry.io/ebpf-profiler/util" ) type mockIntervals struct{} @@ -36,15 +37,15 @@ func (mockReporter) ExecutableMetadata(_ *reporter.ExecutableMetadataArgs) {} // testSetup encapsulates all the common test setup type testSetup struct { - t *testing.T - testBinary string - testProbes map[string]pfelf.USDTProbe - probeList []pfelf.USDTProbe - tracer *tracer.Tracer - ebpfHandler interpreter.EbpfHandler - resultsMap *cebpf.Map - ctx context.Context - cancelFunc context.CancelFunc + t *testing.T + testBinary string + testProbes map[string]pfelf.USDTProbe + probeList []pfelf.USDTProbe + tracer *tracer.Tracer + ebpfHandler interpreter.EbpfHandler + resultsMap *cebpf.Map + ctx context.Context + cancelFunc context.CancelFunc } // setupTest performs all common initialization for USDT integration tests 
@@ -53,6 +54,10 @@ func setupTest(t *testing.T) *testSetup { t.Skip("This test requires root privileges to load eBPF programs") } + if !util.HasBpfGetAttachCookie() { + t.Skip("This test requires kernel support for bpf_get_attach_cookie") + } + // Get the test binary path testBinary, err := os.Executable() if err != nil { @@ -218,14 +223,14 @@ func TestUSDTProbeWithEBPFSingle(t *testing.T) { // Individual program names for each probe progNames := []string{ - "usdt_simple_probe", - "usdt_memory_probe", - "usdt_const_probe", - "usdt_mixed_probe", - "usdt_int32_args", - "usdt_int64_args", - "usdt_mixed_refs", - "usdt_uint8_args", + "simple_probe", + "memory_probe", + "const_probe", + "mixed_probe", + "int32_args", + "int64_args", + "mixed_refs", + "uint8_args", } // Attach USDT probes with individual programs @@ -237,12 +242,11 @@ func TestUSDTProbeWithEBPFSingle(t *testing.T) { setup.probeList, nil, // no user cookies, just spec IDs progNames, - false, // attach to current PID only ) if err != nil { t.Fatalf("failed to attach USDT probes: %v", err) } - defer lc.Detach() + defer lc.Unload() // Log what was attached for i, probe := range setup.probeList { @@ -257,6 +261,10 @@ func TestUSDTProbeWithEBPFSingle(t *testing.T) { // TestUSDTProbeWithEBPFMulti tests USDT probes using multi-probe attachment with cookies. // This mimics how CUDA probes work: one multi-probe program that dispatches based on cookie. 
func TestUSDTProbeWithEBPFMulti(t *testing.T) { + if !util.HasMultiUprobeSupport() { + t.Skip("This test requires kernel support for uprobe multi-attach") + } + setup := setupTest(t) defer setup.cleanup() @@ -272,12 +280,11 @@ func TestUSDTProbeWithEBPFMulti(t *testing.T) { setup.probeList, cookies, // cookies for dispatch (probe IDs 1-8) nil, // no individual programs - false, // attach to current PID only ) if err != nil { t.Fatalf("failed to attach USDT probes: %v", err) } - defer lc.Detach() + defer lc.Unload() // Log what was attached t.Logf("Attached multi-probe program usdt_test_multi to %d probes", len(setup.probeList)) diff --git a/test/distro-qemu/build-and-run.sh b/test/distro-qemu/build-and-run.sh index 7334a2481..b68f45af3 100755 --- a/test/distro-qemu/build-and-run.sh +++ b/test/distro-qemu/build-and-run.sh @@ -14,6 +14,16 @@ CACHE_DIR="${CACHE_DIR:-/tmp/debootstrap-cache}" echo "Building rootfs with $DISTRO $RELEASE..." # Clean up previous builds +# First, unmount any leftover mounts from previous debootstrap runs +if [ -d "$ROOTFS_DIR" ]; then + echo "Cleaning up any mounted filesystems in $ROOTFS_DIR..." 
+ # Find all mount points under ROOTFS_DIR and unmount them in reverse order (deepest first) + findmnt -o TARGET -n -l | grep "^$(pwd)/$ROOTFS_DIR" | sort -r | while read -r mountpoint; do + echo " Unmounting $mountpoint" + sudo umount "$mountpoint" || sudo umount -l "$mountpoint" || true + done +fi + sudo rm -rf "$ROOTFS_DIR" "$OUTPUT_DIR" mkdir -p "$ROOTFS_DIR" "$OUTPUT_DIR" "$CACHE_DIR" @@ -26,8 +36,14 @@ case "$QEMU_ARCH" in aarch64) DEBOOTSTRAP_ARCH="arm64" ;; + *) + echo "Unsupported QEMU_ARCH: $QEMU_ARCH" + exit 1 + ;; esac +GOARCH=$DEBOOTSTRAP_ARCH + # Choose mirror based on distro and architecture if [[ "$DISTRO" == "ubuntu" ]]; then # Ubuntu ARM64 packages are on ports.ubuntu.com @@ -45,7 +61,7 @@ echo "Running debootstrap to create $DISTRO $RELEASE rootfs for $DEBOOTSTRAP_ARC sudo debootstrap --variant=minbase \ --arch="$DEBOOTSTRAP_ARCH" \ --cache-dir="$CACHE_DIR" \ - "$RELEASE" "$ROOTFS_DIR" "$MIRROR" + "$RELEASE" "$ROOTFS_DIR" "$MIRROR" || cat "$ROOTFS_DIR/debootstrap/debootstrap.log" # Change ownership of rootfs to current user to avoid needing sudo for subsequent operations sudo chown -R "$(id -u):$(id -g)" "$ROOTFS_DIR" @@ -53,17 +69,6 @@ sudo chown -R "$(id -u):$(id -g)" "$ROOTFS_DIR" # Build the test binary (must be dynamic for dlopen to work) echo "Building test binary for $DISTRO $RELEASE $DEBOOTSTRAP_ARCH..." 
-# Determine Go architecture -GOARCH="amd64" -case "$QEMU_ARCH" in - x86_64) - GOARCH="amd64" - ;; - aarch64) - GOARCH="arm64" - ;; -esac - # For cross-compilation or Ubuntu jammy/noble, local build works (host has compatible or newer glibc) # For older distros, would need Docker build (disabled by default for speed) if [[ "${USE_DOCKER}" == "1" ]] && command -v docker &> /dev/null; then @@ -85,11 +90,16 @@ if [[ "${USE_DOCKER}" == "1" ]] && command -v docker &> /dev/null; then wget -q https://go.dev/dl/go1.24.7.linux-${GOARCH}.tar.gz && \ tar -C /usr/local -xzf go1.24.7.linux-${GOARCH}.tar.gz && \ export PATH=/usr/local/go/bin:\$PATH && \ - CGO_ENABLED=1 go test -c ../../interpreter/rtld ../../support/usdt" + CGO_ENABLED=1 go test -c ../../interpreter/rtld ../../support/usdt/test" else # Local build with cross-compilation if needed echo "Building locally for ${GOARCH}..." - CGO_ENABLED=1 GOARCH=${GOARCH} go test -c ../../interpreter/rtld ../../support/usdt + if [ "$GOARCH" = "arm64" ]; then + # Cross-compile for ARM64 using aarch64-linux-gnu-gcc + CGO_ENABLED=1 GOARCH=${GOARCH} CC=aarch64-linux-gnu-gcc go test -c ../../interpreter/rtld ../../support/usdt/test + else + CGO_ENABLED=1 GOARCH=${GOARCH} go test -c ../../interpreter/rtld ../../support/usdt/test + fi fi # Copy test binary into rootfs @@ -129,7 +139,7 @@ export DEBUG_TEST=1 # Run the tests echo "" -/rtld.test -test.v && /usdt.test -test.v +/rtld.test -test.v && /test.test -test.v RESULT=$? 
if [ $RESULT -eq 0 ]; then @@ -204,7 +214,8 @@ echo "" echo "===== Starting QEMU with kernel ${KERNEL_VERSION} on ${QEMU_ARCH} =====" echo "" -# Run QEMU +# Run QEMU and capture output +QEMU_OUTPUT=$(mktemp) ${sudo} qemu-system-${QEMU_ARCH} ${additionalQemuArgs} \ -nographic \ -monitor none \ @@ -214,15 +225,28 @@ ${sudo} qemu-system-${QEMU_ARCH} ${additionalQemuArgs} \ -initrd "$OUTPUT_DIR/initramfs.gz" \ -append "${CONSOLE_ARG} init=/init quiet loglevel=3" \ -no-reboot \ - -display none + -display none \ + | tee "$QEMU_OUTPUT" -EXIT_CODE=$? - -# QEMU with sysrq poweroff returns 0 on clean shutdown -if [ $EXIT_CODE -eq 0 ]; then +# Parse output for test result +if grep -q "===== TEST PASSED =====" "$QEMU_OUTPUT"; then + rm -f "$QEMU_OUTPUT" + echo "" echo "✅ Test completed successfully" exit 0 +elif grep -q "===== TEST FAILED" "$QEMU_OUTPUT"; then + rm -f "$QEMU_OUTPUT" + echo "" + echo "❌ Test failed" + exit 1 +elif grep -q "===== TEST TIMED OUT =====" "$QEMU_OUTPUT"; then + rm -f "$QEMU_OUTPUT" + echo "" + echo "❌ Test timed out" + exit 124 else - echo "❌ Test failed with QEMU exit code $EXIT_CODE" - exit $EXIT_CODE + rm -f "$QEMU_OUTPUT" + echo "" + echo "❌ Could not determine test result (QEMU may have crashed)" + exit 2 fi \ No newline at end of file diff --git a/tools/coredump/ebpfmaps.go b/tools/coredump/ebpfmaps.go index 548f2bf0d..ccf58a0b0 100644 --- a/tools/coredump/ebpfmaps.go +++ b/tools/coredump/ebpfmaps.go @@ -266,7 +266,7 @@ func (emc *ebpfMapsCoredump) SupportsLPMTrieBatchOperations() bool { } func (emc *ebpfMapsCoredump) AttachUSDTProbes(_ libpf.PID, _, _ string, _ []pfelf.USDTProbe, - _ []uint64, _ []string, _ bool) (interpreter.LinkCloser, error) { + _ []uint64, _ []string) (interpreter.LinkCloser, error) { return nil, nil } diff --git a/tracer/tracer.go b/tracer/tracer.go index 3c5975ae7..d3ea3e2f7 100644 --- a/tracer/tracer.go +++ b/tracer/tracer.go @@ -1204,3 +1204,9 @@ func (t *Tracer) GetEbpfHandler() interpreter.EbpfHandler { func (t 
*Tracer) GetInterpretersForPID(pid libpf.PID) []interpreter.Instance { return t.processManager.GetInterpretersForPID(pid) } + +// ForceProcessPID forces processing of the given PID by sending it to the +// pidEvents channel. Used to speed up tests. +func (t *Tracer) ForceProcessPID(pid libpf.PID) { + t.pidEvents <- libpf.PIDTID(uint64(pid) + uint64(pid)<<32) +} diff --git a/util/util.go b/util/util.go index 375c87aba..60abcaa74 100644 --- a/util/util.go +++ b/util/util.go @@ -5,6 +5,7 @@ package util // import "go.opentelemetry.io/ebpf-profiler/util" import ( "bytes" + "errors" "fmt" "math/bits" "strings" @@ -16,6 +17,7 @@ import ( "github.com/cilium/ebpf" "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/link" log "github.com/sirupsen/logrus" "go.opentelemetry.io/ebpf-profiler/libpf/hash" "golang.org/x/sys/unix" @@ -106,6 +108,9 @@ var ( // multiUprobeSupportCache caches the result of probing for multi-uprobe support multiUprobeSupportOnce sync.Once multiUprobeSupportCached bool + // bpfGetAttachCookieCache caches the result of probing for bpf_get_attach_cookie support + bpfGetAttachCookieOnce sync.Once + bpfGetAttachCookieCached bool ) // SetTestOnlyMultiUprobeSupport overrides HasMultiUprobeSupport for testing. @@ -145,10 +150,67 @@ func probeBpfGetAttachCookie() bool { return true } +// HasBpfGetAttachCookie checks if the kernel supports the bpf_get_attach_cookie helper. +// This function uses a cached, once-calculated value for performance. +// +// Note: This function requires CAP_BPF or CAP_SYS_ADMIN capabilities to load the probe +// program. The profiler should already have these privileges. +func HasBpfGetAttachCookie() bool { + bpfGetAttachCookieOnce.Do(func() { + bpfGetAttachCookieCached = probeBpfGetAttachCookie() + }) + + return bpfGetAttachCookieCached +} + +// probeBpfUprobeMultiLink probes for uprobe_multi link support by attempting to create +// an invalid uprobe_multi link. 
This is modeled after libbpf's probe_uprobe_multi_link +// and cilium/ebpf's haveBPFLinkUprobeMulti which is not exposed publicly. +// +// Try to create a link to (invalid binary) which should fail with EBADF if supported +func probeBpfUprobeMultiLink() bool { + prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ + Name: "probe_upm_link", + Type: ebpf.Kprobe, + Instructions: asm.Instructions{ + asm.Mov.Imm(asm.R0, 0), + asm.Return(), + }, + AttachType: ebpf.AttachTraceUprobeMulti, + License: "MIT", + }) + if errors.Is(err, unix.E2BIG) { + // Kernel doesn't support AttachType field. + return false + } + if err != nil { + log.Warnf("Failed to create test program for uprobe_multi link probe: %v", err) + return false + } + defer prog.Close() + + ex := link.Executable{} + + _, err = ex.UprobeMulti([]string{""}, prog, &link.UprobeMultiOptions{ + Addresses: []uint64{1}, + }) + + if errors.Is(err, unix.EBADF) { + return true + } + + if errors.Is(err, unix.EINVAL) { + return false + } + + log.Warnf("Unexpected error when probing for uprobe_multi link support: %v", err) + return false +} + // HasMultiUprobeSupport checks if the kernel supports uprobe multi-attach. -// Multi-uprobes are needed because single-shot uprobes don't work for shared libraries. -// This function probes for bpf_get_attach_cookie support, which is required for -// multi-uprobes and was introduced alongside them in kernel 6.6. +// Multi-uprobes allow attaching one BPF program to multiple probe points with a single syscall, +// which is more efficient than individual uprobe attachments. +// This function probes for uprobe_multi link support, which was introduced in kernel 6.6. // // Note: This function requires CAP_BPF or CAP_SYS_ADMIN capabilities to load the probe // program. The profiler should already have these privileges. 
@@ -158,7 +220,7 @@ func HasMultiUprobeSupport() bool { } multiUprobeSupportOnce.Do(func() { - multiUprobeSupportCached = probeBpfGetAttachCookie() + multiUprobeSupportCached = probeBpfUprobeMultiLink() }) return multiUprobeSupportCached