diff --git a/.github/workflows/unit-test-on-pull-request.yml b/.github/workflows/unit-test-on-pull-request.yml index 0a5f7922d..8bfb0dd6b 100644 --- a/.github/workflows/unit-test-on-pull-request.yml +++ b/.github/workflows/unit-test-on-pull-request.yml @@ -238,15 +238,28 @@ jobs: sudo go test ./interpreter/... -v -run TestIntegration distro-qemu-tests: - name: Full distro QEMU tests (kernel ${{ matrix.kernel }}) + name: Full distro QEMU tests (kernel ${{ matrix.kernel }} ${{ matrix.target_arch }}) runs-on: ubuntu-24.04 timeout-minutes: 15 strategy: matrix: - kernel: - #- 5.10.217 # 5.10 doesn't have bpf cookies - - 5.15.159 - - 6.8.10 # Post-6.6, supports multi-uprobe + include: + - { target_arch: amd64, kernel: 5.4.276 } + - { target_arch: amd64, kernel: 5.10.217 } + - { target_arch: amd64, kernel: 5.15.159 } + - { target_arch: amd64, kernel: 6.1.91 } + - { target_arch: amd64, kernel: 6.6.31 } + - { target_arch: amd64, kernel: 6.8.10 } + - { target_arch: amd64, kernel: 6.9.1 } + - { target_arch: amd64, kernel: 6.12.16 } + - { target_arch: amd64, kernel: 6.16 } + + # ARM64 (NOTE: older ARM64 kernels are not available in Cilium repos) + # TODO: get these working + #- { target_arch: arm64, kernel: 6.6.31 } + #- { target_arch: arm64, kernel: 6.8.4 } + #- { target_arch: arm64, kernel: 6.9.1 } + #- { target_arch: arm64, kernel: 6.12.16 } steps: - name: Clone code uses: actions/checkout@v4 @@ -258,12 +271,17 @@ jobs: - name: Install dependencies run: | sudo apt-get update -y - sudo apt-get install -y qemu-system-x86 debootstrap systemtap-sdt-dev + case "${{ matrix.target_arch }}" in + amd64) sudo apt-get -y install qemu-system-x86;; + arm64) sudo apt-get -y install qemu-system-arm;; + *) echo >&2 "bug: bad arch selected"; exit 1;; + esac + sudo apt-get install -y debootstrap systemtap-sdt-dev - name: Download kernel run: | cd test/distro-qemu ./download-kernel.sh ${{ matrix.kernel }} - - name: Run RTLD tests in QEMU + - name: Run Full Distro tests in QEMU run: | cd test/distro-qemu ./build-and-run.sh ${{ matrix.kernel }} diff --git a/interpreter/gpu/cuda.go b/interpreter/gpu/cuda.go index a23f491ba..dac6301c6 100644 --- a/interpreter/gpu/cuda.go +++ b/interpreter/gpu/cuda.go @@ -50,7 +50,6 @@ type data struct { type Instance struct { interpreter.InstanceStubs path string - link interpreter.LinkCloser pid libpf.PID } @@ -68,9 +67,9 @@ func Loader(ebpf interpreter.EbpfHandler, info *interpreter.LoaderInfo) (interpr if err != nil { return nil, err } + // We use the existence of the .note.stapsdt section to determine if this is a - // process that has libparcagpucupti.so loaded. Its cheaper and more reliable than loading - // the symbol table. + // process that has libparcagpucupti.so loaded. probes, err := ef.ParseUSDTProbes() if err != nil { return nil, err @@ -96,7 +95,6 @@ func Loader(ebpf interpreter.EbpfHandler, info *interpreter.LoaderInfo) (interpr return nil, nil } - func (d *data) Attach(ebpf interpreter.EbpfHandler, pid libpf.PID, _ libpf.Address, _ remotememory.RemoteMemory) (interpreter.Instance, error) { // Maps usdt probe name to ebpf program name. 
@@ -115,12 +113,19 @@ func (d *data) Attach(ebpf interpreter.EbpfHandler, pid libpf.PID, _ libpf.Addre progNames[i] = "usdt_parcagpu_cuda_kernel" } } - lc, err := ebpf.AttachUSDTProbes(pid, d.path, "cuda_probe", d.probes, cookies, progNames, true) - if err != nil { - return nil, err + + var lc interpreter.LinkCloser + if d.link == nil { + var err error + lc, err = ebpf.AttachUSDTProbes(pid, d.path, "cuda_probe", d.probes, cookies, progNames) + if err != nil { + return nil, err + } + log.Debugf("[cuda] parcagpu USDT probes attached for %s", d.path) + d.link = lc + } else { + log.Debugf("[cuda] parcagpu USDT probes already attached for %s", d.path) } - log.Debugf("[cuda] parcagpu USDT probes attached for %s", d.path) - d.link = lc // Create and register fixer for this PID fixer := &gpuTraceFixer{ @@ -129,24 +134,14 @@ func (d *data) Attach(ebpf interpreter.EbpfHandler, pid libpf.PID, _ libpf.Addre } gpuFixers.Store(pid, fixer) - return &Instance{ - link: lc, path: d.path, pid: pid, }, nil } -// Detach removes the fixer for this PID and closes the link if needed. func (i *Instance) Detach(_ interpreter.EbpfHandler, _ libpf.PID) error { gpuFixers.Delete(i.pid) - - if i.link != nil { - log.Debugf("[cuda] parcagpu USDT probes closed for %s", i.path) - if err := i.link.Detach(); err != nil { - return err - } - } return nil } diff --git a/interpreter/instancestubs.go b/interpreter/instancestubs.go index fd89ebd2d..0aa4f7f03 100644 --- a/interpreter/instancestubs.go +++ b/interpreter/instancestubs.go @@ -73,7 +73,7 @@ func (m *EbpfHandlerStubs) DeleteProcData(libpf.InterpreterType, libpf.PID) erro } func (mockup *EbpfHandlerStubs) AttachUSDTProbes(libpf.PID, string, string, []pfelf.USDTProbe, - []uint64, []string, bool) (LinkCloser, error) { + []uint64, []string) (LinkCloser, error) { return nil, nil } diff --git a/interpreter/rtld/rtld.go b/interpreter/rtld/rtld.go index c8455c975..038b191c2 100644 --- a/interpreter/rtld/rtld.go +++ b/interpreter/rtld/rtld.go @@ -22,7 +22,6 @@ type data struct { // instance represents a per-PID instance of the dlopen interpreter type instance struct { interpreter.InstanceStubs - lc interpreter.LinkCloser } // Loader detects if the ELF file contains the dlopen symbol in its dynamic symbol table @@ -37,7 +36,6 @@ func Loader(ebpf interpreter.EbpfHandler, info *interpreter.LoaderInfo) (interpr // Look for the dlopen symbol in the dynamic symbol table sym, err := ef.LookupSymbol("dlopen") if err != nil || sym == nil { - // No dlopen symbol found, this library doesn't support dynamic loading return nil, nil } @@ -52,26 +50,21 @@ func Loader(ebpf interpreter.EbpfHandler, info *interpreter.LoaderInfo) (interpr // Attach attaches the uprobe to the dlopen function func (d *data) Attach(ebpf interpreter.EbpfHandler, pid libpf.PID, bias libpf.Address, _ remotememory.RemoteMemory) (interpreter.Instance, error) { - // Attach uprobe to dlopen using the address stored during Loader - lc, err := ebpf.AttachUprobe(pid, d.path, d.address, "uprobe_dlopen") - if err != nil { - return nil, fmt.Errorf("failed to attach uprobe to dlopen: %w", err) + var lc interpreter.LinkCloser + if d.lc == nil { + // Attach uprobe to dlopen using the address stored during Loader + var err error + lc, err = ebpf.AttachUprobe(pid, d.path, d.address, "uprobe_dlopen") + if err != nil { + return nil, fmt.Errorf("failed to attach uprobe to dlopen: %w", err) + } + d.lc = lc } log.Debugf("[dlopen] Attached uprobe to dlopen for PID %d on %s at 0x%x", pid, d.path, d.address) - d.lc = lc - return &instance{lc: 
lc}, nil
-}
-
-// Detach removes the uprobe
-func (i *instance) Detach(_ interpreter.EbpfHandler, pid libpf.PID) error {
-	log.Debugf("[dlopen] Detach called for PID %d", pid)
-	if i.lc != nil {
-		return i.lc.Detach()
-	}
-	return nil
+	return &instance{}, nil
 }
 
 // Unload cleans up the uprobe link
diff --git a/interpreter/rtld/rtld_test.go b/interpreter/rtld/rtld_test.go
index 67f576175..7eeb51e70 100644
--- a/interpreter/rtld/rtld_test.go
+++ b/interpreter/rtld/rtld_test.go
@@ -1,8 +1,6 @@
 // Copyright The OpenTelemetry Authors
 // SPDX-License-Identifier: Apache-2.0
 
-//go:build amd64 && !integration
-
 package rtld_test
 
 import (
@@ -15,6 +13,7 @@ import (
 	"github.com/coreos/pkg/dlopen"
 	log "github.com/sirupsen/logrus"
 	"github.com/stretchr/testify/require"
+	"go.opentelemetry.io/ebpf-profiler/libpf"
 	"go.opentelemetry.io/ebpf-profiler/metrics"
 	"go.opentelemetry.io/ebpf-profiler/support"
 	"go.opentelemetry.io/ebpf-profiler/testutils"
@@ -23,22 +22,35 @@ import (
 	"go.opentelemetry.io/ebpf-profiler/util"
 )
 
-func TestIntegration(t *testing.T) {
+func test(t *testing.T) {
 	if !testutils.IsRoot() {
 		t.Skip("This test requires root privileges")
 	}
 
+	// Enable debug logging for CI debugging
+	if os.Getenv("DEBUG_TEST") != "" {
+		log.SetLevel(log.DebugLevel)
+	}
+
 	// Create a context for the tracer
 	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
 	defer cancel()
 
+	enabledTracers, err := tracertypes.Parse("RTLD")
+	require.NoError(t, err, "Failed to parse enabled tracers")
+
 	// Start the tracer with all tracers enabled
 	traceCh, trc := testutils.StartTracer(ctx, t,
-		tracertypes.AllTracers(),
+		enabledTracers,
 		&testutils.MockReporter{},
 		false)
 	defer trc.Close()
 
+	trc.StartPIDEventProcessor(ctx)
+
+	// Tickle this process to speed things up
+	trc.ForceProcessPID(libpf.PID(uint32(os.Getpid())))
+
 	// Consume traces to prevent blocking
 	go func() {
 		for {
@@ -73,70 +85,20 @@ func TestIntegration(t *testing.T) {
 		// Check that the metric was incremented
 		return finalCount > initialCount
-	}, 10*time.Second, 50*time.Millisecond)
+	}, 10*time.Second, 100*time.Millisecond)
 }
 
-func TestIntegrationSingleShot(t *testing.T) {
-	if !testutils.IsRoot() {
-		t.Skip("This test requires root privileges")
-	}
-
-	// Enable debug logging for CI debugging
-	if os.Getenv("DEBUG_TEST") != "" {
-		log.SetLevel(log.DebugLevel)
-	}
+func TestIntegration(t *testing.T) {
+	test(t)
+}
 
-	// Override HasMultiUprobeSupport to force single-shot mode
+func TestIntegrationSingleShot(t *testing.T) {
+	// Override HasMultiUprobeSupport to force single-shot mode on newer kernels.
 	multiUProbeOverride := false
 	util.SetTestOnlyMultiUprobeSupport(&multiUProbeOverride)
 	defer util.SetTestOnlyMultiUprobeSupport(nil)
 
-	// Create a context for the tracer
-	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
-	defer cancel()
-
-	// Start the tracer with all tracers enabled
-	traceCh, trc := testutils.StartTracer(ctx, t,
-		tracertypes.AllTracers(),
-		&testutils.MockReporter{},
-		false)
-	defer trc.Close()
-
-	// Consume traces to prevent blocking
-	go func() {
-		for {
-			select {
-			case <-ctx.Done():
-				return
-			case <-traceCh:
-				// Discard traces
-			}
-		}
-	}()
-
-	// retry a few times to get the metric, our process has to be detected and
-	// the dlopen uprobe has to attach.
- require.Eventually(t, func() bool { - // Get the initial metric value - initialCount := getEBPFMetricValue(trc, metrics.IDDlopenUprobeHits) - //t.Logf("Initial dlopen uprobe metric count: %d", initialCount) - - // Use dlopen to load a shared library - // libm is a standard math library that's always present - lib, err := dlopen.GetHandle([]string{ - "/lib/x86_64-linux-gnu/libm.so.6", - "libm.so.6", - }) - require.NoError(t, err, "Failed to open libm.so.6") - defer lib.Close() - - // Get the metrics after dlopen - finalCount := getEBPFMetricValue(trc, metrics.IDDlopenUprobeHits) - //t.Logf("Final dlopen uprobe metric count: %d", finalCount) - - // Check that the metric was incremented - return finalCount > initialCount - }, 10*time.Second, 50*time.Millisecond) + test(t) } func getEBPFMetricValue(trc *tracer.Tracer, metricID metrics.MetricID) uint64 { diff --git a/interpreter/types.go b/interpreter/types.go index 0a4e77687..ef1698169 100644 --- a/interpreter/types.go +++ b/interpreter/types.go @@ -117,17 +117,13 @@ type EbpfHandler interface { // AttachUSDTProbes attaches an eBPF program to USDT probes in the specified binary. // // Parameters: - // - pid: The process ID. Required for older kernels (pre-6.6) that cannot attach to shared - // libraries without a PID. On newer kernels with multi-uprobe support, this is ignored - // when probeAll is true. + // - pid: The process ID. Required for getting path to exe via procfs. // - path: Full path to the binary containing the USDT probes. // - multiProgName: Name of eBPF program to use for multi-uprobe attachment (newer kernels). // - probes: The USDT probe definitions to attach to. // - cookies: Optional cookies to pass to the eBPF program (one per probe, or nil). // - singleProgNames: eBPF program names for single-shot attachment (older kernels, one // per probe). - // - probeAll: If true and the kernel supports it, attach to all processes using this - // binary. If false, only attach to the specified pid. // // Returns: // - LinkCloser: A handle to the attached probes. The caller must: @@ -136,14 +132,13 @@ type EbpfHandler interface { // 2. Call LinkCloser.Detach() from Instance.Detach() to detach from the specific PID // 3. Call LinkCloser.Unload() from Data.Unload() to fully clean up the eBPF program AttachUSDTProbes(pid libpf.PID, path, multiProgName string, probes []pfelf.USDTProbe, - cookies []uint64, singleProgNames []string, probeAll bool) (LinkCloser, error) + cookies []uint64, singleProgNames []string) (LinkCloser, error) // AttachUprobe attaches an eBPF uprobe to a function at a specific offset in a binary AttachUprobe(pid libpf.PID, path string, offset uint64, progName string) (LinkCloser, error) } type LinkCloser interface { - Detach() error Unload() error } diff --git a/processmanager/ebpf/ebpf.go b/processmanager/ebpf/ebpf.go index 1154e6d2e..208e46ed9 100644 --- a/processmanager/ebpf/ebpf.go +++ b/processmanager/ebpf/ebpf.go @@ -143,8 +143,9 @@ func LoadMaps(ctx context.Context, maps map[string]*cebpf.Map, } type linkCloser struct { - detachLink []link.Link - unloadLink link.Link + unloadLink []link.Link + unloadSpecIDs []uint32 // spec IDs to delete when unload happens + specMap *cebpf.Map // reference to the spec map for cleanup } // populateUSDTSpecMaps parses USDT probe arguments and populates the BPF spec maps. 
@@ -185,29 +186,60 @@ func populateUSDTSpecMaps(probes []pfelf.USDTProbe, specMap *cebpf.Map, startSpe return specIDs, nil } -func (lc *linkCloser) Detach() error { +/* + func (lc *linkCloser) Detach() error { + var errs []error + if lc.detachLink != nil { + for _, l := range lc.detachLink { + if err := l.Close(); err != nil { + errs = append(errs, err) + } + } + } + // Clean up spec IDs associated with detach + if lc.specMap != nil && len(lc.detachSpecIDs) > 0 { + for _, specID := range lc.detachSpecIDs { + if specID != 0 { + if err := lc.specMap.Delete(&specID); err != nil { + log.Debugf("Failed to delete spec ID %d from map: %v", specID, err) + errs = append(errs, err) + } else { + log.Debugf("Deleted spec ID %d from map during detach", specID) + } + } + } + } + return errors.Join(errs...) + } +*/ +func (lc *linkCloser) Unload() error { var errs []error - if lc.detachLink != nil { - for _, l := range lc.detachLink { + if lc.unloadLink != nil { + for _, l := range lc.unloadLink { if err := l.Close(); err != nil { errs = append(errs, err) } } } - return errors.Join(errs...) -} - -func (lc *linkCloser) Unload() error { - if lc.unloadLink != nil { - return lc.unloadLink.Close() + // Clean up spec IDs associated with unload + if lc.specMap != nil && len(lc.unloadSpecIDs) > 0 { + for _, specID := range lc.unloadSpecIDs { + if specID != 0 { + if err := lc.specMap.Delete(&specID); err != nil { + log.Debugf("Failed to delete spec ID %d from map: %v", specID, err) + errs = append(errs, err) + } else { + log.Debugf("Deleted spec ID %d from map during unload", specID) + } + } + } } - return nil + return errors.Join(errs...) } // AttachUSDTProbes allows interpreters to attach to usdt probes. func (impl *ebpfMapsImpl) AttachUSDTProbes(pid libpf.PID, path, multiProgName string, - probes []pfelf.USDTProbe, cookies []uint64, singleProgNames []string, - probeAll bool) (interpreter.LinkCloser, error) { + probes []pfelf.USDTProbe, cookies []uint64, singleProgNames []string) (interpreter.LinkCloser, error) { containerPath := fmt.Sprintf("/proc/%d/root/%s", pid, path) // TODO: This will crack open the exe with debug.elf and read symbols, we should @@ -286,6 +318,9 @@ func (impl *ebpfMapsImpl) AttachUSDTProbes(pid libpf.PID, path, multiProgName st useMulti := util.HasMultiUprobeSupport() + // Determine PID for attachment + attachPID := 0 + // If multiProgName is empty or multi-probe not supported, use individual programs (one per probe) if multiProgName == "" || !useMulti { if singleProgNames == nil { @@ -326,7 +361,7 @@ func (impl *ebpfMapsImpl) AttachUSDTProbes(pid libpf.PID, path, multiProgName st uprobeOpts := &link.UprobeOptions{ Address: probe.Location, RefCtrOffset: probe.SemaphoreOffset, - PID: int(pid), + PID: int(attachPID), } // Set cookie if provided @@ -348,7 +383,11 @@ func (impl *ebpfMapsImpl) AttachUSDTProbes(pid libpf.PID, path, multiProgName st } log.Infof("Attached %d individual probes to %s in PID %d", len(links), path, pid) - return &linkCloser{detachLink: links}, nil + return &linkCloser{ + unloadLink: links, + unloadSpecIDs: specIDs, + specMap: impl.usdtSpecsMap, + }, nil } prog := impl.userProgs[multiProgName] @@ -363,12 +402,6 @@ func (impl *ebpfMapsImpl) AttachUSDTProbes(pid libpf.PID, path, multiProgName st return nil, errors.New("attaching multiple probes requires multi support (kernel 6.6+)") } - // Determine PID for attachment - attachPID := int(pid) - if probeAll { - attachPID = 0 // 0 means all processes - } - // Single probe with single program - use single uprobe if 
len(probes) == 1 { uprobeOpts := &link.UprobeOptions{ @@ -376,7 +409,7 @@ func (impl *ebpfMapsImpl) AttachUSDTProbes(pid libpf.PID, path, multiProgName st RefCtrOffset: probes[0].SemaphoreOffset, PID: attachPID, } - if finalCookies != nil && len(finalCookies) > 0 { + if len(finalCookies) > 0 { uprobeOpts.Cookie = finalCookies[0] } @@ -386,7 +419,11 @@ func (impl *ebpfMapsImpl) AttachUSDTProbes(pid libpf.PID, path, multiProgName st probes[0].Name, probes[0].Location, err) } log.Infof("Attached probe %s to usdt %s in PID %d", multiProgName, path, pid) - return &linkCloser{unloadLink: l}, nil + return &linkCloser{ + unloadLink: []link.Link{l}, + unloadSpecIDs: specIDs, + specMap: impl.usdtSpecsMap, + }, nil } // Multiple probes - use UprobeMulti @@ -407,7 +444,11 @@ func (impl *ebpfMapsImpl) AttachUSDTProbes(pid libpf.PID, path, multiProgName st } log.Infof("Attached probe %s to usdt %s in PID %d", multiProgName, path, pid) - return &linkCloser{unloadLink: lnk}, nil + return &linkCloser{ + unloadLink: []link.Link{lnk}, + unloadSpecIDs: specIDs, + specMap: impl.usdtSpecsMap, + }, nil } // loadProgram loads an eBPF program from progSpec and populates the related maps. @@ -475,27 +516,38 @@ func (impl *ebpfMapsImpl) AttachUprobe(pid libpf.PID, path string, offset uint64 impl.userProgs = make(map[string]*cebpf.Program) } + useMulti := util.HasMultiUprobeSupport() // Load the program if not already loaded prog := impl.userProgs[progName] if prog == nil { - if loadErr := impl.loadUSDTProgram(progName, false); loadErr != nil { + if loadErr := impl.loadUSDTProgram(progName, useMulti); loadErr != nil { return nil, loadErr } prog = impl.userProgs[progName] } - // Attach the uprobe - lnk, err := exe.Uprobe("", prog, &link.UprobeOptions{ - Address: offset, - PID: int(pid), - }) - if err != nil { - return nil, fmt.Errorf("failed to attach uprobe to %s at offset 0x%x: %w", - path, offset, err) + var lnk link.Link + if useMulti { + // Attach uprobe with multi support + lnk, err = exe.UprobeMulti([]string{progName}, prog, &link.UprobeMultiOptions{ + Addresses: []uint64{offset}, + }) + if err != nil { + return nil, fmt.Errorf("failed to attach uprobe-multi to %s at offset 0x%x: %w", + path, offset, err) + } + } else { + // Attach the uprobe + lnk, err = exe.Uprobe("", prog, &link.UprobeOptions{ + Address: offset, + }) + if err != nil { + return nil, fmt.Errorf("failed to attach uprobe to %s at offset 0x%x: %w", + path, offset, err) + } } - log.Infof("Attached uprobe %s to %s at offset 0x%x in PID %d", progName, path, offset, pid) - return &linkCloser{detachLink: []link.Link{lnk}}, nil + return &linkCloser{unloadLink: []link.Link{lnk}}, nil } func (impl *ebpfMapsImpl) CoredumpTest() bool { diff --git a/processmanager/execinfomanager/manager.go b/processmanager/execinfomanager/manager.go index 4cc4a3169..92f7314c0 100644 --- a/processmanager/execinfomanager/manager.go +++ b/processmanager/execinfomanager/manager.go @@ -140,10 +140,18 @@ func NewExecutableInfoManager( if includeTracers.Has(types.Labels) { interpreterLoaders = append(interpreterLoaders, golabels.Loader, customlabels.Loader) } - interpreterLoaders = append(interpreterLoaders, oomwatcher.Loader, rtld.Loader) + if includeTracers.Has(types.RTLD) { + interpreterLoaders = append(interpreterLoaders, rtld.Loader) + } + interpreterLoaders = append(interpreterLoaders, oomwatcher.Loader) if includeTracers.Has(types.CUDATracer) { - interpreterLoaders = append(interpreterLoaders, gpu.Loader) + // USDT support requires cookies + if util.HasBpfGetAttachCookie() 
{ + interpreterLoaders = append(interpreterLoaders, gpu.Loader) + } else { + log.Warn("CUDA USDT tracing is not supported on this kernel (missing bpf_get_attach_cookie)") + } } deferredFileIDs, err := lru.NewSynced[host.FileID, libpf.Void](deferredFileIDSize, diff --git a/support/usdt/test/usdt_integration_test.go b/support/usdt/test/usdt_integration_test.go index 72792800e..7389f5092 100644 --- a/support/usdt/test/usdt_integration_test.go +++ b/support/usdt/test/usdt_integration_test.go @@ -21,6 +21,7 @@ import ( "go.opentelemetry.io/ebpf-profiler/reporter" "go.opentelemetry.io/ebpf-profiler/tracer" tracertypes "go.opentelemetry.io/ebpf-profiler/tracer/types" + "go.opentelemetry.io/ebpf-profiler/util" ) type mockIntervals struct{} @@ -36,15 +37,15 @@ func (mockReporter) ExecutableMetadata(_ *reporter.ExecutableMetadataArgs) {} // testSetup encapsulates all the common test setup type testSetup struct { - t *testing.T - testBinary string - testProbes map[string]pfelf.USDTProbe - probeList []pfelf.USDTProbe - tracer *tracer.Tracer - ebpfHandler interpreter.EbpfHandler - resultsMap *cebpf.Map - ctx context.Context - cancelFunc context.CancelFunc + t *testing.T + testBinary string + testProbes map[string]pfelf.USDTProbe + probeList []pfelf.USDTProbe + tracer *tracer.Tracer + ebpfHandler interpreter.EbpfHandler + resultsMap *cebpf.Map + ctx context.Context + cancelFunc context.CancelFunc } // setupTest performs all common initialization for USDT integration tests @@ -53,6 +54,10 @@ func setupTest(t *testing.T) *testSetup { t.Skip("This test requires root privileges to load eBPF programs") } + if !util.HasBpfGetAttachCookie() { + t.Skip("This test requires kernel support for bpf_get_attach_cookie") + } + // Get the test binary path testBinary, err := os.Executable() if err != nil { @@ -218,14 +223,14 @@ func TestUSDTProbeWithEBPFSingle(t *testing.T) { // Individual program names for each probe progNames := []string{ - "usdt_simple_probe", - "usdt_memory_probe", - "usdt_const_probe", - "usdt_mixed_probe", - "usdt_int32_args", - "usdt_int64_args", - "usdt_mixed_refs", - "usdt_uint8_args", + "simple_probe", + "memory_probe", + "const_probe", + "mixed_probe", + "int32_args", + "int64_args", + "mixed_refs", + "uint8_args", } // Attach USDT probes with individual programs @@ -237,12 +242,11 @@ func TestUSDTProbeWithEBPFSingle(t *testing.T) { setup.probeList, nil, // no user cookies, just spec IDs progNames, - false, // attach to current PID only ) if err != nil { t.Fatalf("failed to attach USDT probes: %v", err) } - defer lc.Detach() + defer lc.Unload() // Log what was attached for i, probe := range setup.probeList { @@ -260,6 +264,10 @@ func TestUSDTProbeWithEBPFMulti(t *testing.T) { setup := setupTest(t) defer setup.cleanup() + if !util.HasMultiUprobeSupport() { + t.Skip("This test requires kernel support for uprobe multi-attach") + } + // Use probe IDs (1-8) as cookies for dispatch in the multi-probe program cookies := []uint64{1, 2, 3, 4, 5, 6, 7, 8} @@ -272,12 +280,11 @@ func TestUSDTProbeWithEBPFMulti(t *testing.T) { setup.probeList, cookies, // cookies for dispatch (probe IDs 1-8) nil, // no individual programs - false, // attach to current PID only ) if err != nil { t.Fatalf("failed to attach USDT probes: %v", err) } - defer lc.Detach() + defer lc.Unload() // Log what was attached t.Logf("Attached multi-probe program usdt_test_multi to %d probes", len(setup.probeList)) diff --git a/test/distro-qemu/build-and-run.sh b/test/distro-qemu/build-and-run.sh index 7334a2481..956ac6703 100755 --- 
a/test/distro-qemu/build-and-run.sh +++ b/test/distro-qemu/build-and-run.sh @@ -14,6 +14,16 @@ CACHE_DIR="${CACHE_DIR:-/tmp/debootstrap-cache}" echo "Building rootfs with $DISTRO $RELEASE..." # Clean up previous builds +# First, unmount any leftover mounts from previous debootstrap runs +if [ -d "$ROOTFS_DIR" ]; then + echo "Cleaning up any mounted filesystems in $ROOTFS_DIR..." + # Find all mount points under ROOTFS_DIR and unmount them in reverse order (deepest first) + findmnt -o TARGET -n -l | grep "^$(pwd)/$ROOTFS_DIR" | sort -r | while read -r mountpoint; do + echo " Unmounting $mountpoint" + sudo umount "$mountpoint" || sudo umount -l "$mountpoint" || true + done +fi + sudo rm -rf "$ROOTFS_DIR" "$OUTPUT_DIR" mkdir -p "$ROOTFS_DIR" "$OUTPUT_DIR" "$CACHE_DIR" @@ -85,11 +95,16 @@ if [[ "${USE_DOCKER}" == "1" ]] && command -v docker &> /dev/null; then wget -q https://go.dev/dl/go1.24.7.linux-${GOARCH}.tar.gz && \ tar -C /usr/local -xzf go1.24.7.linux-${GOARCH}.tar.gz && \ export PATH=/usr/local/go/bin:\$PATH && \ - CGO_ENABLED=1 go test -c ../../interpreter/rtld ../../support/usdt" + CGO_ENABLED=1 go test -c ../../interpreter/rtld ../../support/usdt/test" else # Local build with cross-compilation if needed echo "Building locally for ${GOARCH}..." - CGO_ENABLED=1 GOARCH=${GOARCH} go test -c ../../interpreter/rtld ../../support/usdt + if [ "$GOARCH" = "arm64" ]; then + # Cross-compile for ARM64 using aarch64-linux-gnu-gcc + CGO_ENABLED=1 GOARCH=${GOARCH} CC=aarch64-linux-gnu-gcc go test -c ../../interpreter/rtld ../../support/usdt/test + else + CGO_ENABLED=1 GOARCH=${GOARCH} go test -c ../../interpreter/rtld ../../support/usdt/test + fi fi # Copy test binary into rootfs @@ -129,7 +144,7 @@ export DEBUG_TEST=1 # Run the tests echo "" -/rtld.test -test.v && /usdt.test -test.v +/rtld.test -test.v && /test.test -test.v RESULT=$? if [ $RESULT -eq 0 ]; then @@ -204,7 +219,8 @@ echo "" echo "===== Starting QEMU with kernel ${KERNEL_VERSION} on ${QEMU_ARCH} =====" echo "" -# Run QEMU +# Run QEMU and capture output +QEMU_OUTPUT=$(mktemp) ${sudo} qemu-system-${QEMU_ARCH} ${additionalQemuArgs} \ -nographic \ -monitor none \ @@ -214,15 +230,28 @@ ${sudo} qemu-system-${QEMU_ARCH} ${additionalQemuArgs} \ -initrd "$OUTPUT_DIR/initramfs.gz" \ -append "${CONSOLE_ARG} init=/init quiet loglevel=3" \ -no-reboot \ - -display none - -EXIT_CODE=$? 
+ -display none \ + | tee "$QEMU_OUTPUT" -# QEMU with sysrq poweroff returns 0 on clean shutdown -if [ $EXIT_CODE -eq 0 ]; then +# Parse output for test result +if grep -q "===== TEST PASSED =====" "$QEMU_OUTPUT"; then + rm -f "$QEMU_OUTPUT" + echo "" echo "✅ Test completed successfully" exit 0 +elif grep -q "===== TEST FAILED" "$QEMU_OUTPUT"; then + rm -f "$QEMU_OUTPUT" + echo "" + echo "❌ Test failed" + exit 1 +elif grep -q "===== TEST TIMED OUT =====" "$QEMU_OUTPUT"; then + rm -f "$QEMU_OUTPUT" + echo "" + echo "❌ Test timed out" + exit 124 else - echo "❌ Test failed with QEMU exit code $EXIT_CODE" - exit $EXIT_CODE + rm -f "$QEMU_OUTPUT" + echo "" + echo "❌ Could not determine test result (QEMU may have crashed)" + exit 2 fi \ No newline at end of file diff --git a/tools/coredump/ebpfmaps.go b/tools/coredump/ebpfmaps.go index 548f2bf0d..ccf58a0b0 100644 --- a/tools/coredump/ebpfmaps.go +++ b/tools/coredump/ebpfmaps.go @@ -266,7 +266,7 @@ func (emc *ebpfMapsCoredump) SupportsLPMTrieBatchOperations() bool { } func (emc *ebpfMapsCoredump) AttachUSDTProbes(_ libpf.PID, _, _ string, _ []pfelf.USDTProbe, - _ []uint64, _ []string, _ bool) (interpreter.LinkCloser, error) { + _ []uint64, _ []string) (interpreter.LinkCloser, error) { return nil, nil } diff --git a/tracer/tracer.go b/tracer/tracer.go index 3c5975ae7..1a3b010fa 100644 --- a/tracer/tracer.go +++ b/tracer/tracer.go @@ -1204,3 +1204,7 @@ func (t *Tracer) GetEbpfHandler() interpreter.EbpfHandler { func (t *Tracer) GetInterpretersForPID(pid libpf.PID) []interpreter.Instance { return t.processManager.GetInterpretersForPID(pid) } + +func (t *Tracer) ForceProcessPID(pid libpf.PID) { + t.pidEvents <- libpf.PIDTID(uint64(pid) + uint64(pid)<<32) +} diff --git a/tracer/types/parse.go b/tracer/types/parse.go index 775cde3d4..0295185b2 100644 --- a/tracer/types/parse.go +++ b/tracer/types/parse.go @@ -25,6 +25,7 @@ const ( LuaJITTracer GoTracer Labels + RTLD CUDATracer // maxTracers indicates the max. number of different tracers @@ -42,6 +43,7 @@ var tracerTypeToName = map[tracerType]string{ LuaJITTracer: "luajit", GoTracer: "go", Labels: "labels", + RTLD: "rtld", CUDATracer: "cuda", } diff --git a/util/util.go b/util/util.go index 375c87aba..e64db0aac 100644 --- a/util/util.go +++ b/util/util.go @@ -5,6 +5,7 @@ package util // import "go.opentelemetry.io/ebpf-profiler/util" import ( "bytes" + "errors" "fmt" "math/bits" "strings" @@ -16,6 +17,7 @@ import ( "github.com/cilium/ebpf" "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/link" log "github.com/sirupsen/logrus" "go.opentelemetry.io/ebpf-profiler/libpf/hash" "golang.org/x/sys/unix" @@ -106,6 +108,9 @@ var ( // multiUprobeSupportCache caches the result of probing for multi-uprobe support multiUprobeSupportOnce sync.Once multiUprobeSupportCached bool + // bpfGetAttachCookieCache caches the result of probing for bpf_get_attach_cookie support + bpfGetAttachCookieOnce sync.Once + bpfGetAttachCookieCached bool ) // SetTestOnlyMultiUprobeSupport overrides HasMultiUprobeSupport for testing. @@ -145,10 +150,99 @@ func probeBpfGetAttachCookie() bool { return true } +// HasBpfGetAttachCookie checks if the kernel supports the bpf_get_attach_cookie helper. +// This function uses a cached, once-calculated value for performance. +// +// Note: This function requires CAP_BPF or CAP_SYS_ADMIN capabilities to load the probe +// program. The profiler should already have these privileges. 
+func HasBpfGetAttachCookie() bool { + bpfGetAttachCookieOnce.Do(func() { + bpfGetAttachCookieCached = probeBpfGetAttachCookie() + }) + + return bpfGetAttachCookieCached +} + +// probeBpfUprobeMultiLink probes for uprobe_multi link support by attempting to create +// an invalid uprobe_multi link. This is modeled after libbpf's probe_uprobe_multi_link. +// +// The probe works in two steps: +// 1. Try to create a link to "/" (invalid binary) which should fail with EBADF if supported +// 2. Verify PID filtering works correctly by testing with pid=-1 (should fail with EINVAL) +// +// The second check is important because early kernel versions had broken PID filtering +// (they did thread filtering instead of process filtering). +func probeBpfUprobeMultiLink() bool { + // Create a minimal program with BPF_TRACE_UPROBE_MULTI expected attach type + insns := asm.Instructions{ + asm.Mov.Imm(asm.R0, 0), + asm.Return(), + } + + spec := &ebpf.ProgramSpec{ + Type: ebpf.Kprobe, + Instructions: insns, + License: "GPL", + AttachType: ebpf.AttachTraceUprobeMulti, + AttachTo: "", + } + + prog, err := ebpf.NewProgramWithOptions(spec, ebpf.ProgramOptions{ + LogDisabled: true, + }) + if err != nil { + return false + } + defer func() { + if err := prog.Close(); err != nil { + log.Warnf("Failed to close probe program: %v", err) + } + }() + + // Creating uprobe in '/' binary should fail with EBADF if uprobe_multi is supported + ex, err := link.OpenExecutable("/") + if err != nil { + return false + } + + offset := uint64(0) + opts := &link.UprobeMultiOptions{ + Addresses: []uint64{offset}, + } + + lnk, err := ex.UprobeMulti(nil, prog, opts) + if err == nil { + // Unexpectedly succeeded, clean up and return false + _ = lnk.Close() + return false + } + // Check if we got EBADF (expected error for invalid binary with uprobe_multi support) + if !errors.Is(err, unix.EBADF) { + return false + } + + // Verify PID filtering works correctly. Initial multi-uprobe support in kernel + // didn't handle PID filtering correctly (it was doing thread filtering, not process + // filtering). We need to be conservative here because multi-uprobe selection happens + // early at load time, while the use of PID filtering is known late at attachment time. + // + // Creating uprobe with pid == -1 (invalid PID) for '/' binary should fail with EINVAL + // on kernels with fixed PID filtering logic; otherwise ESRCH or EBADF would be returned. + opts.PID = ^uint32(0) // -1 as unsigned + lnk, err = ex.UprobeMulti(nil, prog, opts) + if err == nil { + // Unexpectedly succeeded, clean up and return false + _ = lnk.Close() + return false + } + + // We expect EINVAL for invalid PID on kernels with proper PID filtering + return errors.Is(err, unix.EINVAL) +} + // HasMultiUprobeSupport checks if the kernel supports uprobe multi-attach. // Multi-uprobes are needed because single-shot uprobes don't work for shared libraries. -// This function probes for bpf_get_attach_cookie support, which is required for -// multi-uprobes and was introduced alongside them in kernel 6.6. +// This function probes for uprobe_multi link support, which was introduced in kernel 6.6. // // Note: This function requires CAP_BPF or CAP_SYS_ADMIN capabilities to load the probe // program. The profiler should already have these privileges. @@ -158,7 +252,7 @@ func HasMultiUprobeSupport() bool { } multiUprobeSupportOnce.Do(func() { - multiUprobeSupportCached = probeBpfGetAttachCookie() + multiUprobeSupportCached = probeBpfUprobeMultiLink() }) return multiUprobeSupportCached