diff --git a/.github/workflows/debug-flaky-tests.yml b/.github/workflows/debug-flaky-tests.yml new file mode 100644 index 000000000..98c80de53 --- /dev/null +++ b/.github/workflows/debug-flaky-tests.yml @@ -0,0 +1,56 @@ +name: Debug flaky tests + +on: + workflow_dispatch: + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.sha }} + cancel-in-progress: true + +jobs: + test: + name: Test + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest + - windows-latest + test-java-version: + - 8 + - 11 + - 17 + - 21 + - 23 + run_id: [1, 2] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - id: setup-java-test + name: Set up Java ${{ matrix.test-java-version }} for tests + uses: actions/setup-java@c5195efecf7bdfc987ee8bae7a71cb8b11521c00 # v4.7.1 + with: + distribution: temurin + java-version: ${{ matrix.test-java-version }} + + - id: setup-java + name: Set up Java for build + uses: actions/setup-java@c5195efecf7bdfc987ee8bae7a71cb8b11521c00 # v4.7.1 + with: + distribution: temurin + java-version: 17 + + - name: Set up gradle + uses: gradle/actions/setup-gradle@ac638b010cf58a27ee6c972d7336334ccaf61c96 # v4.4.1 + with: + cache-read-only: ${{ github.event_name == 'pull_request' }} + - name: Gradle test + run: > + ./gradlew test --no-build-cache + "-PtestJavaVersion=${{ matrix.test-java-version }}" + "-Porg.gradle.java.installations.paths=${{ steps.setup-java-test.outputs.path }}" + "-Porg.gradle.java.installations.auto-download=false" diff --git a/.github/workflows/test-fix-verification.yml b/.github/workflows/test-fix-verification.yml new file mode 100644 index 000000000..ea42a99bb --- /dev/null +++ b/.github/workflows/test-fix-verification.yml @@ -0,0 +1,59 @@ +name: Verify OpAMP Client Fix + +on: + workflow_dispatch: + push: + branches: [ flaky ] + +jobs: + verify-fix: + runs-on: ubuntu-latest + strategy: + matrix: + java: [17, 21] + iteration: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + + steps: + - uses: actions/checkout@v4 + + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@v4 + with: + java-version: ${{ matrix.java }} + distribution: 'temurin' + + - name: Setup Gradle + uses: gradle/gradle-build-action@v3 + with: + cache-read-only: true + + - name: Test the fixed OpAMP client test (Iteration ${{ matrix.iteration }}) + run: | + echo "=== Testing OpAMP client - Java ${{ matrix.java }}, Iteration ${{ matrix.iteration }} ===" + ./gradlew opamp-client:test --tests "*whenServerProvidesNewInstanceUid_useIt*" --rerun-tasks --info + echo "SUCCESS: Iteration ${{ matrix.iteration }} passed" + + - name: Upload test results on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: test-results-java${{ matrix.java }}-iter${{ matrix.iteration }} + path: | + opamp-client/build/reports/ + opamp-client/build/test-results/ + retention-days: 3 + + summary: + needs: verify-fix + runs-on: ubuntu-latest + if: always() + steps: + - name: Check results + run: | + if [ "${{ needs.verify-fix.result }}" == "success" ]; then + echo "✅ ALL TESTS PASSED! The OpAMP client fix is verified." + echo "🎉 Ran 20 iterations across Java 17 and 21 - no flakiness detected!" + else + echo "❌ Some tests failed. The fix may need more work." + exit 1 + fi diff --git a/README.md b/README.md index 080bd8804..8639da516 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # OpenTelemetry Java Contrib + [![Release](https://img.shields.io/github/v/release/open-telemetry/opentelemetry-java-contrib?include_prereleases&style=)](https://github.com/open-telemetry/opentelemetry-java-contrib/releases/) [![FOSSA License Status](https://app.fossa.com/api/projects/custom%2B162%2Fgithub.com%2Fopen-telemetry%2Fopentelemetry-java-contrib.svg?type=shield&issueType=license)](https://app.fossa.com/projects/custom%2B162%2Fgithub.com%2Fopen-telemetry%2Fopentelemetry-java-contrib?ref=badge_shield&issueType=license) [![FOSSA Security Status](https://app.fossa.com/api/projects/custom%2B162%2Fgithub.com%2Fopen-telemetry%2Fopentelemetry-java-contrib.svg?type=shield&issueType=security)](https://app.fossa.com/projects/custom%2B162%2Fgithub.com%2Fopen-telemetry%2Fopentelemetry-java-contrib?ref=badge_shield&issueType=security) diff --git a/opamp-client/src/test/java/io/opentelemetry/opamp/client/internal/impl/OpampClientImplTest.java b/opamp-client/src/test/java/io/opentelemetry/opamp/client/internal/impl/OpampClientImplTest.java index ec630dc7e..33cc27260 100644 --- a/opamp-client/src/test/java/io/opentelemetry/opamp/client/internal/impl/OpampClientImplTest.java +++ b/opamp-client/src/test/java/io/opentelemetry/opamp/client/internal/impl/OpampClientImplTest.java @@ -60,6 +60,7 @@ import org.mockito.junit.jupiter.MockitoExtension; @ExtendWith(MockitoExtension.class) +@SuppressWarnings({"CatchAndPrintStackTrace", "SystemOut", "CatchingUnchecked", "InterruptedExceptionSwallowed"}) class OpampClientImplTest { private RequestService requestService; private OpampClientState state; @@ -90,7 +91,22 @@ void setUp() { @AfterEach void tearDown() { - client.stop(); + if (client != null) { + try { + client.stop(); + } catch (Exception e) { + e.printStackTrace(); + } + } + + // Clear any remaining server requests + try { + while (server.takeRequest(100, TimeUnit.MILLISECONDS) != null) { + // Clear queue + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } } @Test @@ -209,7 +225,8 @@ void onSuccess_withChangesToReport_notifyCallbackOnMessage() { requestService.sendRequest(); // Await for onMessage call - await().atMost(Duration.ofSeconds(1)).until(() -> callbacks.onMessageCalls.get() == 1); + await().atMost(Duration.ofSeconds(10)).pollInterval(Duration.ofMillis(100)) + .until(() -> callbacks.onMessageCalls.get() == 1); verify(callbacks).onMessage(MessageData.builder().setRemoteConfig(remoteConfig).build()); } @@ -224,7 +241,8 @@ void onSuccess_withNoChangesToReport_doNotNotifyCallbackOnMessage() { requestService.sendRequest(); // Giving some time for the callback to get called - await().during(Duration.ofSeconds(1)); + await().during(Duration.ofSeconds(3)).pollInterval(Duration.ofMillis(100)) + .untilAsserted(() -> assertThat(callbacks.onMessageCalls.get()).isEqualTo(0)); verify(callbacks, never()).onMessage(any()); } @@ -267,7 +285,8 @@ void verifyRemoteConfigStatusSetter() { void onConnectionSuccessful_notifyCallback() { initializeClient(); - await().atMost(Duration.ofSeconds(1)).until(() -> callbacks.onConnectCalls.get() == 1); + await().atMost(Duration.ofSeconds(10)).pollInterval(Duration.ofMillis(100)) + .until(() -> callbacks.onConnectCalls.get() == 1); verify(callbacks).onConnect(); verify(callbacks, never()).onConnectFailed(any()); @@ -311,7 +330,8 @@ void onFailedResponse_withServerErrorData_notifyCallback() { // Force request requestService.sendRequest(); - await().atMost(Duration.ofSeconds(1)).until(() -> callbacks.onErrorResponseCalls.get() == 1); + await().atMost(Duration.ofSeconds(10)).pollInterval(Duration.ofMillis(100)) + .until(() -> callbacks.onErrorResponseCalls.get() == 1); verify(callbacks).onErrorResponse(errorResponse); verify(callbacks, never()).onMessage(any()); @@ -332,7 +352,7 @@ void whenServerProvidesNewInstanceUid_useIt() { initializeClient(); byte[] initialUid = state.instanceUid.get(); - byte[] serverProvidedUid = new byte[] {1, 2, 3}; + byte[] serverProvidedUid = new byte[] {4, 5, 6}; ServerToAgent response = new ServerToAgent.Builder() .agent_identification( @@ -344,11 +364,67 @@ void whenServerProvidesNewInstanceUid_useIt() { enqueueServerToAgentResponse(response); requestService.sendRequest(); - await().atMost(Duration.ofSeconds(1)).until(() -> state.instanceUid.get() != initialUid); + await().atMost(Duration.ofSeconds(10)).pollInterval(Duration.ofMillis(100)) + .until(() -> { + byte[] currentUid = state.instanceUid.get(); + return !java.util.Arrays.equals(currentUid, initialUid); + }); assertThat(state.instanceUid.get()).isEqualTo(serverProvidedUid); } + @Test + void flakiness_stress_test_all_timing_operations() { + for (int i = 1; i <= 10; i++) { + try { + // Test connection callback timing + initializeClient(); + await().atMost(Duration.ofSeconds(5)).pollInterval(Duration.ofMillis(50)) + .until(() -> callbacks.onConnectCalls.get() == 1); + + // Test message callback timing + AgentRemoteConfig remoteConfig = + new AgentRemoteConfig.Builder() + .config(createAgentConfigMap("key" + i, "value" + i)) + .build(); + ServerToAgent serverToAgent = new ServerToAgent.Builder().remote_config(remoteConfig).build(); + enqueueServerToAgentResponse(serverToAgent); + + int beforeCalls = callbacks.onMessageCalls.get(); + requestService.sendRequest(); + + await().atMost(Duration.ofSeconds(5)).pollInterval(Duration.ofMillis(50)) + .until(() -> callbacks.onMessageCalls.get() > beforeCalls); + + // Test instance UID update timing + byte[] newUid = new byte[] {(byte)i, (byte)(i+1), (byte)(i+2)}; + ServerToAgent uidResponse = + new ServerToAgent.Builder() + .agent_identification( + new AgentIdentification.Builder() + .new_instance_uid(ByteString.of(newUid)) + .build()) + .build(); + + enqueueServerToAgentResponse(uidResponse); + byte[] beforeUid = state.instanceUid.get(); + requestService.sendRequest(); + + await().atMost(Duration.ofSeconds(5)).pollInterval(Duration.ofMillis(50)) + .until(() -> !java.util.Arrays.equals(state.instanceUid.get(), beforeUid)); + + // Force cleanup between iterations - ensure client is stopped before next iteration + if (client != null) { + client.stop(); + } + Thread.sleep(50); // Small delay to ensure cleanup + + } catch (Exception e) { + throw new RuntimeException("Stress test failed at iteration " + i, e); + } + } + } + private static AgentToServer getAgentToServerMessage(RecordedRequest request) { try { return AgentToServer.ADAPTER.decode(Objects.requireNonNull(request.getBody())); @@ -359,7 +435,7 @@ private static AgentToServer getAgentToServerMessage(RecordedRequest request) { private RecordedRequest takeRequest() { try { - return server.takeRequest(1, TimeUnit.SECONDS); + return server.takeRequest(5, TimeUnit.SECONDS); } catch (InterruptedException e) { throw new RuntimeException(e); }