Skip to content

Commit b384fcf

Browse files
authored
Fix Lambda canary trace validation failure (#345)
*Issue description:* When searching for traces with the `GetTraceSummaries` API, X-Ray does not return Lambda traces if the search end time is set far away from the actual trace end time. Right now I am not sure whether this is an AWS SDK X-Ray client issue or an X-Ray service issue. This PR is a workaround: it moves the trace validation step earlier, so that the search end time (whose value is the current time) is close to the actual trace end time. The X-Ray service can then return the Lambda trace, and the trace validation passes. *Description of changes:* *Rollback procedure:* <Can we safely revert this commit if needed? If not, detail what must be done to safely revert and why it is needed.> *Ensure you've run the following tests on your changes and include the link below:* To do so, create a `test.yml` file with `name: Test` and workflow description to test your changes, then remove the file for your PR. Link your test run in your PR description. This process is a short term solution while we work on creating a staging environment for testing. NOTE: TESTS RUNNING ON A SINGLE EKS CLUSTER CANNOT BE RUN IN PARALLEL. See the [needs](https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idneeds) keyword to run tests in succession. - Run Java EKS on `e2e-playground` in us-east-1 and eu-central-2 - Run Python EKS on `e2e-playground` in us-east-1 and eu-central-2 - Run metric limiter on EKS cluster `e2e-playground` in us-east-1 and eu-central-2 - Run EC2 tests in all regions - Run K8s on a separate K8s cluster (check IAD test account for master node endpoints; these will change as we create and destroy clusters for OS patching) By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license.
1 parent 926d53f commit b384fcf

File tree

4 files changed

+27
-30
lines changed

4 files changed

+27
-30
lines changed

.github/workflows/node-lambda-test.yml

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -148,12 +148,11 @@ jobs:
148148
shell: bash
149149
run: sleep 30s; curl -sS ${{ env.API_GATEWAY_URL }}
150150

151-
# Validation for pulse telemetry data
152-
- name: Validate generated EMF logs
153-
id: log-validation
151+
- name: Validate generated traces
152+
id: trace-validation
154153
# will be removed after data quality bug fixed
155154
continue-on-error: true
156-
run: ./gradlew validator:run --args='-c node/lambda/log-validation.yml
155+
run: ./gradlew validator:run --args='-c node/lambda/trace-validation.yml
157156
--testing-id ${{ env.TESTING_ID }}
158157
--endpoint http://${{ env.API_GATEWAY_URL }}
159158
--region ${{ inputs.aws-region }}
@@ -163,12 +162,13 @@ jobs:
163162
--service-name ${{ env.TERRAFORM_LAMBDA_FUNCTION_NAME }}
164163
--rollup'
165164

166-
- name: Validate generated metrics
167-
id: metric-validation
168-
if: (success() || steps.log-validation.outcome == 'failure') && !cancelled()
165+
# Validation for pulse telemetry data
166+
- name: Validate generated EMF logs
167+
id: log-validation
168+
if: (success() || steps.trace-validation.outcome == 'failure') && !cancelled()
169169
# will be removed after data quality bug fixed
170170
continue-on-error: true
171-
run: ./gradlew validator:run --args='-c node/lambda/metric-validation.yml
171+
run: ./gradlew validator:run --args='-c node/lambda/log-validation.yml
172172
--testing-id ${{ env.TESTING_ID }}
173173
--endpoint http://${{ env.API_GATEWAY_URL }}
174174
--region ${{ inputs.aws-region }}
@@ -178,12 +178,12 @@ jobs:
178178
--service-name ${{ env.TERRAFORM_LAMBDA_FUNCTION_NAME }}
179179
--rollup'
180180

181-
- name: Validate generated traces
182-
id: trace-validation
183-
if: (success() || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 'failure') && !cancelled()
181+
- name: Validate generated metrics
182+
id: metric-validation
183+
if: (success() || steps.trace-validation.outcome == 'failure' || steps.log-validation.outcome == 'failure') && !cancelled()
184184
# will be removed after data quality bug fixed
185185
continue-on-error: true
186-
run: ./gradlew validator:run --args='-c node/lambda/trace-validation.yml
186+
run: ./gradlew validator:run --args='-c node/lambda/metric-validation.yml
187187
--testing-id ${{ env.TESTING_ID }}
188188
--endpoint http://${{ env.API_GATEWAY_URL }}
189189
--region ${{ inputs.aws-region }}

.github/workflows/python-lambda-test.yml

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -174,12 +174,11 @@ jobs:
174174
max_retry: 3
175175
sleep_time: 60
176176

177-
# Validation for pulse telemetry data
178-
- name: Validate generated EMF logs
179-
id: log-validation
177+
- name: Validate generated traces
178+
id: trace-validation
180179
# will be removed after data quality bug fixed
181180
continue-on-error: true
182-
run: ./gradlew validator:run --args='-c python/lambda/log-validation.yml
181+
run: ./gradlew validator:run --args='-c python/lambda/trace-validation.yml
183182
--testing-id ${{ env.TESTING_ID }}
184183
--endpoint http://${{ env.API_GATEWAY_URL }}
185184
--region ${{ inputs.aws-region }}
@@ -189,12 +188,13 @@ jobs:
189188
--service-name ${{ env.TERRAFORM_LAMBDA_FUNCTION_NAME }}
190189
--rollup'
191190

192-
- name: Validate generated metrics
193-
id: metric-validation
191+
# Validation for pulse telemetry data
192+
- name: Validate generated EMF logs
193+
id: log-validation
194+
if: (success() || steps.trace-validation.outcome == 'failure') && !cancelled()
194195
# will be removed after data quality bug fixed
195196
continue-on-error: true
196-
if: (success() || steps.log-validation.outcome == 'failure') && !cancelled()
197-
run: ./gradlew validator:run --args='-c python/lambda/metric-validation.yml
197+
run: ./gradlew validator:run --args='-c python/lambda/log-validation.yml
198198
--testing-id ${{ env.TESTING_ID }}
199199
--endpoint http://${{ env.API_GATEWAY_URL }}
200200
--region ${{ inputs.aws-region }}
@@ -204,12 +204,12 @@ jobs:
204204
--service-name ${{ env.TERRAFORM_LAMBDA_FUNCTION_NAME }}
205205
--rollup'
206206

207-
- name: Validate generated traces
208-
id: trace-validation
207+
- name: Validate generated metrics
208+
id: metric-validation
209209
# will be removed after data quality bug fixed
210210
continue-on-error: true
211-
if: (success() || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 'failure') && !cancelled()
212-
run: ./gradlew validator:run --args='-c python/lambda/trace-validation.yml
211+
if: (success() || steps.trace-validation.outcome == 'failure' || steps.log-validation.outcome == 'failure') && !cancelled()
212+
run: ./gradlew validator:run --args='-c python/lambda/metric-validation.yml
213213
--testing-id ${{ env.TESTING_ID }}
214214
--endpoint http://${{ env.API_GATEWAY_URL }}
215215
--region ${{ inputs.aws-region }}
@@ -246,4 +246,4 @@ jobs:
246246
-var="function_name=${{env.TERRAFORM_LAMBDA_FUNCTION_NAME}}" \
247247
-var="layer_artifacts_directory=${{ env.ARTIFACTS_DIR }}" \
248248
-var="region=${{ env.E2E_TEST_AWS_REGION }}" \
249-
-var="is_canary=${{ env.IS_CANARY }}"
249+
-var="is_canary=${{ env.IS_CANARY }}"

validator/src/main/java/com/amazon/aoc/enums/GenericConstants.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
@Getter
2121
public enum GenericConstants {
2222
// retry
23-
SLEEP_IN_MILLISECONDS("30000"), // ms
23+
SLEEP_IN_MILLISECONDS("10000"), // ms
2424
SLEEP_IN_SECONDS("30"),
2525
MAX_RETRIES("10"),
2626

validator/src/main/java/com/amazon/aoc/services/XRayService.java

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,11 @@
2525
import com.amazonaws.services.xray.model.TraceSummary;
2626
import java.util.Date;
2727
import java.util.List;
28-
import lombok.extern.log4j.Log4j2;
2928
import org.joda.time.DateTime;
3029

31-
@Log4j2
3230
public class XRayService {
3331
private AWSXRay awsxRay;
34-
private final int SEARCH_PERIOD = 600 * 3;
32+
private final int SEARCH_PERIOD = 600;
3533
public static String DEFAULT_TRACE_ID = "1-00000000-000000000000000000000000";
3634

3735
public XRayService(String region) {
@@ -55,7 +53,6 @@ public List<Trace> listTraceByIds(List<String> traceIdList) {
5553
public List<TraceSummary> searchTraces(String traceFilter) {
5654
Date currentDate = new Date();
5755
Date pastDate = new DateTime(currentDate).minusSeconds(SEARCH_PERIOD).toDate();
58-
log.info("--start-time: " + pastDate + ", --end-time: " + currentDate + ", traceFilter: " + traceFilter);
5956
GetTraceSummariesResult traceSummaryResult =
6057
awsxRay.getTraceSummaries(
6158
new GetTraceSummariesRequest()

0 commit comments

Comments
 (0)