Skip to content

Commit f1c0dac

Browse files
Implemented backoff for backend communication (#999)
* Implemented backoff for backend communication * Added unit tests for backend communication backoff * Added component test * Added diagnostics to production code part * Added comments to component test * Clarified log message * Removed isInFailedMode
1 parent f8c1d58 commit f1c0dac

33 files changed

+1072
-127
lines changed

src/ElasticApm/Impl/Clock.php

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,7 @@ final class Clock implements ClockInterface
4949

5050
public function __construct(LoggerFactory $loggerFactory)
5151
{
52-
$this->logger = $loggerFactory->loggerForClass(
53-
LogCategory::INFRASTRUCTURE,
54-
__NAMESPACE__,
55-
__CLASS__,
56-
__FILE__
57-
)->addContext('this', $this);
52+
$this->logger = $loggerFactory->loggerForClass(LogCategory::INFRASTRUCTURE, __NAMESPACE__, __CLASS__, __FILE__)->addContext('this', $this);
5853

5954
$this->hasMonotonicTimeSource = function_exists('hrtime');
6055
}

src/ElasticApm/Impl/ExecutionSegment.php

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,7 @@ protected function __construct(
149149
]
150150
);
151151
}
152-
$this->diffStartTimeWithSystemClockOnBeginInMicroseconds
153-
= TimeUtil::calcDurationInMicrosecondsClampNegativeToZero($this->timestamp, $systemClockNow);
152+
$this->diffStartTimeWithSystemClockOnBeginInMicroseconds = TimeUtil::calcDurationInMicrosecondsClampNegativeToZero($this->timestamp, $systemClockNow);
154153
$this->monotonicBeginTime = $monotonicClockNow;
155154
$this->traceId = $traceId;
156155
$this->setName($name);

src/ext/backend_comm.c

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "util.h"
3131
#include "util_for_PHP.h"
3232
#include "basic_macros.h"
33+
#include "backend_comm_backoff.h"
3334

3435
#define ELASTIC_APM_CURRENT_LOG_CATEGORY ELASTIC_APM_LOG_CATEGORY_BACKEND_COMM
3536

@@ -190,9 +191,10 @@ struct ConnectionData
190191
{
191192
CURL* curlHandle;
192193
struct curl_slist* requestHeaders;
194+
BackendCommBackoff backoff;
193195
};
194196
typedef struct ConnectionData ConnectionData;
195-
ConnectionData g_connectionData = { .curlHandle = NULL, .requestHeaders = NULL };
197+
ConnectionData g_connectionData = { .curlHandle = NULL, .requestHeaders = NULL, .backoff = ELASTIC_APM_DEFAULT_BACKEND_COMM_BACKOFF };
196198

197199
void cleanupConnectionData( ConnectionData* connectionData )
198200
{
@@ -465,11 +467,16 @@ ResultCode syncSendEventsToApmServerWithConn( const ConfigSnapshot* config, Conn
465467
ELASTIC_APM_SET_RESULT_CODE_AND_GOTO_FAILURE();
466468
}
467469

468-
long responseCode;
470+
long responseCode = 0;
469471
curl_easy_getinfo( connectionData->curlHandle, CURLINFO_RESPONSE_CODE, &responseCode );
470-
ELASTIC_APM_LOG_DEBUG( "Sent events to APM Server. Response HTTP code: %ld. URL: `%s'.", responseCode, url );
471-
472-
resultCode = resultSuccess;
472+
/**
473+
* If the HTTP response status code isn’t 2xx or if a request is prematurely closed (either on the TCP or HTTP level) the request MUST be considered failed.
474+
*
475+
* @see https://github.com/elastic/apm/blob/d8cb5607dbfffea819ab5efc9b0743044772fb23/specs/agents/transport.md#transport-errors
476+
*/
477+
bool isFailed = ( responseCode / 100 ) != 2;
478+
ELASTIC_APM_LOG_WITH_LEVEL( isFailed ? logLevel_error : logLevel_debug, "Sent events to APM Server. Response HTTP code: %ld. URL: `%s'.", responseCode, url );
479+
resultCode = isFailed ? resultFailure : resultSuccess;
473480
finally:
474481
ELASTIC_APM_LOG_DEBUG_RESULT_CODE_FUNCTION_EXIT();
475482
return resultCode;
@@ -505,19 +512,27 @@ ResultCode syncSendEventsToApmServer( const ConfigSnapshot* config, StringView u
505512
ELASTIC_APM_SET_RESULT_CODE_TO_SUCCESS_AND_GOTO_FINALLY();
506513
}
507514

515+
if ( backendCommBackoff_shouldWait( &connectionData->backoff ) )
516+
{
517+
ELASTIC_APM_LOG_DEBUG( "Backoff wait time has not elapsed yet - discarding events instead of sending" );
518+
ELASTIC_APM_SET_RESULT_CODE_TO_SUCCESS_AND_GOTO_FINALLY();
519+
}
520+
508521
if ( connectionData->curlHandle == NULL )
509522
{
510523
ELASTIC_APM_CALL_IF_FAILED_GOTO( initConnectionData( config, connectionData, userAgentHttpHeader ) );
511524
}
512525

513526
ELASTIC_APM_CALL_IF_FAILED_GOTO( syncSendEventsToApmServerWithConn( config, connectionData, serializedEvents ) );
527+
backendCommBackoff_onSuccess( &connectionData->backoff );
514528

515529
resultCode = resultSuccess;
516530
finally:
517531
ELASTIC_APM_LOG_DEBUG_RESULT_CODE_FUNCTION_EXIT();
518532
return resultCode;
519533

520534
failure:
535+
backendCommBackoff_onError( &connectionData->backoff );
521536
cleanupConnectionData( connectionData );
522537
goto finally;
523538
}

src/ext/backend_comm_backoff.c

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
/*
2+
* Licensed to Elasticsearch B.V. under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch B.V. licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "backend_comm_backoff.h"
21+
#include <stdlib.h>
22+
#include <math.h>
23+
#include "basic_macros.h"
24+
#include "log.h"
25+
#include "time_util.h"
26+
27+
#define ELASTIC_APM_CURRENT_LOG_CATEGORY ELASTIC_APM_LOG_CATEGORY_BACKEND_COMM
28+
29+
/**
30+
* Algorithm is based on Elastic APM agent spec's "Transport errors" section
31+
*
32+
* @see https://github.com/elastic/apm/blob/d8cb5607dbfffea819ab5efc9b0743044772fb23/specs/agents/transport.md#transport-errors
33+
*/
34+
35+
void backendCommBackoff_onSuccess( BackendCommBackoff* thisObj )
36+
{
37+
ELASTIC_APM_ASSERT_VALID_PTR( thisObj );
38+
39+
thisObj->errorCount = 0;
40+
thisObj->waitEndTime = (TimeSpec){ 0 };
41+
}
42+
43+
bool backendCommBackoff_getCurrentTime( BackendCommBackoff* thisObj, /* out */ TimeSpec* currentTime )
44+
{
45+
ResultCode resultCode;
46+
ELASTIC_APM_CALL_IF_FAILED_GOTO( getClockTimeSpec( /* isRealTime */ false, /* out */ currentTime ) );
47+
48+
resultCode = resultSuccess;
49+
finally:
50+
ELASTIC_APM_LOG_DEBUG_RESULT_CODE_FUNCTION_EXIT();
51+
return resultCode == resultSuccess;
52+
53+
failure:
54+
ELASTIC_APM_LOG_ERROR( "Failed to get current time" );
55+
goto finally;
56+
}
57+
58+
void backendCommBackoff_onError( BackendCommBackoff* thisObj )
59+
{
60+
ELASTIC_APM_ASSERT_VALID_PTR( thisObj );
61+
62+
ELASTIC_APM_LOG_DEBUG_FUNCTION_ENTRY();
63+
64+
/**
65+
* The grace period should be calculated in seconds using the algorithm min(reconnectCount++, 6) ** 2 ± 10%
66+
*
67+
* @see https://github.com/elastic/apm/blob/d8cb5607dbfffea819ab5efc9b0743044772fb23/specs/agents/transport.md#transport-errors
68+
*/
69+
enum { maxSequentialErrorsCount = 7 };
70+
71+
if ( thisObj->errorCount < maxSequentialErrorsCount )
72+
{
73+
++thisObj->errorCount;
74+
}
75+
76+
if ( ! backendCommBackoff_getCurrentTime( thisObj, /* out */ &thisObj->waitEndTime ) )
77+
{
78+
// If we cannot get current time we just reset the state to that of no errors
79+
backendCommBackoff_onSuccess( thisObj );
80+
return;
81+
}
82+
addDelayToAbsTimeSpec( /* in, out */ &thisObj->waitEndTime, /* delayInNanoseconds */ (long)backendCommBackoff_getTimeToWaitInSeconds( thisObj ) * ELASTIC_APM_NUMBER_OF_NANOSECONDS_IN_SECOND );
83+
}
84+
85+
int backendCommBackoff_convertRandomUIntToJitter( UInt randomVal, UInt jitterHalfRange )
86+
{
87+
double diff = randomVal - ( RAND_MAX / 2.0 );
88+
return ( diff >= 0 ? 1 : -1 ) * ( (int) floor( ( jitterHalfRange * fabs( diff ) ) / ( RAND_MAX / 2.0 ) ) );
89+
}
90+
91+
UInt backendCommBackoff_getTimeToWaitInSeconds( const BackendCommBackoff* thisObj )
92+
{
93+
ELASTIC_APM_ASSERT_VALID_PTR( thisObj );
94+
95+
if ( thisObj->errorCount == 0 )
96+
{
97+
return 0;
98+
}
99+
100+
UInt reconnectCount = ( thisObj->errorCount - 1);
101+
double timeToWaitWithoutJitter = pow( reconnectCount, 2 );
102+
double jitterHalfRange = timeToWaitWithoutJitter * 0.1;
103+
UInt jitter = jitterHalfRange < 1 ? 0 : backendCommBackoff_convertRandomUIntToJitter( thisObj->generateRandomUInt( thisObj->generateRandomUIntCtx ), (UInt) floor( jitterHalfRange ) );
104+
105+
return (int)( round( timeToWaitWithoutJitter ) ) + jitter;
106+
}
107+
108+
#pragma clang diagnostic push
109+
#pragma ide diagnostic ignored "UnusedParameter"
110+
UInt backendCommBackoff_defaultGenerateRandomUInt( void* ctx )
111+
#pragma clang diagnostic pop
112+
{
113+
return (UInt) rand(); // NOLINT(cert-msc50-cpp)
114+
}
115+
116+
bool backendCommBackoff_shouldWait( BackendCommBackoff* thisObj )
117+
{
118+
if ( thisObj->errorCount == 0 )
119+
{
120+
return false;
121+
}
122+
123+
TimeSpec currentTime;
124+
if ( ! backendCommBackoff_getCurrentTime( thisObj, /* out */ &currentTime ) )
125+
{
126+
// If we cannot get current time we just reset the state to that of no errors
127+
backendCommBackoff_onSuccess( thisObj );
128+
return false;
129+
}
130+
131+
if ( compareAbsTimeSpecs( &thisObj->waitEndTime, &currentTime ) <= 0 )
132+
{
133+
return false;
134+
}
135+
136+
char txtOutStreamBuf[ ELASTIC_APM_TEXT_OUTPUT_STREAM_ON_STACK_BUFFER_SIZE ];
137+
TextOutputStream txtOutStream = ELASTIC_APM_TEXT_OUTPUT_STREAM_FROM_STATIC_BUFFER( txtOutStreamBuf );
138+
ELASTIC_APM_LOG_TRACE( "Left to wait: %s, errorCount: %u", streamTimeSpecDiff( &currentTime, &thisObj->waitEndTime, &txtOutStream ), thisObj->errorCount );
139+
return true;
140+
}

src/ext/backend_comm_backoff.h

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
/*
2+
* Licensed to Elasticsearch B.V. under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch B.V. licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
#include "basic_types.h"
23+
#include "time_util.h"
24+
25+
typedef UInt (* GenerateRandomUInt )( void* ctx );
26+
27+
struct BackendCommBackoff
28+
{
29+
GenerateRandomUInt generateRandomUInt;
30+
void* generateRandomUIntCtx;
31+
UInt errorCount;
32+
TimeSpec waitEndTime;
33+
};
34+
typedef struct BackendCommBackoff BackendCommBackoff;
35+
36+
void backendCommBackoff_onSuccess( BackendCommBackoff* thisObj );
37+
void backendCommBackoff_onError( BackendCommBackoff* thisObj );
38+
bool backendCommBackoff_shouldWait( BackendCommBackoff* thisObj );
39+
40+
UInt backendCommBackoff_getTimeToWaitInSeconds( const BackendCommBackoff* thisObj );
41+
int backendCommBackoff_convertRandomUIntToJitter( UInt randomVal, UInt jitterHalfRange );
42+
UInt backendCommBackoff_defaultGenerateRandomUInt( void* ctx );
43+
44+
#define ELASTIC_APM_DEFAULT_BACKEND_COMM_BACKOFF \
45+
((BackendCommBackoff) \
46+
{ \
47+
.generateRandomUInt = &backendCommBackoff_defaultGenerateRandomUInt, \
48+
.generateRandomUIntCtx = NULL, \
49+
.errorCount = 0, \
50+
.waitEndTime = { 0 } \
51+
}) \
52+
/**/

src/ext/config.m4

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ if test "$PHP_ELASTIC_APM" != "no"; then
8282
AST_debug.c \
8383
AST_instrumentation.c \
8484
backend_comm.c \
85+
backend_comm_backoff.c \
8586
ConfigManager.c \
8687
elastic_apm.c \
8788
elastic_apm_API.c \

src/ext/config.w32

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ if (PHP_ELASTIC_APM != 'no') {
1616
"AST_debug.c" + " " +
1717
"AST_instrumentation.c" + " " +
1818
"backend_comm.c" + " " +
19+
"backend_comm_backoff.c" + " " +
1920
"ConfigManager.c" + " " +
2021
"elastic_apm.c" + " " +
2122
"elastic_apm_API.c" + " " +

src/ext/log.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -404,7 +404,7 @@ static void openAndAppendToFile( Logger* logger, String text )
404404
{
405405
size_t textLen = strlen( text );
406406

407-
// TODO: Sergey Kleyman: Uncomment: Fix lower level system calls - "open" and "write" to get stronger guarantee
407+
// OLD TODO: Uncomment: Fix lower level system calls - "open" and "write" to get stronger guarantee
408408
//#ifdef PHP_WIN32
409409
FILE* file = fopen( logger->config.file, "a" );
410410
if ( file == NULL )

src/ext/time_util.c

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,14 +78,14 @@ Int64 durationToMilliseconds( Duration duration )
7878
#pragma clang diagnostic push
7979
#pragma ide diagnostic ignored "ConstantFunctionResult"
8080
#endif
81-
ResultCode getCurrentAbsTimeSpec( TimeSpec* currentAbsTimeSpec )
81+
ResultCode getClockTimeSpec( bool isRealTime, /* out */ TimeSpec* currentAbsTimeSpec )
8282
{
8383
ResultCode resultCode;
8484

8585
#ifdef PHP_WIN32
8686
ELASTIC_APM_SET_RESULT_CODE_AND_GOTO_FAILURE();
8787
#else // #ifdef PHP_WIN32
88-
int clock_gettime_retVal = clock_gettime( CLOCK_REALTIME, currentAbsTimeSpec );
88+
int clock_gettime_retVal = clock_gettime( isRealTime ? CLOCK_REALTIME : CLOCK_MONOTONIC, currentAbsTimeSpec );
8989
if ( clock_gettime_retVal != 0 )
9090
{
9191
int clock_gettime_errno = errno;
@@ -110,6 +110,11 @@ ResultCode getCurrentAbsTimeSpec( TimeSpec* currentAbsTimeSpec )
110110
#pragma clang diagnostic pop
111111
#endif
112112

113+
ResultCode getCurrentAbsTimeSpec( /* out */ TimeSpec* currentAbsTimeSpec )
114+
{
115+
return getClockTimeSpec( /* isRealTime */ true, /* out */ currentAbsTimeSpec );
116+
}
117+
113118
void addDelayToAbsTimeSpec( /* in, out */ TimeSpec* absTimeSpec, long delayInNanoseconds )
114119
{
115120
ELASTIC_APM_ASSERT_VALID_PTR( absTimeSpec );
@@ -231,6 +236,31 @@ String streamUtcTimeSpecAsLocal( const TimeSpec* utcTimeSpec, TextOutputStream*
231236
return streamUtcTimeValAsLocal( &utcTimeVal, txtOutStream );
232237
}
233238

239+
String streamTimeSpecDiff( const TimeSpec* fromTimeSpec, const TimeSpec* toTimeSpec, TextOutputStream* txtOutStream )
240+
{
241+
ELASTIC_APM_ASSERT_VALID_PTR( fromTimeSpec );
242+
ELASTIC_APM_ASSERT_VALID_PTR( toTimeSpec );
243+
ELASTIC_APM_ASSERT_VALID_PTR( txtOutStream );
244+
245+
bool isDiffNegative = compareAbsTimeSpecs( fromTimeSpec, toTimeSpec ) > 0;
246+
const TimeSpec* from = isDiffNegative ? toTimeSpec : fromTimeSpec;
247+
const TimeSpec* to = isDiffNegative ? fromTimeSpec : toTimeSpec;
248+
UInt64 diffSecondsPart = to->tv_sec - from->tv_sec;
249+
Int64 diffNanosecondsPart = to->tv_nsec - from->tv_nsec;
250+
if ( diffNanosecondsPart < 0 )
251+
{
252+
--diffSecondsPart;
253+
diffNanosecondsPart += ELASTIC_APM_NUMBER_OF_NANOSECONDS_IN_SECOND;
254+
}
255+
256+
if ( diffNanosecondsPart == 0 )
257+
{
258+
return streamPrintf( txtOutStream, "%"PRIu64"s", diffSecondsPart );
259+
}
260+
261+
streamPrintf( txtOutStream, "%s%"PRIu64"s %"PRId64"ns", isDiffNegative ? "-" : "", diffSecondsPart, diffNanosecondsPart );
262+
}
263+
234264
int compareAbsTimeSpecs( const TimeSpec* a, const TimeSpec* b )
235265
{
236266
#define ELASTIC_APM_NUMCMP( a, b ) ( (a) < (b) ? -1 : ( (a) > (b) ? 1 : 0 ) )

0 commit comments

Comments
 (0)