99import datadog .trace .api .gateway .RequestContext ;
1010import datadog .trace .api .gateway .RequestContextSlot ;
1111import datadog .trace .api .iast .IastContext ;
12+ import datadog .trace .api .telemetry .LogCollector ;
1213import datadog .trace .bootstrap .instrumentation .api .AgentSpan ;
1314import datadog .trace .bootstrap .instrumentation .api .AgentTracer ;
1415import datadog .trace .util .AgentTaskScheduler ;
@@ -127,6 +128,8 @@ private int getAvailableQuote(@Nullable final AgentSpan span) {
127128
128129 class OverheadControllerImpl implements OverheadController {
129130
131+ private static final Logger LOGGER = LoggerFactory .getLogger (OverheadControllerImpl .class );
132+
130133 private static final int RESET_PERIOD_SECONDS = 30 ;
131134
132135 private final int sampling ;
@@ -141,6 +144,8 @@ class OverheadControllerImpl implements OverheadController {
141144
142145 final AtomicLong cumulativeCounter ;
143146
147+ private volatile long lastAcquiredTimestamp = Long .MAX_VALUE ;
148+
144149 final OverheadContext globalContext =
145150 new OverheadContext (Config .get ().getIastVulnerabilitiesPerRequest ());
146151
@@ -165,7 +170,11 @@ public boolean acquireRequest() {
165170 long newValue = prevValue + sampling ;
166171 if (newValue / 100 == prevValue / 100 + 1 ) {
167172 // Sample request
168- return availableRequests .acquire ();
173+ final boolean acquired = availableRequests .acquire ();
174+ if (acquired ) {
175+ lastAcquiredTimestamp = System .currentTimeMillis ();
176+ }
177+ return acquired ;
169178 }
170179 // Skipped by sampling
171180 return false ;
@@ -222,11 +231,22 @@ static NonBlockingSemaphore maxConcurrentRequests(final int max) {
222231 @ Override
223232 public void reset () {
224233 globalContext .reset ();
225- // Periodic reset of maximum concurrent requests. This guards us against exhausting concurrent
226- // requests if some bug led us to lose a request end event. This will lead to periodically
227- // going above the max concurrent requests. But overall, it should be self-stabilizing. So for
228- // practical purposes, the max concurrent requests is a hint.
229- availableRequests .reset ();
234+ if (lastAcquiredTimestamp != Long .MAX_VALUE
235+ && System .currentTimeMillis () - lastAcquiredTimestamp > 1000 * 60 * 60 ) {
236+ // If the last time a request was acquired is longer than 1h, we check the number of
237+ // available requests. If it
238+ // is zero, we might have lost request end events, leading to IAST not being able to acquire
239+ // new requests.
240+ // We report it to telemetry for further investigation.
241+ if (availableRequests .available () == 0 ) {
242+ LOGGER .debug (
243+ LogCollector .SEND_TELEMETRY ,
244+ "IAST cannot acquire new requests, end of request events might be missing." );
245+ // Once starved, do not report this again, unless this is recovered and then starved
246+ // again.
247+ lastAcquiredTimestamp = Long .MAX_VALUE ;
248+ }
249+ }
230250 }
231251 }
232252}
0 commit comments