Improve concurrency design of EnterpriseGeoIpDownloader

nielsbauman · nielsbauman · commit ef12fe272057 · 2025-09-05T10:48:59.000-03:00
Refactors `EnterpriseGeoIpDownloader` to avoid race conditions between the periodic and ad-hoc runs. See the discussion on #126124 for more details on the previously existing race condition. With this new approach, we make a distinction between the periodic and ad-hoc runs. The periodic runs simply run periodically on the configured poll interval. The ad-hoc runs are typically triggered by changes to the GeoIP metadata in the cluster state, and require running the downloader immediately, to download any GeoIP databases that were just added by a user. By using a `Semaphore` and an `AtomicReference<ClusterState>`, we can guarantee that a new cluster state will result in the downloader running, and prevent the downloader from running concurrently. While the (non-enterprise) `GeoIpDownloader` currently has the exact same concurrency implementation as the enterprise downloader had before this change, we scope this PR to just the enterprise downloader to focus discussions on the design changes. A follow-up PR will modify the `GeoIpDownloader` to have the same implementation as the enterprise downloader. Fixes #126124
diff --git a/modules/ingest-geoip/src/internalClusterTest/java/org/elasticsearch/ingest/geoip/EnterpriseGeoIpDownloaderIT.java b/modules/ingest-geoip/src/internalClusterTest/java/org/elasticsearch/ingest/geoip/EnterpriseGeoIpDownloaderIT.java
@@ -98,10 +98,7 @@ protected Collection<Class<? extends Plugin>> nodePlugins() {
     }
 
     @SuppressWarnings("unchecked")
-    @TestLogging(
-        reason = "understanding why ipinfo asn database sometimes is not loaded",
-        value = "org.elasticsearch.ingest.geoip.DatabaseNodeService:TRACE"
-    )
+    @TestLogging(reason = "understanding why ipinfo asn database sometimes is not loaded", value = "org.elasticsearch.ingest.geoip:TRACE")
     public void testEnterpriseDownloaderTask() throws Exception {
         /*
          * This test starts the enterprise geoip downloader task, and creates a database configuration. Then it creates an ingest
diff --git a/modules/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/EnterpriseGeoIpDownloader.java b/modules/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/EnterpriseGeoIpDownloader.java
@@ -18,6 +18,7 @@
 import org.elasticsearch.action.index.IndexRequest;
 import org.elasticsearch.action.support.PlainActionFuture;
 import org.elasticsearch.client.internal.Client;
+import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.block.ClusterBlockLevel;
 import org.elasticsearch.cluster.metadata.ProjectId;
 import org.elasticsearch.cluster.service.ClusterService;
@@ -55,6 +56,8 @@
 import java.util.Map;
 import java.util.Objects;
 import java.util.Set;
+import java.util.concurrent.Semaphore;
+import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.Function;
 import java.util.function.Supplier;
 import java.util.regex.Pattern;
@@ -108,7 +111,23 @@ public class EnterpriseGeoIpDownloader extends AllocatedPersistentTask {
 
     // visible for testing
     protected volatile EnterpriseGeoIpTaskState state;
+    /**
+     * The currently scheduled periodic run, or null if no periodic run is currently scheduled. Note: _not_ the currently running thread!
+     */
     private volatile Scheduler.ScheduledCancellable scheduled;
+    /**
+     * Semaphore with 1 permit, used to ensure that only one run (periodic or cluster state) is running at a time.
+     */
+    private final Semaphore running = new Semaphore(1);
+    /**
+     * Contains a reference to the next state to run on, or null if no run is currently requested.
+     * May be overridden by a newer state before the downloader has had a chance to run on it.
+     * We store the cluster state like this instead of using `ClusterService#state()`, as we invoke {@link #requestRunOnState(ClusterState)}
+     * from a cluster state listener, and then use the cluster state asynchronously, meaning there's a race condition between
+     * {@link #runOnState()} and the rest of the cluster state listeners completing and `ClusterStateApplierService` updating its internal
+     * `state` field.
+     */
+    private final AtomicReference<ClusterState> queue = new AtomicReference<>();
     private final Supplier<TimeValue> pollIntervalSupplier;
     private final Function<String, char[]> tokenProvider;
 
@@ -146,10 +165,9 @@ void setState(EnterpriseGeoIpTaskState state) {
     }
 
     // visible for testing
-    void updateDatabases() throws IOException {
+    void updateDatabases(ClusterState clusterState) throws IOException {
         @NotMultiProjectCapable(description = "Enterprise GeoIP not available in serverless")
         ProjectId projectId = ProjectId.DEFAULT;
-        var clusterState = clusterService.state();
         var geoipIndex = clusterState.getMetadata().getProject(projectId).getIndicesLookup().get(EnterpriseGeoIpDownloader.DATABASES_INDEX);
         if (geoipIndex != null) {
             logger.trace("the geoip index [{}] exists", EnterpriseGeoIpDownloader.DATABASES_INDEX);
@@ -390,58 +408,123 @@ static byte[] getChunk(InputStream is) throws IOException {
     }
 
     /**
-     * Downloads the geoip databases now, and schedules them to be downloaded again after pollInterval.
+     * Cancels the currently scheduled run (if any) and schedules a new (periodic) run to happen immediately, which will then schedule
+     * the next periodic run using the poll interval.
      */
-    synchronized void runDownloader() {
-        // by the time we reach here, the state will never be null
-        assert this.state != null : "this.setState() is null. You need to call setState() before calling runDownloader()";
-
-        // there's a race condition between here and requestReschedule. originally this scheduleNextRun call was at the end of this
-        // block, but remember that updateDatabases can take seconds to run (it's downloading bytes from the internet), and so during the
-        // very first run there would be no future run scheduled to reschedule in requestReschedule. which meant that if you went from zero
-        // to N(>=2) databases in quick succession, then all but the first database wouldn't necessarily get downloaded, because the
-        // requestReschedule call in the EnterpriseGeoIpDownloaderTaskExecutor's clusterChanged wouldn't have a scheduled future run to
-        // reschedule. scheduling the next run at the beginning of this run means that there's a much smaller window (milliseconds?, rather
-        // than seconds) in which such a race could occur. technically there's a window here, still, but i think it's _greatly_ reduced.
-        scheduleNextRun(pollIntervalSupplier.get());
-        // TODO regardless of the above comment, i like the idea of checking the lowest last-checked time and then running the math to get
-        // to the next interval from then -- maybe that's a neat future enhancement to add
+    public void restartPeriodicRun() {
+        logger.trace("Restarting periodic run");
+        // Cancel the pending periodic run, if there is one; the outcome of the cancellation is only logged.
+        if (scheduled != null) {
+            logger.trace("Cancelled scheduled run: [{}]", scheduled.cancel());
+        }
+        if (threadPool.scheduler().isShutdown()) {
+            return;
+        }
+        // NOTE(review): the handle of this immediate run is not stored in `scheduled`; runPeriodic records the follow-up run it
+        // schedules itself. Confirm that a second restart arriving before that point cannot start a duplicate periodic chain.
+        threadPool.schedule(this::runPeriodic, TimeValue.ZERO, threadPool.generic());
+    }
 
+    /**
+     * Tries to run the downloader now, if it isn't already currently running, and schedules the next periodic run using the poll interval.
+     * Does nothing (and does not schedule a next run) if the task is cancelled or completed.
+     */
+    private void runPeriodic() {
         if (isCancelled() || isCompleted()) {
+            logger.debug("Not running periodic downloader because task is cancelled or completed");
             return;
         }
-        try {
-            updateDatabases(); // n.b. this downloads bytes from the internet, it can take a while
-        } catch (Exception e) {
-            logger.error("exception during databases update", e);
+
+        // If we are not able to acquire the semaphore immediately, it means that a run is already in progress. Periodic runs do not run
+        // concurrently, but a cluster state run could be in progress. Since the default poll interval is quite large (3d), there is no
+        // need to wait for the current run to finish and then run again, so we just skip this run and schedule the next one.
+        if (running.tryAcquire()) {
+            // Always hand the permit back, even if something unexpected is thrown while running (e.g. an AssertionError). A leaked
+            // permit would make every later periodic run skip itself and every cluster state run wait forever.
+            try {
+                final var clusterState = clusterService.state();
+                logger.trace("Running periodic downloader on cluster state [{}]", clusterState.version());
+                runDownloader(clusterState);
+            } finally {
+                running.release();
+            }
         }
-        try {
-            cleanDatabases();
-        } catch (Exception e) {
-            logger.error("exception during databases cleanup", e);
+        if (threadPool.scheduler().isShutdown() == false) {
+            logger.trace("Scheduling next periodic run, current scheduled run is [{}]", scheduled);
+            scheduled = threadPool.schedule(this::runPeriodic, pollIntervalSupplier.get(), threadPool.generic());
+            logger.trace("Next periodic run scheduled: [{}]", scheduled);
         }
     }
 
     /**
-     * This method requests that the downloader be rescheduled to run immediately (presumably because a dynamic property supplied by
-     * pollIntervalSupplier or eagerDownloadSupplier has changed, or a pipeline with a geoip processor has been added). This method does
-     * nothing if this task is cancelled, completed, or has not yet been scheduled to run for the first time. It cancels any existing
-     * scheduled run.
+     * This method requests that the downloader runs on the supplied cluster state, which likely contains a change in the GeoIP metadata.
+     * If the queue was non-empty before we set it, then a run is already scheduled or in progress, so it will either be processed in the
+     * next/current run, or the current run will automatically start a new run when it finishes because the cluster state queue changed
+     * while it was running. This method does nothing if this task is cancelled or completed.
      */
-    public void requestReschedule() {
+    public void requestRunOnState(ClusterState clusterState) {
+        final boolean inactive = isCancelled() || isCompleted() || threadPool.scheduler().isShutdown();
+        if (inactive) {
+            logger.debug("Not requesting downloader run on cluster state because task is cancelled, completed or shutting down");
+            return;
+        }
+        logger.trace("Requesting downloader run on cluster state [{}]", clusterState.version());
+        // Publish the newest state; whatever was queued before is superseded by it.
+        final ClusterState previouslyQueued = queue.getAndSet(clusterState);
+        if (previouslyQueued != null) {
+            // A run was already requested and has not cleared the queue yet, so no additional run needs to be scheduled here.
+            return;
+        }
+        logger.trace("Scheduling downloader run on cluster state");
+        threadPool.schedule(this::runOnState, TimeValue.ZERO, threadPool.generic());
+    }
+
+    /**
+     * Waits for any current run to finish, then runs the downloader on the last seen cluster state. If a new cluster state came in while
+     * waiting or running, then schedules another run to happen immediately after this one.
+     * <p>
+     * If the thread is interrupted while waiting for the current run to finish, no run takes place: the interrupt status is restored and
+     * the pending cluster state is discarded, so that a later {@link #requestRunOnState(ClusterState)} call schedules a fresh run.
+     */
+    private void runOnState() {
+        if (isCancelled() || isCompleted()) {
+            logger.debug("Not running downloader on cluster state because task is cancelled or completed");
+            return;
+        }
+        // Here we do want to wait for the current run (if any) to finish. Since a new cluster state might have arrived while the current
+        // run was running, we want to ensure that new cluster state update isn't lost, so we wait and run afterwards.
+        logger.trace("Waiting to run downloader on cluster state");
+        try {
+            running.acquire();
+        } catch (InterruptedException e) {
+            logger.warn("Interrupted while waiting to run downloader on cluster state", e);
+            // We do not hold the permit at this point. Carrying on would run the downloader concurrently with the run we were waiting
+            // for, and the release() below would add a second permit to the semaphore, permanently breaking the mutual exclusion.
+            Thread.currentThread().interrupt();
+            // Clear the queue so that the next requestRunOnState call sees an empty queue and schedules a new run.
+            queue.set(null);
+            return;
+        }
+        // Get the last seen cluster state and process it.
+        final ClusterState clusterState = queue.get();
+        try {
+            assert clusterState != null : "queue was null, but we should only be called if queue was non-null";
+            logger.debug("Running downloader on cluster state [{}]", clusterState.version());
+            runDownloader(clusterState);
+            // Try to clear the queue by setting the reference to null. If another cluster state came in since we fetched it above (i.e.
+            // the reference differs from `clusterState`), then we schedule another run to happen immediately after this one.
+            if (queue.compareAndSet(clusterState, null) == false) {
+                logger.debug("A new cluster state came in while running, scheduling another run");
+                threadPool.schedule(this::runOnState, TimeValue.ZERO, threadPool.generic());
+            }
+        } finally {
+            // We release the semaphore last, to ensure that no duplicate runs/threads are started. Doing so in a finally block
+            // guarantees the permit is returned even if something unexpected is thrown above.
+            running.release();
+        }
+        logger.trace("Finished running downloader on cluster state [{}]", clusterState.version());
+    }
+
+    /**
+     * Downloads the geoip databases now based on the supplied cluster state.
+     * <p>
+     * Calls {@code updateDatabases} and then {@code cleanDatabases} with the supplied state. An exception thrown by either step is
+     * logged and not propagated, so a failed update does not prevent the cleanup from being attempted. Returns without doing anything
+     * if the task is cancelled or completed. The method is {@code synchronized} on this instance.
+     *
+     * @param clusterState the cluster state that is passed to both the update and the cleanup step
+     */
+    synchronized void runDownloader(ClusterState clusterState) {
+        // by the time we reach here, the state will never be null
+        assert this.state != null : "this.setState() is null. You need to call setState() before calling runDownloader()";
+
         if (isCancelled() || isCompleted()) {
             return;
         }
-        if (scheduled != null && scheduled.cancel()) {
-            scheduleNextRun(TimeValue.ZERO);
+        try {
+            updateDatabases(clusterState); // n.b. this downloads bytes from the internet, it can take a while
+        } catch (Exception e) {
+            logger.error("exception during databases update", e);
+        }
+        try {
+            cleanDatabases(clusterState);
+        } catch (Exception e) {
+            logger.error("exception during databases cleanup", e);
         }
     }
 
-    private void cleanDatabases() {
+    private void cleanDatabases(ClusterState clusterState) {
         List<Tuple<String, Metadata>> expiredDatabases = state.getDatabases()
             .entrySet()
             .stream()
-            .filter(e -> e.getValue().isNewEnough(clusterService.state().metadata().settings()) == false)
+            .filter(e -> e.getValue().isNewEnough(clusterState.metadata().settings()) == false)
             .map(entry -> Tuple.tuple(entry.getKey(), entry.getValue()))
             .toList();
         expiredDatabases.forEach(e -> {
@@ -461,12 +544,6 @@ protected void onCancelled() {
         markAsCompleted();
     }
 
-    private void scheduleNextRun(TimeValue time) {
-        if (threadPool.scheduler().isShutdown() == false) {
-            scheduled = threadPool.schedule(this::runDownloader, time, threadPool.generic());
-        }
-    }
-
     private ProviderDownload downloaderFor(DatabaseConfiguration database) {
         if (database.provider() instanceof DatabaseConfiguration.Maxmind maxmind) {
             return new MaxmindDownload(database.name(), maxmind);
diff --git a/modules/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/EnterpriseGeoIpDownloaderTaskExecutor.java b/modules/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/EnterpriseGeoIpDownloaderTaskExecutor.java
@@ -103,7 +103,7 @@ private void setPollInterval(TimeValue pollInterval) {
             this.pollInterval = pollInterval;
             EnterpriseGeoIpDownloader currentDownloader = getCurrentTask();
             if (currentDownloader != null) {
-                currentDownloader.requestReschedule();
+                currentDownloader.restartPeriodicRun();
             }
         }
     }
@@ -154,7 +154,7 @@ protected void nodeOperation(AllocatedPersistentTask task, EnterpriseGeoIpTaskPa
         downloader.setState(geoIpTaskState);
         currentTask.set(downloader);
         if (ENABLED_SETTING.get(clusterService.state().metadata().settings(), settings)) {
-            downloader.runDownloader();
+            downloader.restartPeriodicRun();
         }
     }
 
@@ -169,7 +169,8 @@ public void clusterChanged(ClusterChangedEvent event) {
             boolean hasGeoIpMetadataChanges = event.metadataChanged()
                 && event.changedCustomProjectMetadataSet().contains(IngestGeoIpMetadata.TYPE);
             if (hasGeoIpMetadataChanges) {
-                currentDownloader.requestReschedule(); // watching the cluster changed events to kick the thing off if it's not running
+                // watching the cluster changed events to kick the thing off if it's not running
+                currentDownloader.requestRunOnState(event.state());
             }
         }
     }
diff --git a/modules/ingest-geoip/src/test/java/org/elasticsearch/ingest/geoip/EnterpriseGeoIpDownloaderTests.java b/modules/ingest-geoip/src/test/java/org/elasticsearch/ingest/geoip/EnterpriseGeoIpDownloaderTests.java
@@ -451,11 +451,10 @@ public void testUpdateDatabasesWriteBlock() {
             .get(EnterpriseGeoIpDownloader.DATABASES_INDEX)
             .getWriteIndex()
             .getName();
-        state = ClusterState.builder(state)
+        ClusterState finalState = ClusterState.builder(state)
             .blocks(new ClusterBlocks.Builder().addIndexBlock(projectId, geoIpIndex, IndexMetadata.INDEX_READ_ONLY_ALLOW_DELETE_BLOCK))
             .build();
-        when(clusterService.state()).thenReturn(state);
-        var e = expectThrows(ClusterBlockException.class, () -> geoIpDownloader.updateDatabases());
+        var e = expectThrows(ClusterBlockException.class, () -> geoIpDownloader.updateDatabases(finalState));
         assertThat(
             e.getMessage(),
             equalTo(
@@ -481,8 +480,7 @@ public void testUpdateDatabasesIndexNotReady() throws IOException {
         state = ClusterState.builder(state)
             .blocks(new ClusterBlocks.Builder().addIndexBlock(projectId, geoIpIndex, IndexMetadata.INDEX_READ_ONLY_ALLOW_DELETE_BLOCK))
             .build();
-        when(clusterService.state()).thenReturn(state);
-        geoIpDownloader.updateDatabases();
+        geoIpDownloader.updateDatabases(state);
         verifyNoInteractions(httpClient);
     }
 
diff --git a/muted-tests.yml b/muted-tests.yml
@@ -230,9 +230,6 @@ tests:
 - class: org.elasticsearch.smoketest.MlWithSecurityIT
   method: test {yaml=ml/trained_model_cat_apis/Test cat trained models}
   issue: https://github.com/elastic/elasticsearch/issues/125750
-- class: org.elasticsearch.ingest.geoip.EnterpriseGeoIpDownloaderIT
-  method: testEnterpriseDownloaderTask
-  issue: https://github.com/elastic/elasticsearch/issues/126124
 - class: org.elasticsearch.xpack.test.rest.XPackRestIT
   method: test {p0=transform/transforms_start_stop/Test start/stop only starts/stops specified transform}
   issue: https://github.com/elastic/elasticsearch/issues/126466

Original file line number	Diff line number	Diff line change
`@@ -103,7 +103,7 @@ private void setPollInterval(TimeValue pollInterval) {`
`103`	`103`	`this.pollInterval = pollInterval;`
`104`	`104`	`EnterpriseGeoIpDownloader currentDownloader = getCurrentTask();`
`105`	`105`	`if (currentDownloader != null) {`
`106`		`- currentDownloader.requestReschedule();`
	`106`	`+ currentDownloader.restartPeriodicRun();`
`107`	`107`	`}`
`108`	`108`	`}`
`109`	`109`	`}`
`@@ -154,7 +154,7 @@ protected void nodeOperation(AllocatedPersistentTask task, EnterpriseGeoIpTaskPa`
`154`	`154`	`downloader.setState(geoIpTaskState);`
`155`	`155`	`currentTask.set(downloader);`
`156`	`156`	`if (ENABLED_SETTING.get(clusterService.state().metadata().settings(), settings)) {`
`157`		`- downloader.runDownloader();`
	`157`	`+ downloader.restartPeriodicRun();`
`158`	`158`	`}`
`159`	`159`	`}`
`160`	`160`
`@@ -169,7 +169,8 @@ public void clusterChanged(ClusterChangedEvent event) {`
`169`	`169`	`boolean hasGeoIpMetadataChanges = event.metadataChanged()`
`170`	`170`	`&& event.changedCustomProjectMetadataSet().contains(IngestGeoIpMetadata.TYPE);`
`171`	`171`	`if (hasGeoIpMetadataChanges) {`
`172`		`- currentDownloader.requestReschedule(); // watching the cluster changed events to kick the thing off if it's not running`
	`172`	`+ // watching the cluster changed events to kick the thing off if it's not running`
	`173`	`+ currentDownloader.requestRunOnState(event.state());`
`173`	`174`	`}`
`174`	`175`	`}`
`175`	`176`	`}`