diff --git a/lzy/allocator/src/main/java/ai/lzy/allocator/alloc/DeleteSessionAction.java b/lzy/allocator/src/main/java/ai/lzy/allocator/alloc/DeleteSessionAction.java
index 1e7ed69067..e77d8d09d4 100644
--- a/lzy/allocator/src/main/java/ai/lzy/allocator/alloc/DeleteSessionAction.java
+++ b/lzy/allocator/src/main/java/ai/lzy/allocator/alloc/DeleteSessionAction.java
@@ -46,14 +46,6 @@ protected boolean isInjectedError(Error e) {
return e instanceof InjectedFailures.TerminateException;
}
- @Override
- protected void notifyExpired() {
- }
-
- @Override
- protected void notifyFinished() {
- }
-
@Override
protected void onExpired(TransactionHandle tx) {
throw new RuntimeException("Unexpected, sessionId: '%s', op: '%s'".formatted(sessionId, id()));
diff --git a/lzy/allocator/src/main/java/ai/lzy/allocator/alloc/MountDynamicDiskAction.java b/lzy/allocator/src/main/java/ai/lzy/allocator/alloc/MountDynamicDiskAction.java
index 97378fd0bd..e4a7b3d16e 100644
--- a/lzy/allocator/src/main/java/ai/lzy/allocator/alloc/MountDynamicDiskAction.java
+++ b/lzy/allocator/src/main/java/ai/lzy/allocator/alloc/MountDynamicDiskAction.java
@@ -7,7 +7,10 @@
import ai.lzy.allocator.volume.VolumeManager;
import ai.lzy.logs.LogContextKey;
import ai.lzy.longrunning.Operation;
-import ai.lzy.longrunning.OperationRunnerBase;
+import ai.lzy.longrunning.task.OpTaskAwareAction;
+import ai.lzy.longrunning.task.OperationTask;
+import ai.lzy.longrunning.task.OperationTaskScheduler;
+import ai.lzy.longrunning.task.dao.OperationTaskDao;
import ai.lzy.model.db.TransactionHandle;
import ai.lzy.v1.VmAllocatorApi;
import com.google.protobuf.Any;
@@ -17,6 +20,7 @@
import jakarta.annotation.Nullable;
import java.sql.SQLException;
+import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@@ -25,7 +29,7 @@
import static ai.lzy.model.db.DbHelper.withRetries;
-public final class MountDynamicDiskAction extends OperationRunnerBase {
+public final class MountDynamicDiskAction extends OpTaskAwareAction {
private final AllocationContext allocationContext;
private final VolumeManager volumeManager;
private final MountHolderManager mountHolderManager;
@@ -42,9 +46,13 @@ public final class MountDynamicDiskAction extends OperationRunnerBase {
private Long nextId;
private boolean mountPodsDeleted;
- public MountDynamicDiskAction(Vm vm, DynamicMount dynamicMount, AllocationContext allocationContext) {
- super(dynamicMount.mountOperationId(), String.format("Mount %s to VM %s", dynamicMount.mountName(), vm.vmId()),
- allocationContext.storage(), allocationContext.operationsDao(), allocationContext.executor());
+ public MountDynamicDiskAction(Vm vm, DynamicMount dynamicMount, AllocationContext allocationContext,
+ OperationTask operationTask, OperationTaskDao operationTaskDao,
+ Duration leaseDuration, OperationTaskScheduler operationTaskScheduler)
+ {
+ super(operationTask, operationTaskDao, leaseDuration, dynamicMount.mountOperationId(),
+ String.format("Mount %s to VM %s", dynamicMount.mountName(), vm.vmId()), allocationContext.storage(),
+ allocationContext.operationsDao(), allocationContext.executor(), operationTaskScheduler);
this.dynamicMount = dynamicMount;
this.vm = vm;
this.allocationContext = allocationContext;
diff --git a/lzy/allocator/src/main/java/ai/lzy/allocator/task/MountDynamicDiskResolver.java b/lzy/allocator/src/main/java/ai/lzy/allocator/task/MountDynamicDiskResolver.java
new file mode 100644
index 0000000000..1aa0c02b46
--- /dev/null
+++ b/lzy/allocator/src/main/java/ai/lzy/allocator/task/MountDynamicDiskResolver.java
@@ -0,0 +1,88 @@
+package ai.lzy.allocator.task;
+
+import ai.lzy.allocator.alloc.AllocationContext;
+import ai.lzy.allocator.alloc.MountDynamicDiskAction;
+import ai.lzy.allocator.alloc.dao.DynamicMountDao;
+import ai.lzy.allocator.alloc.dao.VmDao;
+import ai.lzy.allocator.model.DynamicMount;
+import ai.lzy.allocator.model.Vm;
+import ai.lzy.longrunning.task.OperationTask;
+import ai.lzy.longrunning.task.OperationTaskScheduler;
+import ai.lzy.longrunning.task.ResolverUtils;
+import ai.lzy.longrunning.task.TypedOperationTaskResolver;
+import ai.lzy.longrunning.task.dao.OperationTaskDao;
+import ai.lzy.model.db.TransactionHandle;
+import jakarta.annotation.Nullable;
+import jakarta.inject.Singleton;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.sql.SQLException;
+import java.time.Duration;
+
+@Singleton
+public class MountDynamicDiskResolver implements TypedOperationTaskResolver {
+ private static final Logger LOG = LogManager.getLogger(MountDynamicDiskResolver.class);
+
+ private static final String TYPE = "MOUNT";
+ public static final String VM_ID_FIELD = "vm_id";
+ public static final String DYNAMIC_MOUNT_ID_FIELD = "dynamic_mount_id";
+
+ private final VmDao vmDao;
+ private final DynamicMountDao dynamicMountDao;
+ private final AllocationContext allocationContext;
+ private final OperationTaskDao operationTaskDao;
+ private final OperationTaskScheduler taskScheduler; //todo circular dependency
+ private final Duration leaseDuration;
+
+ public MountDynamicDiskResolver(VmDao vmDao, DynamicMountDao dynamicMountDao, AllocationContext allocationContext,
+ OperationTaskDao operationTaskDao, OperationTaskScheduler taskScheduler,
+ Duration leaseDuration)
+ //todo mark duration with a qualifier
+ {
+ this.vmDao = vmDao;
+ this.dynamicMountDao = dynamicMountDao;
+ this.allocationContext = allocationContext;
+ this.operationTaskDao = operationTaskDao;
+ this.taskScheduler = taskScheduler;
+ this.leaseDuration = leaseDuration;
+ }
+
+ @Override
+ public Result resolve(OperationTask opTask, @Nullable TransactionHandle tx) throws SQLException {
+ var vmId = ResolverUtils.readString(opTask.metadata(), VM_ID_FIELD);
+ if (vmId == null) {
+ LOG.error("{} field is not present in task {} metadata", VM_ID_FIELD, opTask.id());
+ return Result.BAD_STATE;
+ }
+ var dynamicMountId = ResolverUtils.readString(opTask.metadata(), DYNAMIC_MOUNT_ID_FIELD);
+ if (dynamicMountId == null) {
+ LOG.error("{} field is not present in task {} metadata", DYNAMIC_MOUNT_ID_FIELD, opTask.id());
+ return Result.BAD_STATE;
+ }
+ var vm = vmDao.get(vmId, tx);
+ if (vm == null) {
+ LOG.error("VM {} is not present for task", vmId);
+ return Result.STALE;
+ } else if (vm.status() != Vm.Status.RUNNING) {
+ LOG.error("VM {} is in wrong status: {}", vmId, vm.status());
+ return Result.STALE;
+ }
+ var dynamicMount = dynamicMountDao.get(dynamicMountId, false, tx);
+ if (dynamicMount == null) {
+ LOG.error("Dynamic mount {} is not present for task", dynamicMountId);
+ return Result.STALE;
+ } else if (dynamicMount.state() != DynamicMount.State.PENDING) {
+ LOG.error("Dynamic mount {} is in wrong status: {}", dynamicMount.id(), dynamicMount.state());
+ return Result.STALE;
+ }
+ return Result.success(new MountDynamicDiskAction(vm, dynamicMount, allocationContext, opTask, operationTaskDao,
+ leaseDuration, taskScheduler));
+ }
+
+ @Override
+ public String type() {
+ return TYPE;
+ }
+
+}
diff --git a/lzy/allocator/src/main/resources/db/allocator/migrations/V10__operation_task.sql b/lzy/allocator/src/main/resources/db/allocator/migrations/V10__operation_task.sql
new file mode 100644
index 0000000000..8149674abc
--- /dev/null
+++ b/lzy/allocator/src/main/resources/db/allocator/migrations/V10__operation_task.sql
@@ -0,0 +1,21 @@
+CREATE TYPE task_status AS ENUM ('PENDING', 'RUNNING', 'FAILED', 'FINISHED', 'STALE');
+
+CREATE TYPE task_type AS ENUM ('UNMOUNT', 'MOUNT');
+
+CREATE TABLE IF NOT EXISTS operation_task(
+ id BIGSERIAL NOT NULL,
+ name TEXT NOT NULL,
+ entity_id TEXT NOT NULL,
+ type task_type NOT NULL,
+ status task_status NOT NULL,
+ created_at TIMESTAMP NOT NULL,
+ updated_at TIMESTAMP NOT NULL,
+ metadata JSONB NOT NULL,
+ operation_id TEXT,
+ worker_id TEXT,
+ lease_till TIMESTAMP,
+ PRIMARY KEY (id),
+ FOREIGN KEY (operation_id) REFERENCES operation(id)
+);
+
+CREATE INDEX IF NOT EXISTS task_status_entity_id_idx ON operation_task(status, entity_id, id);
diff --git a/lzy/long-running/pom.xml b/lzy/long-running/pom.xml
index 7930f9992d..634dbb4d59 100644
--- a/lzy/long-running/pom.xml
+++ b/lzy/long-running/pom.xml
@@ -45,6 +45,16 @@
junit
test
+
+ io.zonky.test
+ embedded-postgres
+ test
+
+
+ org.mockito
+ mockito-core
+ test
+
diff --git a/lzy/long-running/src/main/java/ai/lzy/longrunning/OperationRunnerBase.java b/lzy/long-running/src/main/java/ai/lzy/longrunning/OperationRunnerBase.java
index 434392b044..61ef5bcda7 100644
--- a/lzy/long-running/src/main/java/ai/lzy/longrunning/OperationRunnerBase.java
+++ b/lzy/long-running/src/main/java/ai/lzy/longrunning/OperationRunnerBase.java
@@ -32,6 +32,7 @@ public abstract class OperationRunnerBase extends ContextAwareTask {
private final OperationDao operationsDao;
private final OperationsExecutor executor;
private Operation op;
+ private volatile boolean failed = false;
protected OperationRunnerBase(String id, String descr, Storage storage, OperationDao operationsDao,
OperationsExecutor executor)
@@ -55,6 +56,7 @@ protected final void execute() {
}
for (var step : steps()) {
+ beforeStep();
final var stepResult = step.get();
switch (stepResult.code()) {
case ALREADY_DONE -> { }
@@ -83,6 +85,7 @@ protected final void execute() {
}
}
} catch (Throwable e) {
+ setFailed();
notifyFinished();
if (e instanceof Error err && isInjectedError(err)) {
log.error("{} Terminated by InjectedFailure exception: {}", logPrefix, e.getMessage());
@@ -98,6 +101,18 @@ protected final void execute() {
}
}
+ protected void setFailed() {
+ failed = true;
+ }
+
+ protected boolean isFailed() {
+ return failed;
+ }
+
+ protected void beforeStep() {
+
+ }
+
protected Map prepareLogContext() {
var ctx = super.prepareLogContext();
ctx.put(LogContextKey.OPERATION_ID, id);
@@ -276,6 +291,7 @@ protected void notifyFinished() {
}
protected final void failOperation(Status status, @Nullable TransactionHandle tx) throws SQLException {
+ setFailed();
operationsDao.fail(id, toProto(status), tx);
}
diff --git a/lzy/long-running/src/main/java/ai/lzy/longrunning/task/DispatchingOperationTaskResolver.java b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/DispatchingOperationTaskResolver.java
new file mode 100644
index 0000000000..8656b72de2
--- /dev/null
+++ b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/DispatchingOperationTaskResolver.java
@@ -0,0 +1,56 @@
+package ai.lzy.longrunning.task;
+
+import ai.lzy.model.db.TransactionHandle;
+import com.google.common.annotations.VisibleForTesting;
+import jakarta.annotation.Nullable;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.sql.SQLException;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+public class DispatchingOperationTaskResolver implements OperationTaskResolver {
+
+ private static final Logger LOG = LogManager.getLogger(DispatchingOperationTaskResolver.class);
+ private final Map resolvers;
+
+ public DispatchingOperationTaskResolver(List resolvers) {
+ this.resolvers = generateResolversMap(resolvers);
+ }
+
+ private static Map generateResolversMap(
+ List resolvers)
+ {
+ var types = new HashSet();
+ resolvers.forEach(r -> {
+ if (!types.add(r.type())) {
+ throw new IllegalStateException("Duplicate resolver for type " + r.type());
+ }
+ });
+ return resolvers.stream()
+ .collect(Collectors.toMap(TypedOperationTaskResolver::type, r -> r));
+ }
+
+ @VisibleForTesting
+ void addResolver(TypedOperationTaskResolver resolver) {
+ resolvers.put(resolver.type(), resolver);
+ }
+
+ @Override
+ public Result resolve(OperationTask operationTask, @Nullable TransactionHandle tx) throws SQLException {
+ var resolver = resolvers.get(operationTask.type());
+ if (resolver == null) {
+ LOG.error("No resolver for task type {}. Task: {}", operationTask.type(), operationTask);
+ return Result.UNKNOWN_TASK;
+ }
+ try {
+ return resolver.resolve(operationTask, tx);
+ } catch (Exception e) {
+ LOG.error("Error while resolving task {}", operationTask.id(), e);
+ return Result.resolveError(e);
+ }
+ }
+}
diff --git a/lzy/long-running/src/main/java/ai/lzy/longrunning/task/OpTaskAwareAction.java b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/OpTaskAwareAction.java
new file mode 100644
index 0000000000..d202521fd2
--- /dev/null
+++ b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/OpTaskAwareAction.java
@@ -0,0 +1,77 @@
+package ai.lzy.longrunning.task;
+
+import ai.lzy.longrunning.OperationRunnerBase;
+import ai.lzy.longrunning.OperationsExecutor;
+import ai.lzy.longrunning.dao.OperationDao;
+import ai.lzy.longrunning.task.dao.OperationTaskDao;
+import ai.lzy.model.db.Storage;
+
+import java.time.Duration;
+import java.util.Map;
+
+import static ai.lzy.model.db.DbHelper.withRetries;
+
+public abstract class OpTaskAwareAction extends OperationRunnerBase {
+ private final OperationTaskScheduler operationTaskScheduler;
+ private final OperationTaskDao operationTaskDao;
+ private final Duration leaseDuration;
+ private OperationTask operationTask;
+
+ public OpTaskAwareAction(OperationTask operationTask, OperationTaskDao operationTaskDao, Duration leaseDuration,
+ String opId, String desc, Storage storage, OperationDao operationsDao,
+ OperationsExecutor executor, OperationTaskScheduler operationTaskScheduler)
+ {
+ super(opId, desc, storage, operationsDao, executor);
+ this.operationTask = operationTask;
+ this.operationTaskDao = operationTaskDao;
+ this.leaseDuration = leaseDuration;
+ this.operationTaskScheduler = operationTaskScheduler;
+ }
+
+ @Override
+ protected Map prepareLogContext() {
+ var ctx = super.prepareLogContext();
+ ctx.put("task_id", String.valueOf(operationTask.id()));
+ ctx.put("task_type", operationTask.type());
+ ctx.put("task_name", operationTask.name());
+ ctx.put("task_entity_id", operationTask.entityId());
+ return ctx;
+ }
+
+ protected OperationTask task() {
+ return operationTask;
+ }
+
+ public void setTask(OperationTask operationTask) {
+ this.operationTask = operationTask;
+ }
+
+ @Override
+ protected void beforeStep() {
+ super.beforeStep();
+ try {
+ operationTask = withRetries(log(), () -> operationTaskDao.updateLease(operationTask.id(), leaseDuration,
+ null));
+ } catch (Exception e) {
+ log().error("{} Couldn't update lease on task {}", logPrefix(), task().id());
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Override
+ protected void notifyFinished() {
+ var builder = OperationTask.Update.builder();
+ if (isFailed()) {
+ builder.status(OperationTask.Status.FAILED);
+ } else {
+ builder.status(OperationTask.Status.FINISHED);
+ }
+ try {
+ operationTask = withRetries(log(), () -> operationTaskDao.update(operationTask.id(), builder.build(),
+ null));
+ } catch (Exception e) {
+ log().error("{} Couldn't finish operation task {}", logPrefix(), task().id());
+ }
+ operationTaskScheduler.releaseTask(task());
+ }
+}
diff --git a/lzy/long-running/src/main/java/ai/lzy/longrunning/task/OperationTask.java b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/OperationTask.java
new file mode 100644
index 0000000000..8e4ef3dbaf
--- /dev/null
+++ b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/OperationTask.java
@@ -0,0 +1,77 @@
+package ai.lzy.longrunning.task;
+
+import jakarta.annotation.Nullable;
+
+import java.time.Instant;
+import java.util.Map;
+
+public record OperationTask(
+ long id,
+ String name,
+ String entityId,
+ String type,
+ Status status,
+ Instant createdAt,
+ Instant updatedAt,
+ Map metadata,
+ @Nullable
+ String operationId,
+ @Nullable
+ String workerId,
+ @Nullable
+ Instant leaseTill
+) {
+ public static OperationTask createPending(String name, String entityId, String type, Map metadata,
+ String operationId)
+ {
+ return new OperationTask(-1, name, entityId, type, Status.PENDING, Instant.now(), Instant.now(),
+ metadata, operationId, null, null);
+ }
+
+ public enum Status {
+ PENDING, //just created and waiting to be executed
+ RUNNING, //acquired by one of the instance of app and executing
+ FINISHED, //successful finish
+ FAILED, //unrecoverable failure
+ STALE, //executed too late and not applicable anymore
+ }
+
+ public record Update(
+ @Nullable Status status,
+ @Nullable Map metadata,
+ @Nullable String operationId
+ ) {
+ public boolean isEmpty() {
+ return status == null && metadata == null && operationId == null;
+ }
+
+ public static Builder builder() {
+ return new Builder();
+ }
+
+ public static class Builder {
+ private Status status;
+ private Map metadata;
+ private String operationId;
+
+ public Builder status(Status status) {
+ this.status = status;
+ return this;
+ }
+
+ public Builder metadata(Map metadata) {
+ this.metadata = metadata;
+ return this;
+ }
+
+ public Builder operationId(String operationId) {
+ this.operationId = operationId;
+ return this;
+ }
+
+ public Update build() {
+ return new Update(status, metadata, operationId);
+ }
+ }
+ }
+}
diff --git a/lzy/long-running/src/main/java/ai/lzy/longrunning/task/OperationTaskResolver.java b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/OperationTaskResolver.java
new file mode 100644
index 0000000000..bc0208c555
--- /dev/null
+++ b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/OperationTaskResolver.java
@@ -0,0 +1,36 @@
+package ai.lzy.longrunning.task;
+
+import ai.lzy.model.db.TransactionHandle;
+import jakarta.annotation.Nullable;
+
+import java.sql.SQLException;
+
+public interface OperationTaskResolver {
+ Result resolve(OperationTask operationTask, @Nullable TransactionHandle tx) throws SQLException;
+
+ enum Status {
+ SUCCESS,
+ STALE,
+ BAD_STATE,
+ UNKNOWN_TASK,
+ RESOLVE_ERROR,
+ }
+
+ record Result(
+ @Nullable OpTaskAwareAction action,
+ Status status,
+ @Nullable Exception exception
+ ) {
+ public static final Result STALE = new Result(null, Status.STALE, null);
+ public static final Result BAD_STATE = new Result(null, Status.BAD_STATE, null);
+ public static final Result UNKNOWN_TASK = new Result(null, Status.UNKNOWN_TASK, null);
+
+ public static Result success(OpTaskAwareAction action) {
+ return new Result(action, Status.SUCCESS, null);
+ }
+
+ public static Result resolveError(Exception e) {
+ return new Result(null, Status.RESOLVE_ERROR, e);
+ }
+ }
+}
diff --git a/lzy/long-running/src/main/java/ai/lzy/longrunning/task/OperationTaskScheduler.java b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/OperationTaskScheduler.java
new file mode 100644
index 0000000000..83124bd6d1
--- /dev/null
+++ b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/OperationTaskScheduler.java
@@ -0,0 +1,232 @@
+package ai.lzy.longrunning.task;
+
+import ai.lzy.longrunning.OperationsExecutor;
+import ai.lzy.longrunning.task.dao.OperationTaskDao;
+import ai.lzy.model.db.DbHelper;
+import ai.lzy.model.db.Storage;
+import ai.lzy.model.db.TransactionHandle;
+import jakarta.annotation.Nullable;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.sql.SQLException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ScheduledFuture;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+
+public class OperationTaskScheduler {
+
+ private static final Logger LOG = LogManager.getLogger(OperationTaskScheduler.class);
+
+ private final OperationTaskDao opTaskDao;
+ private final OperationsExecutor operationsExecutor;
+ private final OperationTaskResolver resolver;
+ private final ScheduledExecutorService scheduler;
+ private final Duration initialDelay;
+ private final Duration executionDelay;
+ private final Storage storage;
+ private final TaskMetricsProvider metricsProvider;
+ private final String instanceId;
+ private final Duration leaseDuration;
+ private final int batchSize;
+ private final AtomicInteger runningTaskQuota = new AtomicInteger();
+
+ private final AtomicBoolean started = new AtomicBoolean(false);
+ private volatile boolean disabled = false;
+
+ public OperationTaskScheduler(OperationTaskDao opTaskDao, OperationsExecutor operationsExecutor,
+ OperationTaskResolver resolver, Duration initialDelay, Duration executionDelay,
+ Storage storage, TaskMetricsProvider metricsProvider, String instanceId,
+ Duration leaseDuration, int batchSize, int maxRunningTasks)
+ {
+ this.opTaskDao = opTaskDao;
+ this.operationsExecutor = operationsExecutor;
+ this.resolver = resolver;
+ this.initialDelay = initialDelay;
+ this.executionDelay = executionDelay;
+ this.storage = storage;
+ this.leaseDuration = leaseDuration;
+ this.batchSize = batchSize;
+ //it's important to have only one thread to execute all command subsequently
+ this.scheduler = Executors.newSingleThreadScheduledExecutor();
+ this.metricsProvider = metricsProvider;
+ this.instanceId = instanceId;
+ this.runningTaskQuota.set(maxRunningTasks);
+ }
+
+ public void start() {
+ if (!started.compareAndSet(false, true)) {
+ throw new IllegalStateException("Task executor has already started!");
+ }
+ restoreTasks();
+ startMailLoop();
+ }
+
+ private void acquireTask() {
+ acquireTasks(1);
+ }
+
+ private void acquireTasks(int count) {
+ var newQuota = runningTaskQuota.addAndGet(-count);
+ LOG.debug("Acquired {} tasks to run. Current quota: {}", count, newQuota);
+ metricsProvider.runningTasks().inc(count);
+ }
+
+ public void releaseTask(OperationTask task) {
+ LOG.debug("Finishing task {}", task.id());
+ runningTaskQuota.incrementAndGet();
+ metricsProvider.runningTasks().dec();
+ }
+
+ private ScheduledFuture> startMailLoop() {
+ return scheduler.scheduleWithFixedDelay(() -> {
+ try {
+ var actions = new ArrayList();
+ if (!hasQuota()) {
+ LOG.info("Not enough quota to start new operation tasks");
+ return;
+ }
+ var quota = runningTaskQuota.get();
+ var toLoad = Math.min(batchSize, quota);
+ DbHelper.withRetries(LOG, () -> {
+ try (var tx = TransactionHandle.create(storage)) {
+ for (OperationTask operationTask : opTaskDao.lockPendingBatch(instanceId, leaseDuration,
+ toLoad, tx))
+ {
+ if (disabled) {
+ return;
+ }
+ var taskAwareAction = resolveTask(operationTask, tx);
+ if (taskAwareAction != null) {
+ actions.add(taskAwareAction);
+ }
+ }
+ tx.commit();
+ }
+ });
+ acquireTasks(actions.size());
+ actions.forEach(operationsExecutor::startNew);
+ } catch (Exception e) {
+ LOG.error("Got exception while scheduling task", e);
+ metricsProvider.schedulerErrors().inc();
+ }
+ }, initialDelay.toMillis(), executionDelay.toMillis(), TimeUnit.MILLISECONDS);
+ }
+
+ @Nullable
+ private OpTaskAwareAction resolveTask(OperationTask operationTask, TransactionHandle tx) throws SQLException {
+ var resolveResult = resolver.resolve(operationTask, tx);
+ if (resolveResult.status() != OperationTaskResolver.Status.SUCCESS) {
+ metricsProvider.schedulerResolveErrors(resolveResult.status()).inc();
+ }
+ switch (resolveResult.status()) {
+ case SUCCESS -> {
+ var updatedTask = setStatus(operationTask, OperationTask.Status.RUNNING, tx);
+ var action = resolveResult.action();
+ assert action != null;
+ action.setTask(updatedTask);
+ return action;
+ }
+ case STALE -> {
+ LOG.warn("Marking task {} as STALE", operationTask.id(), resolveResult.exception());
+ setStatus(operationTask, OperationTask.Status.STALE, tx);
+ }
+ case BAD_STATE -> {
+ LOG.error("Marking task {} as FAILED", operationTask.id(), resolveResult.exception());
+ setStatus(operationTask, OperationTask.Status.FAILED, tx);
+ }
+ case UNKNOWN_TASK, RESOLVE_ERROR -> {
+ LOG.warn("Couldn't resolve task {}", operationTask.id(), resolveResult.exception());
+ }
+ }
+ return null;
+ }
+
+ private void restoreTasks() {
+ scheduler.schedule(() -> {
+ try {
+ var actionsToRun = new ArrayList();
+ //allow over-quoting to complete unfinished tasks first
+ DbHelper.withRetries(LOG, () -> {
+ try (var tx = TransactionHandle.create(storage)) {
+ var tasks = opTaskDao.recaptureOldTasks(instanceId, leaseDuration, tx);
+ for (OperationTask operationTask : tasks) {
+ var taskAwareAction = resolveTask(operationTask, tx);
+ if (taskAwareAction != null) {
+ actionsToRun.add(taskAwareAction);
+ }
+ }
+ tx.commit();
+ }
+ });
+ acquireTasks(actionsToRun.size());
+ actionsToRun.forEach(operationsExecutor::startNew);
+ } catch (Exception e) {
+ LOG.error("Got exception while restoring tasks", e);
+ metricsProvider.schedulerErrors().inc();
+ }
+ }, 0, TimeUnit.MILLISECONDS);
+ }
+
+ private OperationTask setStatus(OperationTask operationTask, OperationTask.Status status, TransactionHandle tx)
+ throws SQLException
+ {
+ return opTaskDao.update(operationTask.id(), OperationTask.Update.builder()
+ .status(status)
+ .build(), tx);
+ }
+
+ public OperationTask saveTask(OperationTask operationTask, @Nullable TransactionHandle tx) throws SQLException {
+ return opTaskDao.insert(operationTask, tx);
+ }
+
+ public ScheduledFuture startImmediately(OperationTask opTask) {
+ //schedule runnable to start task immediately.
+ //if task is already captured by main loop, this runnable will exit
+ return scheduler.schedule(() -> {
+ try {
+ if (!hasQuota()) {
+ LOG.debug("Not enough quota to start immediate operation task {}", opTask.id());
+ return false;
+ }
+ var action = DbHelper.withRetries(LOG, () -> {
+ try (var tx = TransactionHandle.create(storage)) {
+ var lockedTask = opTaskDao.tryLockTask(opTask.id(),
+ opTask.entityId(), instanceId, leaseDuration,
+ tx);
+ if (lockedTask == null) {
+ return null;
+ }
+ var taskAwareAction = resolveTask(lockedTask, tx);
+ tx.commit();
+ return taskAwareAction;
+ }
+ });
+ if (action == null) {
+ return false;
+ }
+ acquireTask();
+ operationsExecutor.startNew(action);
+ return true;
+ } catch (Exception e) {
+ LOG.error("Got exception while scheduling task", e);
+ metricsProvider.schedulerErrors().inc();
+ return false;
+ }
+ }, 0, TimeUnit.MILLISECONDS);
+ }
+
+ private boolean hasQuota() {
+ return runningTaskQuota.get() > 0;
+ }
+
+ public void shutdown() {
+ disabled = true;
+ scheduler.shutdown();
+ }
+}
diff --git a/lzy/long-running/src/main/java/ai/lzy/longrunning/task/ResolverUtils.java b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/ResolverUtils.java
new file mode 100644
index 0000000000..a2424047be
--- /dev/null
+++ b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/ResolverUtils.java
@@ -0,0 +1,19 @@
+package ai.lzy.longrunning.task;
+
+import jakarta.annotation.Nullable;
+
+import java.util.Map;
+
+public final class ResolverUtils {
+ private ResolverUtils() {
+ }
+
+ @Nullable
+ public static String readString(Map meta, String key) {
+ var obj = meta.get(key);
+ if (obj instanceof String s) {
+ return s;
+ }
+ return null;
+ }
+}
diff --git a/lzy/long-running/src/main/java/ai/lzy/longrunning/task/TaskMetricsProvider.java b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/TaskMetricsProvider.java
new file mode 100644
index 0000000000..448ebb6b89
--- /dev/null
+++ b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/TaskMetricsProvider.java
@@ -0,0 +1,10 @@
+package ai.lzy.longrunning.task;
+
+import io.prometheus.client.Gauge;
+
+public interface TaskMetricsProvider {
+ Gauge schedulerErrors();
+ Gauge schedulerResolveErrors(OperationTaskResolver.Status status);
+ Gauge queueSize();
+ Gauge runningTasks();
+}
diff --git a/lzy/long-running/src/main/java/ai/lzy/longrunning/task/TypedOperationTaskResolver.java b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/TypedOperationTaskResolver.java
new file mode 100644
index 0000000000..4a4f22bd97
--- /dev/null
+++ b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/TypedOperationTaskResolver.java
@@ -0,0 +1,5 @@
+package ai.lzy.longrunning.task;
+
+public interface TypedOperationTaskResolver extends OperationTaskResolver {
+ String type();
+}
diff --git a/lzy/long-running/src/main/java/ai/lzy/longrunning/task/dao/OperationTaskDao.java b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/dao/OperationTaskDao.java
new file mode 100644
index 0000000000..56dd5a76b7
--- /dev/null
+++ b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/dao/OperationTaskDao.java
@@ -0,0 +1,38 @@
+package ai.lzy.longrunning.task.dao;
+
+import ai.lzy.longrunning.task.OperationTask;
+import ai.lzy.model.db.TransactionHandle;
+import jakarta.annotation.Nullable;
+
+import java.sql.SQLException;
+import java.time.Duration;
+import java.util.List;
+
+public interface OperationTaskDao {
+
+ @Nullable
+ OperationTask get(long id, @Nullable TransactionHandle tx) throws SQLException;
+
+ @Nullable
+ OperationTask update(long id, OperationTask.Update update, @Nullable TransactionHandle tx) throws SQLException;
+
+ @Nullable
+ OperationTask updateLease(long id, Duration duration, @Nullable TransactionHandle tx) throws SQLException;
+
+ @Nullable
+ OperationTask insert(OperationTask operationTask, @Nullable TransactionHandle tx) throws SQLException;
+
+ void delete(long id, @Nullable TransactionHandle tx) throws SQLException;
+
+ List lockPendingBatch(String ownerId, Duration leaseDuration, int batchSize,
+ @Nullable TransactionHandle tx) throws SQLException;
+
+ List recaptureOldTasks(String ownerId, Duration leaseDuration, @Nullable TransactionHandle tx)
+ throws SQLException;
+
+ @Nullable
+ OperationTask tryLockTask(Long taskId, String entityId, String ownerId, Duration leaseDuration,
+ @Nullable TransactionHandle tx) throws SQLException;
+
+ List getAll(@Nullable TransactionHandle tx) throws SQLException;
+}
diff --git a/lzy/long-running/src/main/java/ai/lzy/longrunning/task/dao/OperationTaskDaoImpl.java b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/dao/OperationTaskDaoImpl.java
new file mode 100644
index 0000000000..93cb23912b
--- /dev/null
+++ b/lzy/long-running/src/main/java/ai/lzy/longrunning/task/dao/OperationTaskDaoImpl.java
@@ -0,0 +1,319 @@
+package ai.lzy.longrunning.task.dao;
+
+import ai.lzy.longrunning.task.OperationTask;
+import ai.lzy.model.db.DbOperation;
+import ai.lzy.model.db.Storage;
+import ai.lzy.model.db.TransactionHandle;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.core.type.TypeReference;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import jakarta.annotation.Nullable;
+
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+public class OperationTaskDaoImpl implements OperationTaskDao {
+
+ private static final String FIELDS = "id, name, entity_id, type, status, created_at, updated_at, metadata," +
+ " operation_id, worker_id, lease_till";
+ public static final String SELECT_QUERY = "SELECT %s FROM operation_task WHERE id = ?".formatted(FIELDS);
+ public static final String INSERT_QUERY = """
+ INSERT INTO operation_task (name, entity_id, type, status, created_at, updated_at, metadata, operation_id)
+ VALUES (?, ?, cast(? as task_type), cast(? as task_status), now(), now(), cast(? as jsonb), ?)
+ RETURNING %s;
+ """.formatted(FIELDS);
+ public static final String GET_ALL_QUERY = """
+ SELECT %s FROM operation_task
+ """.formatted(FIELDS);
+
+ //in first nested request we gather all tasks that either locked or free.
+ //in second nested request we filter result of previous request to get only free tasks
+ //and select only specific amount.
+ private static final String LOCK_PENDING_BATCH_QUERY = """
+ UPDATE operation_task
+ SET status = 'RUNNING', worker_id = ?, updated_at = now(), lease_till = now() + cast(? as interval)
+ WHERE id IN (
+ SELECT id
+ FROM operation_task
+ WHERE id IN (
+ SELECT DISTINCT ON (entity_id) id
+ FROM operation_task
+ WHERE status IN ('PENDING', 'RUNNING')
+ ORDER BY entity_id, id
+ ) AND status = 'PENDING'
+ LIMIT ?
+ )
+ RETURNING %s;
+ """.formatted(FIELDS);
+
+ private static final String RECAPTURE_OLD_TASKS_QUERY = """
+ UPDATE operation_task
+ SET updated_at = now(), lease_till = now() + cast(? as interval)
+ WHERE status = 'RUNNING' AND worker_id = ?
+ RETURNING %s;
+ """.formatted(FIELDS);
+
+ //in first nested request we select all tasks by entity_id that are either locked or free and take the first one.
+ //in second nested request we filter result of previous request to get only pending task with specific id.
+ //if we get the task then we lock it and return it.
+ private static final String TRY_LOCK_TASK_QUERY = """
+ UPDATE operation_task
+ SET status = 'RUNNING', worker_id = ?, updated_at = now(), lease_till = now() + cast(? as interval)
+ WHERE id IN (
+ SELECT id
+ FROM operation_task
+ WHERE id IN (
+ SELECT id
+ FROM operation_task
+ WHERE status IN ('PENDING', 'RUNNING') AND entity_id = ?
+ ORDER BY id
+ LIMIT 1
+ ) AND status = 'PENDING' AND id = ?
+ )
+ RETURNING %s;
+ """.formatted(FIELDS);
+
+ public static final String DELETE_QUERY = "DELETE FROM operation_task WHERE id = ?";
+ public static final String UPDATE_LEASE_QUERY = """
+ UPDATE operation_task
+ SET lease_till = now() + cast(? as interval)
+ WHERE id = ?
+ RETURNING %s
+ """.formatted(FIELDS);
+ private static final TypeReference