add fair semaphore

sanderegg · sanderegg · commit 3be98cd741c9 · 2025-09-21T18:35:00.000+02:00
diff --git a/packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/acquire_fair_semaphore_v2.lua
@@ -0,0 +1,55 @@
+-- Fair distributed semaphore backed by a token pool (Redis LIST)
+-- KEYS[1]: tokens_key (LIST of available tokens)
+-- KEYS[2]: holders_key (SET of current holder instance IDs)
+-- KEYS[3]: holder_key (individual holder TTL key for this instance)
+-- ARGV[1]: instance_id
+-- ARGV[2]: capacity (max concurrent holders)
+-- ARGV[3]: ttl_seconds
+-- ARGV[4]: timeout_seconds (accepted for compatibility, ignored: see Step 2)
+--
+-- Returns: {exit_code, status, token, current_count}
+-- exit_code: 0 if acquired, 255 if no token was available
+-- status: 'acquired' or 'timeout'
+
+local tokens_key = KEYS[1]
+local holders_key = KEYS[2]
+local holder_key = KEYS[3]
+
+local instance_id = ARGV[1]
+local capacity = tonumber(ARGV[2])
+local ttl_seconds = tonumber(ARGV[3])
+
+-- Step 1: (Re)build the token pool when the list key is absent.
+-- Redis deletes a LIST once its last element is popped, so a missing key
+-- also means "every token is taken" (or the list TTL elapsed). Only the
+-- tokens not accounted for by registered holders are pushed; pushing
+-- `capacity` tokens unconditionally would exceed the semaphore capacity.
+if redis.call('EXISTS', tokens_key) == 0 then
+    local held = redis.call('SCARD', holders_key)
+    for i = 1, capacity - held do
+        redis.call('LPUSH', tokens_key, 'token_' .. i)
+    end
+    -- Expire the list so an unused semaphore does not linger forever
+    -- (no-op when nothing was pushed and the key still does not exist).
+    redis.call('EXPIRE', tokens_key, ttl_seconds * 10)
+end
+
+-- Step 2: Take a token without blocking.
+-- Blocking commands never block inside a script (BRPOP returns nil at once
+-- on an empty list), so RPOP is used; callers wanting to wait must retry.
+local token = redis.call('RPOP', tokens_key)
+
+if not token then
+    -- Pool is empty: report the same 'timeout' outcome callers expect
+    return {255, 'timeout', '', redis.call('SCARD', holders_key)}
+end
+
+-- Step 3: Register as holder; the holder key remembers the token and
+-- carries the lease TTL
+redis.call('SADD', holders_key, instance_id)
+redis.call('SETEX', holder_key, ttl_seconds, token)
+
+-- Step 4: Set expiry on holders set to prevent infinite growth
+redis.call('EXPIRE', holders_key, ttl_seconds * 10)
+
+return {0, 'acquired', token, redis.call('SCARD', holders_key)}
diff --git a/packages/service-library/src/servicelib/redis/lua/cleanup_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/cleanup_fair_semaphore_v2.lua
@@ -0,0 +1,66 @@
+-- Cleanup orphaned tokens from crashed clients
+-- KEYS[1]: tokens_key (LIST of available tokens)
+-- KEYS[2]: holders_key (SET of current holders)
+-- KEYS[3]: holder_prefix (prefix for holder keys, e.g. "semaphores:holders:key:")
+--          (each holder's TTL key is read as holder_prefix .. holder_id)
+-- ARGV[1]: capacity (total semaphore capacity)
+-- ARGV[2]: expiry_seconds (optional; TTL re-applied to both structures,
+--          defaults to 3600 when omitted)
+--
+-- Returns: {recovered_tokens, current_holders, available_tokens, total_cleaned}
+-- recovered_tokens counts every token pushed back (orphaned + missing);
+-- total_cleaned counts the holders dropped from the SET.
+-- This script should be run periodically to recover tokens from crashed clients
+
+local tokens_key = KEYS[1]
+local holders_key = KEYS[2]
+local holder_prefix = KEYS[3]
+
+local capacity = tonumber(ARGV[1])
+-- A caller-supplied TTL keeps this script from overriding a longer expiry
+-- chosen elsewhere; the historical 1 hour value stays the default.
+local expiry_seconds = tonumber(ARGV[2]) or 3600
+
+-- Step 1: Drop holders whose TTL key is gone and hand their token back
+local recovered_tokens = 0
+local cleaned_count = 0
+for _, holder_id in ipairs(redis.call('SMEMBERS', holders_key)) do
+    if redis.call('EXISTS', holder_prefix .. holder_id) == 0 then
+        -- Listed in the SET without a live TTL key: treated as a crashed
+        -- client, so unregister it and recover its token
+        redis.call('SREM', holders_key, holder_id)
+        redis.call('LPUSH', tokens_key, 'token_recovered_' .. holder_id)
+        recovered_tokens = recovered_tokens + 1
+        cleaned_count = cleaned_count + 1
+    end
+end
+
+-- Step 2: Reconcile the pool so that holders + available == capacity
+local remaining_holders = redis.call('SCARD', holders_key)
+local available_tokens_count = redis.call('LLEN', tokens_key)
+local total_tokens = remaining_holders + available_tokens_count
+
+-- Too few tokens (e.g. a holder was unregistered without its token): add them
+for i = 1, capacity - total_tokens do
+    redis.call('LPUSH', tokens_key, 'token_missing_' .. i)
+    recovered_tokens = recovered_tokens + 1
+end
+
+-- Too many tokens (shouldn't happen): trim the extras; the loop is empty
+-- when the difference is zero or negative
+for _ = 1, total_tokens - capacity do
+    redis.call('RPOP', tokens_key)
+end
+
+-- Step 3: Refresh expiry so structures still in use do not time out
+local final_holders = redis.call('SCARD', holders_key)
+local final_available = redis.call('LLEN', tokens_key)
+
+if final_holders > 0 then
+    redis.call('EXPIRE', holders_key, expiry_seconds)
+end
+if final_available > 0 then
+    redis.call('EXPIRE', tokens_key, expiry_seconds)
+end
+
+return {recovered_tokens, final_holders, final_available, cleaned_count}
diff --git a/packages/service-library/src/servicelib/redis/lua/count_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/count_fair_semaphore_v2.lua
@@ -0,0 +1,17 @@
+-- Report semaphore occupancy for the token pool design
+-- KEYS[1]: holders_key (SET of current holders)
+-- KEYS[2]: tokens_key (LIST of available tokens)
+-- ARGV[1]: capacity (total semaphore capacity)
+--
+-- Returns: {current_holders, available_tokens, total_capacity}
+-- current_holders: cardinality of the holders SET
+-- available_tokens: length of the tokens LIST
+-- total_capacity: ARGV[1] converted to a number and echoed back
+
+local holder_set, token_list = KEYS[1], KEYS[2]
+local total_capacity = tonumber(ARGV[1])
+
+local holder_total = redis.call('SCARD', holder_set)
+local free_total = redis.call('LLEN', token_list)
+
+return {holder_total, free_total, total_capacity}
diff --git a/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/release_fair_semaphore_v2.lua
@@ -0,0 +1,48 @@
+-- Release a fair-semaphore slot and hand its token back to the pool
+-- KEYS[1]: tokens_key (LIST of available tokens)
+-- KEYS[2]: holders_key (SET of current holders)
+-- KEYS[3]: holder_key (individual holder TTL key for this instance)
+-- ARGV[1]: instance_id
+--
+-- Returns: {exit_code, status, current_count}
+-- exit_code: 0 if released, 255 if failed
+-- status: 'released', 'not_held', or 'already_expired'
+--   'released': membership and holder key removed, token pushed back
+--   'not_held': instance not in the SET and no holder key present
+--   'already_expired': instance not in the SET; stray holder key deleted
+-- current_count: size of the holders SET after the call
+
+local pool_key = KEYS[1]
+local members_key = KEYS[2]
+local lease_key = KEYS[3]
+
+local caller_id = ARGV[1]
+
+-- Guard: an instance absent from the holders SET has nothing to release;
+-- the only work left is tidying up a leftover holder key, if any
+if redis.call('SISMEMBER', members_key, caller_id) == 0 then
+    local status = 'not_held'
+    if redis.call('EXISTS', lease_key) == 1 then
+        -- Holder key present without SET membership: delete it and
+        -- report the distinct status
+        redis.call('DEL', lease_key)
+        status = 'already_expired'
+    end
+    return {255, status, redis.call('SCARD', members_key)}
+end
+
+-- The holder key's value is the token to give back; a placeholder name
+-- is used when the key no longer exists
+local token = redis.call('GET', lease_key) or 'token_default'
+
+-- Unregister the holder and drop its holder key
+redis.call('SREM', members_key, caller_id)
+redis.call('DEL', lease_key)
+
+-- Push the token onto the pool LIST, making it available to the next
+-- acquirer
+redis.call('LPUSH', pool_key, token)
+
+local remaining = redis.call('SCARD', members_key)
+
+return {0, 'released', remaining}
diff --git a/packages/service-library/src/servicelib/redis/lua/renew_fair_semaphore_v2.lua b/packages/service-library/src/servicelib/redis/lua/renew_fair_semaphore_v2.lua
@@ -0,0 +1,40 @@
+-- Renew semaphore holder TTL and keep the holders SET alive
+-- KEYS[1]: holders_key (SET of current holders)
+-- KEYS[2]: holder_key (individual holder TTL key for this instance)
+-- ARGV[1]: instance_id
+-- ARGV[2]: ttl_seconds
+--
+-- Returns: {exit_code, status, current_count}
+-- exit_code: 0 if renewed, 255 if failed
+-- status: 'renewed', 'not_held', or 'expired'
+-- current_count: size of the holders SET after the call
+
+local holders_key = KEYS[1]
+local holder_key = KEYS[2]
+
+local instance_id = ARGV[1]
+local ttl_seconds = tonumber(ARGV[2])
+
+-- Step 1: Only a registered holder can renew
+if redis.call('SISMEMBER', holders_key, instance_id) == 0 then
+    return {255, 'not_held', redis.call('SCARD', holders_key)}
+end
+
+-- Step 2: Read the lease; GET yields false once the holder key has expired
+local token = redis.call('GET', holder_key)
+if not token then
+    -- Lease already lapsed: unregister the holder and fail the renewal
+    redis.call('SREM', holders_key, instance_id)
+    return {255, 'expired', redis.call('SCARD', holders_key)}
+end
+
+-- Step 3: Re-arm the lease TTL, keeping the stored token value
+redis.call('SETEX', holder_key, ttl_seconds, token)
+
+-- Step 4: Keep the holders SET alive too. Without this, a long-lived holder
+-- that renews for longer than the SET's own TTL would see the SET expire
+-- and its next renewal reported as 'not_held'. The 10x factor matches
+-- the expiry the acquire script applies to this SET.
+redis.call('EXPIRE', holders_key, ttl_seconds * 10)
+
+return {0, 'renewed', redis.call('SCARD', holders_key)}