Skip to content

Commit dbe8b01

Browse files
authored
Merge pull request #79 from figma/assign_estimation_as_ddl
[CI] use chunk estimated duration as the timeout
2 parents ae3b2f1 + eaefc8c commit dbe8b01

File tree

2 files changed

+85
-16
lines changed

2 files changed

+85
-16
lines changed

ruby/lib/ci/queue/redis/worker.rb

Lines changed: 75 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,8 @@ def reserve
239239
end
240240

241241
def try_to_reserve_test
242-
eval_script(
242+
current_time = CI::Queue.time_now.to_f
243+
test_id = eval_script(
243244
:reserve,
244245
keys: [
245246
key('queue'),
@@ -249,11 +250,44 @@ def try_to_reserve_test
249250
key('owners'),
250251
key('test-group-timeout')
251252
],
252-
argv: [CI::Queue.time_now.to_f, 'true', config.timeout]
253+
argv: [current_time, 'true', config.timeout]
253254
)
255+
256+
if test_id
257+
# Check what timeout was used (dynamic or default)
258+
dynamic_timeout = redis.hget(key('test-group-timeout'), test_id)
259+
timeout_used = dynamic_timeout ? dynamic_timeout.to_f : config.timeout
260+
deadline = current_time + timeout_used
261+
gap_seconds = timeout_used
262+
gap_hours = (gap_seconds / 3600).to_i
263+
gap_mins = ((gap_seconds % 3600) / 60).to_i
264+
gap_secs = (gap_seconds % 60)
265+
266+
current_time_readable = Time.at(current_time).strftime('%Y-%m-%d %H:%M:%S')
267+
deadline_readable = Time.at(deadline).strftime('%Y-%m-%d %H:%M:%S')
268+
269+
# Format gap_seconds to 2 decimal places, and gap_secs for the formatted version
270+
gap_seconds_formatted = format('%.2f', gap_seconds)
271+
gap_secs_formatted = gap_secs < 60 ? format('%.2f', gap_secs) : gap_secs.to_i.to_s
272+
273+
# Add details about how timeout was computed
274+
timeout_details = if dynamic_timeout
275+
# For chunks, calculate back to estimated_duration (without buffer)
276+
estimated_duration_ms = (dynamic_timeout.to_f / 1.1 * 1000).round
277+
estimated_duration_seconds = estimated_duration_ms / 1000.0
278+
"dynamic_timeout=#{dynamic_timeout.to_f}s (estimated_duration=#{estimated_duration_seconds}s * 1.1 buffer)"
279+
else
280+
"default_timeout=#{config.timeout}s"
281+
end
282+
283+
warn "[reserve] test=#{test_id} current_time=#{current_time_readable} (#{current_time}) deadline=#{deadline_readable} (#{deadline}) gap=#{gap_seconds_formatted}s (#{gap_hours}h#{gap_mins}m#{gap_secs_formatted}s) [#{timeout_details}]"
284+
end
285+
286+
test_id
254287
end
255288

256289
def try_to_reserve_lost_test
290+
current_time = CI::Queue.time_now.to_f
257291
lost_test = eval_script(
258292
:reserve_lost,
259293
keys: [
@@ -263,9 +297,39 @@ def try_to_reserve_lost_test
263297
key('owners'),
264298
key('test-group-timeout')
265299
],
266-
argv: [CI::Queue.time_now.to_f, timeout, 'true', config.timeout]
300+
argv: [current_time, timeout, 'true', config.timeout]
267301
)
268302

303+
if lost_test
304+
# Check what timeout was used (dynamic or default)
305+
dynamic_timeout = redis.hget(key('test-group-timeout'), lost_test)
306+
timeout_used = dynamic_timeout ? dynamic_timeout.to_f : config.timeout
307+
deadline = current_time + timeout_used
308+
gap_seconds = timeout_used
309+
gap_hours = (gap_seconds / 3600).to_i
310+
gap_mins = ((gap_seconds % 3600) / 60).to_i
311+
gap_secs = (gap_seconds % 60)
312+
313+
current_time_readable = Time.at(current_time).strftime('%Y-%m-%d %H:%M:%S')
314+
deadline_readable = Time.at(deadline).strftime('%Y-%m-%d %H:%M:%S')
315+
316+
# Format gap_seconds to 2 decimal places, and gap_secs for the formatted version
317+
gap_seconds_formatted = format('%.2f', gap_seconds)
318+
gap_secs_formatted = gap_secs < 60 ? format('%.2f', gap_secs) : gap_secs.to_i.to_s
319+
320+
# Add details about how timeout was computed
321+
timeout_details = if dynamic_timeout
322+
# For chunks, calculate back to estimated_duration (without buffer)
323+
estimated_duration_ms = (dynamic_timeout.to_f / 1.1 * 1000).round
324+
estimated_duration_seconds = estimated_duration_ms / 1000.0
325+
"dynamic_timeout=#{dynamic_timeout.to_f}s (estimated_duration=#{estimated_duration_seconds}s * 1.1 buffer)"
326+
else
327+
"default_timeout=#{config.timeout}s"
328+
end
329+
330+
warn "[reserve_lost] test=#{lost_test} current_time=#{current_time_readable} (#{current_time}) deadline=#{deadline_readable} (#{deadline}) gap=#{gap_seconds_formatted}s (#{gap_hours}h#{gap_mins}m#{gap_secs_formatted}s) [#{timeout_details}]"
331+
end
332+
269333
if lost_test.nil? && idle?
270334
puts "Worker #{worker_id} could not reserve a lost test while idle"
271335
puts 'Printing running tests:'
@@ -346,9 +410,14 @@ def store_chunk_metadata(chunks)
346410
transaction.sadd(key('chunks'), chunk.id)
347411

348412
# Store dynamic timeout for this chunk
349-
# Timeout = default_timeout * number_of_tests
350-
chunk_timeout = config.timeout * chunk.test_count
351-
transaction.hset(key('test-group-timeout'), chunk.id, chunk_timeout)
413+
# Timeout = estimated_duration (in ms) converted to seconds + buffer
414+
# estimated_duration is in milliseconds, convert to seconds and add 10% buffer
415+
buffer_percent = 10
416+
estimated_duration_seconds = chunk.estimated_duration / 1000.0
417+
chunk_timeout = (estimated_duration_seconds * (1 + buffer_percent / 100.0)).round(2)
418+
# Format to string to avoid floating point precision issues in Redis
419+
# Use %g to remove trailing zeros
420+
transaction.hset(key('test-group-timeout'), chunk.id, format('%g', chunk_timeout))
352421
end
353422
transaction.expire(key('chunks'), config.redis_ttl)
354423
transaction.expire(key('test-group-timeout'), config.redis_ttl)

ruby/test/ci/queue/redis/dynamic_timeout_test.rb

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,10 @@ def test_chunk_timeout_stored_in_redis_hash
3737
end
3838

3939
# Verify timeout was stored in test-group-timeout hash
40-
# Timeout should be: default_timeout (30s) * number_of_tests (3) = 90s
40+
# Timeout should be: estimated_duration (5000ms = 5s) * 1.1 buffer = 5.5s
4141
chunk_timeout = @redis.hget('build:42:test-group-timeout', 'TestA:chunk_0')
4242
refute_nil chunk_timeout
43-
assert_equal '90', chunk_timeout
43+
assert_equal '5.5', chunk_timeout
4444
end
4545

4646
def test_chunk_timeout_scales_with_test_count
@@ -55,7 +55,7 @@ def test_chunk_timeout_scales_with_test_count
5555
end
5656

5757
small_timeout = @redis.hget('build:42:test-group-timeout', 'SmallSuite:chunk_0')
58-
assert_equal '150', small_timeout # 30s * 5 tests
58+
assert_equal '1.1', small_timeout # 1000ms = 1s * 1.1 buffer = 1.1s
5959

6060
@redis.flushdb
6161

@@ -70,7 +70,7 @@ def test_chunk_timeout_scales_with_test_count
7070
end
7171

7272
large_timeout = @redis.hget('build:42:test-group-timeout', 'LargeSuite:chunk_0')
73-
assert_equal '600', large_timeout # 30s * 20 tests
73+
assert_equal '5.5', large_timeout # 5000ms = 5s * 1.1 buffer = 5.5s
7474
end
7575

7676
def test_multiple_chunks_stored_with_different_timeouts
@@ -92,9 +92,9 @@ def test_multiple_chunks_stored_with_different_timeouts
9292
end
9393

9494
# Verify each chunk has correct timeout
95-
assert_equal '60', @redis.hget('build:42:test-group-timeout', 'TestA:chunk_0') # 30s * 2
96-
assert_equal '90', @redis.hget('build:42:test-group-timeout', 'TestB:chunk_0') # 30s * 3
97-
assert_equal '30', @redis.hget('build:42:test-group-timeout', 'TestC:chunk_0') # 30s * 1
95+
assert_equal '2.2', @redis.hget('build:42:test-group-timeout', 'TestA:chunk_0') # 2000ms = 2s * 1.1 buffer = 2.2s
96+
assert_equal '3.3', @redis.hget('build:42:test-group-timeout', 'TestB:chunk_0') # 3000ms = 3s * 1.1 buffer = 3.3s
97+
assert_equal '1.1', @redis.hget('build:42:test-group-timeout', 'TestC:chunk_0') # 1000ms = 1s * 1.1 buffer = 1.1s
9898
end
9999

100100
def test_timeout_hash_has_ttl
@@ -145,8 +145,8 @@ def test_mixed_chunks_and_tests_only_chunks_have_timeouts
145145
end
146146

147147
# Chunks should have timeouts
148-
assert_equal '60', @redis.hget('build:42:test-group-timeout', 'TestA:chunk_0')
149-
assert_equal '90', @redis.hget('build:42:test-group-timeout', 'TestC:chunk_0')
148+
assert_equal '2.2', @redis.hget('build:42:test-group-timeout', 'TestA:chunk_0') # 2000ms = 2s * 1.1 buffer = 2.2s
149+
assert_equal '3.3', @redis.hget('build:42:test-group-timeout', 'TestC:chunk_0') # 3000ms = 3s * 1.1 buffer = 3.3s
150150

151151
# Individual test should not
152152
assert_nil @redis.hget('build:42:test-group-timeout', 'TestB#test_1')
@@ -296,7 +296,7 @@ def test_batching_with_many_chunks
296296
chunks.each do |chunk|
297297
timeout = @redis.hget('build:42:test-group-timeout', chunk.id)
298298
refute_nil timeout, "Chunk #{chunk.id} should have timeout stored"
299-
assert_equal '30', timeout # 30s * 1 test
299+
assert_equal '1.1', timeout # 1000ms = 1s * 1.1 buffer = 1.1s
300300
end
301301
end
302302

0 commit comments

Comments
 (0)