Skip to content

Commit f290c68

Browse files
committed
Implement request cancellation handling to prevent unnecessary VM spin-up
1 parent 871c94c commit f290c68

File tree

4 files changed

+187
-4
lines changed

4 files changed

+187
-4
lines changed

Gemfile.lock

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ GEM
196196

197197
PLATFORMS
198198
arm64-darwin-22
199+
arm64-darwin-23
199200
universal-java-11
200201
universal-java-17
201202
x86_64-darwin-22

lib/vmpooler/pool_manager.rb

Lines changed: 62 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -161,16 +161,70 @@ def handle_timed_out_vm(vm, pool, redis)
161161
request_id = redis.hget("vmpooler__vm__#{vm}", 'request_id')
162162
pool_alias = redis.hget("vmpooler__vm__#{vm}", 'pool_alias') if request_id
163163
open_socket_error = redis.hget("vmpooler__vm__#{vm}", 'open_socket_error')
164+
clone_error = redis.hget("vmpooler__vm__#{vm}", 'clone_error')
165+
clone_error_class = redis.hget("vmpooler__vm__#{vm}", 'clone_error_class')
164166
redis.smove("vmpooler__pending__#{pool}", "vmpooler__completed__#{pool}", vm)
167+
165168
if request_id
166169
ondemandrequest_hash = redis.hgetall("vmpooler__odrequest__#{request_id}")
167170
if ondemandrequest_hash && ondemandrequest_hash['status'] != 'failed' && ondemandrequest_hash['status'] != 'deleted'
168-
# will retry a VM that did not come up as vm_ready? only if it has not been market failed or deleted
169-
redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool}:1:#{request_id}")
171+
# Check retry count and max retry limit before retrying
172+
retry_count = (redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count') || '0').to_i
173+
max_retries = $config[:config]['max_vm_retries'] || 3
174+
175+
# Determine if error is likely permanent (configuration issues)
176+
permanent_error = is_permanent_error?(clone_error, clone_error_class)
177+
178+
if retry_count < max_retries && !permanent_error
179+
# Increment retry count and retry VM creation
180+
redis.hset("vmpooler__odrequest__#{request_id}", 'retry_count', retry_count + 1)
181+
redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool}:1:#{request_id}")
182+
$logger.log('s', "[!] [#{pool}] '#{vm}' failed, retrying (attempt #{retry_count + 1}/#{max_retries})")
183+
else
184+
# Max retries exceeded or permanent error, mark request as permanently failed
185+
failure_reason = if permanent_error
186+
"Configuration error: #{clone_error}"
187+
else
188+
'Max retry attempts exceeded'
189+
end
190+
redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'failed')
191+
redis.hset("vmpooler__odrequest__#{request_id}", 'failure_reason', failure_reason)
192+
$logger.log('s', "[!] [#{pool}] '#{vm}' permanently failed: #{failure_reason}")
193+
$metrics.increment("errors.permanently_failed.#{pool}")
194+
end
170195
end
171196
end
172197
$metrics.increment("errors.markedasfailed.#{pool}")
173-
open_socket_error
198+
open_socket_error || clone_error
199+
end
200+
201+
# Determine if an error is likely permanent (configuration issue) vs transient
202+
def is_permanent_error?(error_message, error_class)
203+
return false if error_message.nil? || error_class.nil?
204+
205+
permanent_error_patterns = [
206+
/template.*not found/i,
207+
/template.*does not exist/i,
208+
/invalid.*path/i,
209+
/folder.*not found/i,
210+
/datastore.*not found/i,
211+
/resource pool.*not found/i,
212+
/permission.*denied/i,
213+
/authentication.*failed/i,
214+
/invalid.*credentials/i,
215+
/configuration.*error/i
216+
]
217+
218+
permanent_error_classes = [
219+
'ArgumentError',
220+
'NoMethodError',
221+
'NameError'
222+
]
223+
224+
# Check error message patterns
225+
permanent_error_patterns.any? { |pattern| error_message.match?(pattern) } ||
226+
# Check error class types
227+
permanent_error_classes.include?(error_class)
174228
end
175229

176230
def move_pending_vm_to_ready(vm, pool, redis, request_id = nil)
@@ -489,14 +543,18 @@ def _clone_vm(pool_name, provider, dns_plugin, request_id = nil, pool_alias = ni
489543

490544
dns_plugin_class_name = get_dns_plugin_class_name_for_pool(pool_name)
491545
dns_plugin.create_or_replace_record(new_vmname) unless dns_plugin_class_name == 'dynamic-dns'
492-
rescue StandardError
546+
rescue StandardError => e
547+
# Store error details for retry decision making
493548
@redis.with_metrics do |redis|
494549
redis.pipelined do |pipeline|
495550
pipeline.srem("vmpooler__pending__#{pool_name}", new_vmname)
551+
pipeline.hset("vmpooler__vm__#{new_vmname}", 'clone_error', e.message)
552+
pipeline.hset("vmpooler__vm__#{new_vmname}", 'clone_error_class', e.class.name)
496553
expiration_ttl = $config[:redis]['data_ttl'].to_i * 60 * 60
497554
pipeline.expire("vmpooler__vm__#{new_vmname}", expiration_ttl)
498555
end
499556
end
557+
$logger.log('s', "[!] [#{pool_name}] '#{new_vmname}' clone failed: #{e.class}: #{e.message}")
500558
raise
501559
ensure
502560
@redis.with_metrics do |redis|

spec/unit/pool_manager_spec.rb

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,123 @@
345345
end
346346
end
347347

348+
describe '#handle_timed_out_vm' do
349+
before do
350+
expect(subject).not_to be_nil
351+
end
352+
353+
before(:each) do
354+
redis_connection_pool.with do |redis|
355+
create_pending_vm(pool, vm, redis)
356+
config[:config]['max_vm_retries'] = 3
357+
end
358+
end
359+
360+
context 'without request_id' do
361+
it 'moves VM to completed queue and returns error' do
362+
redis_connection_pool.with do |redis|
363+
redis.hset("vmpooler__vm__#{vm}", 'open_socket_error', 'connection failed')
364+
result = subject.handle_timed_out_vm(vm, pool, redis)
365+
366+
expect(redis.sismember("vmpooler__pending__#{pool}", vm)).to be(false)
367+
expect(redis.sismember("vmpooler__completed__#{pool}", vm)).to be(true)
368+
expect(result).to eq('connection failed')
369+
end
370+
end
371+
end
372+
373+
context 'with request_id and transient error' do
374+
before(:each) do
375+
redis_connection_pool.with do |redis|
376+
redis.hset("vmpooler__vm__#{vm}", 'request_id', request_id)
377+
redis.hset("vmpooler__vm__#{vm}", 'pool_alias', pool)
378+
redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'pending')
379+
redis.hset("vmpooler__vm__#{vm}", 'clone_error', 'network timeout')
380+
redis.hset("vmpooler__vm__#{vm}", 'clone_error_class', 'Timeout::Error')
381+
end
382+
end
383+
384+
it 'retries on first failure' do
385+
redis_connection_pool.with do |redis|
386+
subject.handle_timed_out_vm(vm, pool, redis)
387+
388+
expect(redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count')).to eq('1')
389+
expect(redis.zrange('vmpooler__odcreate__task', 0, -1)).to include("#{pool}:#{pool}:1:#{request_id}")
390+
end
391+
end
392+
393+
it 'marks as failed after max retries' do
394+
redis_connection_pool.with do |redis|
395+
redis.hset("vmpooler__odrequest__#{request_id}", 'retry_count', '3')
396+
397+
subject.handle_timed_out_vm(vm, pool, redis)
398+
399+
expect(redis.hget("vmpooler__odrequest__#{request_id}", 'status')).to eq('failed')
400+
expect(redis.hget("vmpooler__odrequest__#{request_id}", 'failure_reason')).to eq('Max retry attempts exceeded')
401+
expect(redis.zrange('vmpooler__odcreate__task', 0, -1)).not_to include("#{pool}:#{pool}:1:#{request_id}")
402+
end
403+
end
404+
end
405+
406+
context 'with request_id and permanent error' do
407+
before(:each) do
408+
redis_connection_pool.with do |redis|
409+
redis.hset("vmpooler__vm__#{vm}", 'request_id', request_id)
410+
redis.hset("vmpooler__vm__#{vm}", 'pool_alias', pool)
411+
redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'pending')
412+
redis.hset("vmpooler__vm__#{vm}", 'clone_error', 'template not found')
413+
redis.hset("vmpooler__vm__#{vm}", 'clone_error_class', 'RuntimeError')
414+
end
415+
end
416+
417+
it 'immediately marks as failed without retrying' do
418+
redis_connection_pool.with do |redis|
419+
subject.handle_timed_out_vm(vm, pool, redis)
420+
421+
expect(redis.hget("vmpooler__odrequest__#{request_id}", 'status')).to eq('failed')
422+
expect(redis.hget("vmpooler__odrequest__#{request_id}", 'failure_reason')).to include('Configuration error')
423+
expect(redis.zrange('vmpooler__odcreate__task', 0, -1)).not_to include("#{pool}:#{pool}:1:#{request_id}")
424+
end
425+
end
426+
end
427+
end
428+
429+
describe '#is_permanent_error?' do
430+
before do
431+
expect(subject).not_to be_nil
432+
end
433+
434+
it 'identifies template not found errors as permanent' do
435+
expect(subject.is_permanent_error?('template not found', 'RuntimeError')).to be(true)
436+
end
437+
438+
it 'identifies invalid path errors as permanent' do
439+
expect(subject.is_permanent_error?('invalid path specified', 'ArgumentError')).to be(true)
440+
end
441+
442+
it 'identifies permission denied errors as permanent' do
443+
expect(subject.is_permanent_error?('permission denied', 'SecurityError')).to be(true)
444+
end
445+
446+
it 'identifies ArgumentError class as permanent' do
447+
expect(subject.is_permanent_error?('some argument error', 'ArgumentError')).to be(true)
448+
end
449+
450+
it 'identifies network errors as transient' do
451+
expect(subject.is_permanent_error?('connection timeout', 'Timeout::Error')).to be(false)
452+
end
453+
454+
it 'identifies socket errors as transient' do
455+
expect(subject.is_permanent_error?('connection refused', 'Errno::ECONNREFUSED')).to be(false)
456+
end
457+
458+
it 'returns false for nil inputs' do
459+
expect(subject.is_permanent_error?(nil, nil)).to be(false)
460+
expect(subject.is_permanent_error?('error', nil)).to be(false)
461+
expect(subject.is_permanent_error?(nil, 'Error')).to be(false)
462+
end
463+
end
464+
348465
describe '#move_pending_vm_to_ready' do
349466
let(:host) { { 'hostname' => vm }}
350467

vmpooler.yaml.example

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -456,6 +456,12 @@
456456
# How long (in minutes) before marking a clone in 'pending' queues as 'failed' and retrying.
457457
# (default: 15)
458458
#
459+
# - max_vm_retries
460+
# Maximum number of times to retry VM creation for a failed request before marking it as permanently failed.
461+
# This helps prevent infinite retry loops when there are configuration issues like invalid template paths.
462+
# Permanent errors (like invalid template paths) are detected and will not be retried.
463+
# (default: 3)
464+
#
459465
# - vm_checktime
460466
# How often (in minutes) to check the sanity of VMs in 'ready' queues.
461467
# (default: 1)
@@ -619,6 +625,7 @@
619625
vm_checktime: 1
620626
vm_lifetime: 12
621627
vm_lifetime_auth: 24
628+
max_vm_retries: 3
622629
allowed_tags:
623630
- 'created_by'
624631
- 'project'

0 commit comments

Comments
 (0)