Skip to content

Commit bd6d0d7

Browse files
Merge pull request #111 from ethscriptions-protocol/transient_validation_errors
Better handlng of transient validation errors
2 parents c70b9b8 + d36d05d commit bd6d0d7

File tree

6 files changed

+203
-71
lines changed

6 files changed

+203
-71
lines changed

app/jobs/validation_job.rb

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,27 @@
11
class ValidationJob < ApplicationJob
22
queue_as :validation
33

4-
# Retry up to 3 times with fixed delays to avoid the exponentially_longer issue
5-
retry_on StandardError, wait: 5.seconds, attempts: 3
4+
# Import TransientValidationError from BlockValidator
5+
TransientValidationError = BlockValidator::TransientValidationError
6+
7+
# Only retry transient errors, not all StandardError
8+
retry_on TransientValidationError,
9+
wait: ENV.fetch('VALIDATION_RETRY_WAIT_SECONDS', 5).to_i.seconds,
10+
attempts: ENV.fetch('VALIDATION_TRANSIENT_RETRIES', 5).to_i
611

712
def perform(l1_block_number, l2_block_hashes)
813
start_time = Time.current
914

10-
begin
11-
# Use the unified ValidationResult model to validate and save
12-
validation_result = ValidationResult.validate_and_save(l1_block_number, l2_block_hashes)
13-
14-
elapsed_time = Time.current - start_time
15-
Rails.logger.info "ValidationJob: Block #{l1_block_number} validation completed in #{elapsed_time.round(3)}s"
16-
17-
rescue => e
18-
Rails.logger.error "ValidationJob: Validation failed for block #{l1_block_number}: #{e.message}"
15+
# ValidationResult.validate_and_save will:
16+
# 1. Create ValidationResult with success: true (job succeeds)
17+
# 2. Create ValidationResult with success: false (job succeeds - real validation failure found)
18+
# 3. Raise TransientValidationError (job retries via retry_on, then fails if exhausted)
19+
ValidationResult.validate_and_save(l1_block_number, l2_block_hashes)
1920

20-
# Validation failure will be detected by import process via database query
21+
elapsed_time = Time.current - start_time
22+
Rails.logger.info "ValidationJob: Block #{l1_block_number} validation completed in #{elapsed_time.round(3)}s"
2123

22-
# Re-raise to trigger retry mechanism
23-
raise e
24-
end
24+
# Job completes successfully for cases 1 & 2
25+
# TransientValidationError will be handled by retry_on automatically
2526
end
26-
end
27+
end

app/models/validation_result.rb

Lines changed: 41 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,14 @@ def self.validation_gaps(start_block, end_block)
3232
connection.execute(sql, [start_block, end_block]).map { |row| row['missing_block'] }
3333
end
3434

35+
# Faster method that just counts gaps without listing them all
36+
def self.validation_gap_count(start_block, end_block)
37+
# Count how many blocks are missing in the range
38+
expected_count = end_block - start_block + 1
39+
validated_count = where(l1_block: start_block..end_block).count
40+
expected_count - validated_count
41+
end
42+
3543
def self.validation_stats(since: 1.hour.ago)
3644
results = where('validated_at >= ?', since)
3745
total = results.count
@@ -60,16 +68,17 @@ def self.validate_and_save(l1_block_number, l2_block_hashes)
6068
start_time = Time.current
6169
block_result = validator.validate_l1_block(l1_block_number, l2_block_hashes)
6270

63-
# Store comprehensive validation result with full debugging data
64-
validation_result = create_or_find_by(l1_block: l1_block_number) do |vr|
65-
vr.success = block_result.success
66-
vr.error_details = block_result.errors
67-
vr.validation_stats = {
71+
# Find or initialize - idempotent for re-runs
72+
validation_result = find_or_initialize_by(l1_block: l1_block_number)
73+
74+
validation_result.assign_attributes(
75+
success: block_result.success,
76+
error_details: block_result.errors,
77+
validation_stats: {
6878
# Basic stats
6979
success: block_result.success,
7080
l1_block: l1_block_number,
7181
l2_blocks: l2_block_hashes,
72-
validated_at: Time.current,
7382

7483
# Detailed comparison data
7584
validation_details: block_result.stats,
@@ -80,26 +89,41 @@ def self.validate_and_save(l1_block_number, l2_block_hashes)
8089

8190
# Timing info
8291
validation_duration_ms: ((Time.current - start_time) * 1000).round(2)
83-
}
84-
vr.validated_at = Time.current
85-
end
92+
},
93+
validated_at: Time.current
94+
)
95+
96+
validation_result.save!
8697

8798
# Log the result
8899
validation_result.log_summary
89100

90101
validation_result
102+
rescue BlockValidator::TransientValidationError => e
103+
# Don't persist transient errors - let ValidationJob handle retries
104+
Rails.logger.debug "ValidationResult: Transient error for block #{l1_block_number}: #{e.message}"
105+
raise e
91106
rescue => e
92107
Rails.logger.error "ValidationResult: Exception validating block #{l1_block_number}: #{e.message}"
93108

94-
# Record the validation error
95-
error_result = create_or_find_by(l1_block: l1_block_number) do |vr|
96-
vr.success = false
97-
vr.error_details = [e.message]
98-
vr.validation_stats = { exception: true, exception_class: e.class.name }
99-
vr.validated_at = Time.current
100-
end
109+
# Only persist non-transient validation errors - idempotent for re-runs
110+
validation_result = find_or_initialize_by(l1_block: l1_block_number)
111+
112+
validation_result.assign_attributes(
113+
success: false,
114+
error_details: [e.message],
115+
validation_stats: {
116+
exception: true,
117+
exception_class: e.class.name,
118+
exception_message: e.message,
119+
exception_backtrace: e.backtrace&.first(10) # Store first 10 lines of backtrace
120+
},
121+
validated_at: Time.current
122+
)
101123

102-
error_result.log_summary
124+
validation_result.save!
125+
126+
validation_result.log_summary
103127
raise e
104128
end
105129
end

app/services/eth_block_importer.rb

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ class BlockNotReadyToImportError < StandardError; end
88
class ReorgDetectedError < StandardError; end
99
# Raised when validation failure is detected (should stop system permanently)
1010
class ValidationFailureError < StandardError; end
11+
# Raised when validation is too far behind (should wait and retry)
12+
class ValidationStalledError < StandardError; end
1113

1214
attr_accessor :ethscriptions_block_cache, :ethereum_client, :eth_block_cache, :geth_driver, :prefetcher
1315

@@ -196,13 +198,20 @@ def import_blocks_until_done
196198

197199
begin
198200
loop do
199-
# Check for validation failures before importing
200-
if validation_failure_detected?
201+
# Check for validation failures
202+
if real_validation_failure_detected?
201203
failed_block = get_validation_failure_block
202204
logger.error "Import stopped due to validation failure at block #{failed_block}"
203205
raise ValidationFailureError.new("Validation failure detected at block #{failed_block}")
204206
end
205207

208+
# Check if validation is stalled
209+
if validation_stalled?
210+
current_position = current_max_eth_block_number
211+
logger.warn "Import paused - validation is behind (current: #{current_position})"
212+
raise ValidationStalledError.new("Validation stalled - waiting for validation to catch up")
213+
end
214+
206215
block_number = next_block_to_import
207216

208217
if block_number.nil?
@@ -409,18 +418,42 @@ def geth_driver
409418
@geth_driver
410419
end
411420

412-
def validation_failure_detected?
413-
# Only consider failures BEHIND current import position as critical
421+
def get_validation_failure_block
422+
# Get the earliest failed block that's behind current import (for real failures)
423+
current_position = current_max_eth_block_number
424+
ValidationResult.failed.where('l1_block <= ?', current_position).order(:l1_block).first&.l1_block
425+
end
426+
427+
def real_validation_failure_detected?
428+
# Only consider real validation failures BEHIND current import position as critical
414429
current_position = current_max_eth_block_number
415430
ValidationResult.failed.where('l1_block <= ?', current_position).exists?
416431
end
417432

418-
def get_validation_failure_block
419-
# Get the earliest failed block that's behind current import
433+
def validation_stalled?
434+
return false unless ENV.fetch('VALIDATION_ENABLED').casecmp?('true')
435+
420436
current_position = current_max_eth_block_number
421-
ValidationResult.failed.where('l1_block <= ?', current_position).order(:l1_block).first&.l1_block
437+
hard_limit = ENV.fetch('VALIDATION_LAG_HARD_LIMIT', 30).to_i
438+
439+
# Check for validation gaps in recent blocks
440+
# This covers BOTH sequential lag and scattered gaps
441+
check_range_start = [current_position - hard_limit + 1, 1].max # Last N blocks
442+
check_range_end = current_position
443+
444+
# Count ALL missing validations in critical range (includes both gaps and lag)
445+
gap_count = ValidationResult.validation_gap_count(check_range_start, check_range_end)
446+
447+
if gap_count >= hard_limit
448+
Rails.logger.error "Too many validation gaps: #{gap_count} unvalidated blocks in range #{check_range_start}-#{check_range_end} (limit: #{hard_limit})"
449+
return true
450+
end
451+
452+
false
422453
end
423454

455+
public
456+
424457
def cleanup_stale_validation_records
425458
# Remove validation records AND pending jobs ahead of our starting position
426459
# These are from previous runs and may be stale due to reorgs

config/derive_ethscriptions_blocks.rb

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,13 @@ module Clockwork
187187
puts "[#{Time.now}] Fix the validation issue and restart manually"
188188
exit 1
189189

190+
rescue EthBlockImporter::ValidationStalledError => e
191+
# Validation is behind - wait longer and keep trying
192+
Rails.logger.info "[#{Time.now}] ⏸️ VALIDATION BEHIND: #{e.message}"
193+
puts "[#{Time.now}] ⏸️ Validation is behind - waiting #{import_interval * 2}s for validation to catch up..."
194+
sleep import_interval * 2 # Wait longer when validation is behind
195+
# Don't exit - continue the loop to retry
196+
190197
rescue => e
191198
Rails.logger.error "Import error: #{e.class} - #{e.message}"
192199
Rails.logger.error e.backtrace.first(20).join("\n")

lib/block_validator.rb

Lines changed: 38 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
class BlockValidator
22
attr_reader :errors, :stats
33

4+
# Exception for transient errors that should trigger retries
5+
class TransientValidationError < StandardError; end
6+
47
def initialize
58
# Initialize validation state
69
reset_validation_state
@@ -29,7 +32,7 @@ def validate_l1_block(l1_block_number, l2_block_hashes)
2932
verify_storage_state(expected, l1_block_number, historical_block_tag)
3033

3134
# Build comprehensive result with full debugging data
32-
success = @errors.empty? && !@incomplete_actual && !expected[:api_unavailable]
35+
success = @errors.empty?
3336
validation_duration = Time.current - validation_start_time
3437

3538
# Return comprehensive debugging information
@@ -45,8 +48,6 @@ def validate_l1_block(l1_block_number, l2_block_hashes)
4548
actual_transfers: Array(actual_events[:transfers]).size,
4649
storage_checks: @storage_checks_performed.value,
4750
errors_count: @errors.size,
48-
api_unavailable: expected[:api_unavailable] ? true : false,
49-
incomplete_actual: @incomplete_actual,
5051

5152
# L1 to L2 block mapping
5253
l1_to_l2_mapping: {
@@ -64,8 +65,6 @@ def validate_l1_block(l1_block_number, l2_block_hashes)
6465
raw_expected_data: {
6566
creations: sanitize_for_json(expected[:creations] || []),
6667
transfers: sanitize_for_json(expected[:transfers] || []),
67-
api_available: !expected[:api_unavailable],
68-
api_error: expected[:api_error]
6968
},
7069

7170
raw_actual_data: {
@@ -94,7 +93,6 @@ def validate_l1_block(l1_block_number, l2_block_hashes)
9493
def reset_validation_state
9594
@errors = Concurrent::Array.new
9695
@storage_checks_performed = Concurrent::AtomicFixnum.new(0)
97-
@incomplete_actual = false
9896

9997
# Reset debugging instrumentation
10098
@debug_data = {
@@ -119,10 +117,15 @@ def load_genesis_transaction_hashes
119117

120118
def fetch_expected_data(l1_block_number)
121119
EthscriptionsApiClient.fetch_block_data(l1_block_number)
120+
rescue EthscriptionsApiClient::ApiUnavailableError => e
121+
# API unavailable after exhausting all retries - this is an infrastructure issue
122+
Rails.logger.warn "API unavailable for block #{l1_block_number}: #{e.message}"
123+
raise TransientValidationError, e.message
122124
rescue => e
123-
message = "Failed to fetch API data: #{e.message}"
125+
# Other unexpected errors - log and continue with empty data
126+
message = "Unexpected error fetching API data: #{e.message}"
124127
@errors << message
125-
{creations: [], transfers: [], api_unavailable: true, api_error: message}
128+
{creations: [], transfers: []}
126129
end
127130

128131
def aggregate_l2_events(block_hashes)
@@ -134,18 +137,24 @@ def aggregate_l2_events(block_hashes)
134137
begin
135138
receipts = EthRpcClient.l2.call('eth_getBlockReceipts', [block_hash])
136139
if receipts.nil?
137-
@errors << "No receipts returned for L2 block #{block_hash}"
138-
@incomplete_actual = true
139-
raise "No receipts returned for L2 block #{block_hash}"
140+
error_msg = "No receipts returned for L2 block #{block_hash}"
141+
@errors << error_msg
142+
# Treat missing receipts as potentially transient
143+
raise TransientValidationError, error_msg
140144
end
141145

142146
data = EventDecoder.decode_block_receipts(receipts)
143147
all_creations.concat(data[:creations])
144148
all_transfers.concat(data[:transfers]) # Ethscriptions protocol transfers
145149
rescue => e
146-
@errors << "Failed to get receipts for block #{block_hash}: #{e.message}"
147-
@incomplete_actual = true
148-
raise "Failed to get receipts for block #{block_hash}: #{e.message}"
150+
error_msg = "Failed to get receipts for block #{block_hash}: #{e.message}"
151+
@errors << error_msg
152+
# Classify L2 receipt fetch errors - network issues are transient
153+
if transient_error?(e)
154+
raise TransientValidationError, error_msg
155+
else
156+
raise
157+
end
149158
end
150159
end
151160

@@ -561,6 +570,21 @@ def safe_content_preview(content, length: 50)
561570
preview + (content.length > length ? "..." : "")
562571
end
563572

573+
# Classify L2 RPC errors as transient (infrastructure) vs permanent (logic)
574+
def transient_error?(error)
575+
case error
576+
# L2 RPC network errors
577+
when SocketError, Errno::ECONNREFUSED, Errno::ECONNRESET,
578+
Net::OpenTimeout, Net::ReadTimeout, Net::TimeoutError
579+
true
580+
# L2 RPC errors that might be transient
581+
when EthRpcClient::HttpError, EthRpcClient::ApiError
582+
true
583+
else
584+
false
585+
end
586+
end
587+
564588
# Sanitize data structures for JSON serialization
565589
def sanitize_for_json(data)
566590
case data

0 commit comments

Comments
 (0)