Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@ gem "uk_postcode"
# For structured logging
gem "lograge"

# For distributed tracing and telemetry
gem "opentelemetry-exporter-otlp", "~> 0.31.1"
gem "opentelemetry-instrumentation-all", "~> 0.89.1"
gem "opentelemetry-propagator-xray", "~> 0.26.0"
gem "opentelemetry-sdk", "~> 1.10"

# For AWS interactions
gem "aws-sdk-cloudwatch"
gem "aws-sdk-codepipeline", "~> 1.110"
Expand Down
265 changes: 265 additions & 0 deletions Gemfile.lock

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions app/controllers/application_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,16 @@ def set_request_logging_attributes
CurrentRequestLoggingAttributes.page_slug = params[:page_slug] if params[:page_slug].present?
CurrentRequestLoggingAttributes.session_id_hash = session_id_hash
CurrentRequestLoggingAttributes.trace_id = request.env["HTTP_X_AMZN_TRACE_ID"] if request.env["HTTP_X_AMZN_TRACE_ID"].present?

# Add same attributes to OpenTelemetry span for journey tracking
TelemetryService.set_request_attributes({
"session.id_hash" => session_id_hash,
"request.host" => request.host,
"request.id" => request.request_id,
"form.id" => params[:form_id],
"page.id" => params[:page_slug]&.match(Page::PAGE_ID_REGEX) ? params[:page_slug] : nil,
"page.slug" => params[:page_slug],
})
end

def log_rescued_exception(exception)
Expand Down
8 changes: 8 additions & 0 deletions app/controllers/forms/base_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,14 @@ def set_request_logging_attributes
super
CurrentRequestLoggingAttributes.form_name = @form.name
CurrentRequestLoggingAttributes.preview = mode.preview?

# Add form-level attributes to OpenTelemetry span
TelemetryService.set_request_attributes({
"form.name" => @form.name,
"form.slug" => @form.form_slug,
"mode.type" => mode.to_s,
"mode.preview" => mode.preview?,
})
end

private
Expand Down
3 changes: 3 additions & 0 deletions app/controllers/forms/page_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ def set_request_logging_attributes
super
CurrentRequestLoggingAttributes.question_number = @step.page_number if @step&.page_number
CurrentRequestLoggingAttributes.answer_type = @step&.page&.answer_type if @step&.page&.answer_type

# Add question-level attributes to OpenTelemetry span
TelemetryService.set_question_attributes(@step, @form) if @step && @form
end

def show
Expand Down
10 changes: 9 additions & 1 deletion app/lib/flow/context.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,15 @@ def initialize(form:, store:)
delegate :save_submission_details, :get_submission_reference, :requested_email_confirmation?, :clear_submission_details, to: :confirmation_details_store

def save_step(step, context: nil)
return false unless step.valid?(context)
is_valid = step.valid?(context)

if is_valid
TelemetryService.record_validation_success
else
TelemetryService.record_validation_failure(step)
end

return false unless is_valid

step.save_to_store(@answer_store)
end
Expand Down
25 changes: 20 additions & 5 deletions app/services/api/v2/form_document_repository.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,26 @@ class << self
def find(form_id:, tag:, language: :en)
raise ActiveResource::ResourceNotFound.new(404, "Not Found") unless form_id.to_s =~ /^[[:alnum:]]+$/

form_document = Api::V2::FormDocumentResource.get(form_id, tag, **options_for_language(language))
form = Form.new(form_document, true)
form.document_json = form_document
form.prefix_options = { form_id:, tag: }
form
TelemetryService.trace("api.forms_admin.fetch_form", attributes: {
"api.endpoint" => "#{Settings.forms_api.base_url}/api/v2/form/#{form_id}/#{tag}",
"api.method" => "GET",
"form.id" => form_id.to_s,
"form.tag" => tag.to_s,
"form.language" => language.to_s,
}) do |span|
form_document = Api::V2::FormDocumentResource.get(form_id, tag, **options_for_language(language))

span.set_attribute("api.response.status", 200)
span.set_attribute("form.name", form_document.name) if form_document.respond_to?(:name)

form = Form.new(form_document, true)
form.document_json = form_document
form.prefix_options = { form_id:, tag: }
form
end
rescue ActiveResource::ResourceNotFound => e
# Re-raise but let the span record the error
raise
end

def find_with_mode(form_id:, mode:, language: :en)
Expand Down
79 changes: 49 additions & 30 deletions app/services/form_submission_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,22 @@ def initialize(current_context:, email_confirmation_input:, mode:)
end

def submit
ensure_form_english
validate_submission

confirmation_mail = setup_confirmation_email if requested_confirmation?
deliver_submission
send_confirmation_email(confirmation_mail) if confirmation_mail.present?

submission_reference
TelemetryService.trace("form.submission.process", attributes: {
"submission.type" => form.submission_type,
"submission.format" => form.submission_format,
"submission.reference" => submission_reference,
"form.id" => form.id.to_s,
"confirmation.requested" => requested_confirmation?,
}) do
ensure_form_english
validate_submission

confirmation_mail = setup_confirmation_email if requested_confirmation?
deliver_submission
send_confirmation_email(confirmation_mail) if confirmation_mail.present?

submission_reference
end
end

private
Expand Down Expand Up @@ -71,31 +79,42 @@ def deliver_submission
end

def deliver_submission_via_s3
s3_submission_service = S3SubmissionService.new(
journey: current_context.journey,
form: form,
timestamp: timestamp,
submission_reference: submission_reference,
is_preview: mode.preview?,
)

s3_submission_service.submit
TelemetryService.trace("form.submission.deliver_s3", attributes: {
"submission.reference" => submission_reference,
"submission.format" => form.submission_format,
"form.id" => form.id.to_s,
}) do
s3_submission_service = S3SubmissionService.new(
journey: current_context.journey,
form: form,
timestamp: timestamp,
submission_reference: submission_reference,
is_preview: mode.preview?,
)

s3_submission_service.submit
end
end

def deliver_submission_via_email
submission = Submission.create!(
reference: submission_reference,
form_id: form.id,
answers: current_context.answers,
mode: mode,
form_document: form.document_json,
)

SendSubmissionJob.perform_later(submission) do |job|
unless job.successfully_enqueued?
submission.destroy!
message_suffix = ": #{job.enqueue_error&.message}" if job.enqueue_error
raise StandardError, "Failed to enqueue submission for reference #{submission_reference}#{message_suffix}"
TelemetryService.trace("form.submission.deliver_email", attributes: {
"submission.reference" => submission_reference,
"form.id" => form.id.to_s,
}) do
submission = Submission.create!(
reference: submission_reference,
form_id: form.id,
answers: current_context.answers,
mode: mode,
form_document: form.document_json,
)

SendSubmissionJob.perform_later(submission) do |job|
unless job.successfully_enqueued?
submission.destroy!
message_suffix = ": #{job.enqueue_error&.message}" if job.enqueue_error
raise StandardError, "Failed to enqueue submission for reference #{submission_reference}#{message_suffix}"
end
end
end
end
Expand Down
47 changes: 30 additions & 17 deletions app/services/s3_submission_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,36 @@ def submit
raise StandardError, "S3 bucket account ID is not set on the form" if @form.s3_bucket_aws_account_id.nil?
raise StandardError, "S3 bucket region is not set on the form" if @form.s3_bucket_region.nil?

# We send the uploaded files before the submissions CSV so that processors can have automations run when the CSV
# file arrives and the referenced files will already be present
copy_uploaded_files_to_bucket

submission_content, key =
case @form.submission_format
when %w[csv]
[generate_csv_submission, generate_key("form_submission.csv")]
when %w[json]
[generate_json_submission, generate_key("form_submission.json")]
else
raise StandardError, "Unsupported submission format: #{@form.submission_format.inspect}"
end

upload_submission_to_s3(submission_content, key)

delete_uploaded_files_from_our_bucket
file_count = @journey.completed_file_upload_questions.count

TelemetryService.trace("submission.s3.upload", attributes: {
"s3.bucket" => @form.s3_bucket_name,
"s3.region" => @form.s3_bucket_region,
"submission.format" => @form.submission_format.join(","),
"submission.reference" => @submission_reference,
"submission.file_count" => file_count,
}) do |span|
# We send the uploaded files before the submissions CSV so that processors can have automations run when the CSV
# file arrives and the referenced files will already be present
copy_uploaded_files_to_bucket

submission_content, key =
case @form.submission_format
when %w[csv]
[generate_csv_submission, generate_key("form_submission.csv")]
when %w[json]
[generate_json_submission, generate_key("form_submission.json")]
else
raise StandardError, "Unsupported submission format: #{@form.submission_format.inspect}"
end

span.set_attribute("submission.size_bytes", submission_content.bytesize)
span.set_attribute("s3.key", key)

upload_submission_to_s3(submission_content, key)

delete_uploaded_files_from_our_bucket
end
end

private
Expand Down
117 changes: 117 additions & 0 deletions app/services/telemetry_service.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
class TelemetryService
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we use the Rails.event framework? It's pretty new, so we might still have to roll-our-own, but it would at least be consistent with the future direction and maybe in future it would be supported out of the box by OpenTelemetry.

# OpenTelemetry tracing service for adding span attributes
# Follows the same pattern as CloudWatchService and LogEventService
#
# We are heavily using attributes, not events because X-Ray does not support events
# see: https://github.com/aws-observability/aws-otel-collector/issues/821

# Set request-level attributes for journey tracking
# Call from ApplicationController to add form/session context to all spans
def self.set_request_attributes(attrs)
return unless defined?(OpenTelemetry)

# Ensure all values are primitives (string, number, boolean, nil)
sanitized = attrs.compact.transform_values { |v| sanitize_attribute_value(v) }
current_span.add_attributes(sanitized.transform_keys(&:to_s))
rescue StandardError => e
Sentry.capture_exception(e) if defined?(Sentry)
end

# Set question-level attributes on page requests
# Call from PageController to add question context to all page spans
def self.set_question_attributes(step, form)
return unless defined?(OpenTelemetry)

attrs = {
"question.type" => step.question.class.name,
"question.id" => step.page_id,
"question.text" => step.question_text,
"question.answer_type" => step.page&.answer_type,
"question.number" => step.page_number,
"question.is_optional" => step.question.is_optional?,
"question.is_repeatable" => step.repeatable?,
"form.submission_type" => form.submission_type,
}.compact

sanitized = attrs.transform_values { |v| sanitize_attribute_value(v) }
current_span.add_attributes(sanitized)
rescue StandardError => e
Sentry.capture_exception(e) if defined?(Sentry)
end

def self.record_validation_failure(step)
return unless defined?(OpenTelemetry)

attrs = {
"validation.failed" => true,
"validation.error_count" => step.question.errors.count,
"validation.errors" => step.question.errors.full_messages.join(", "),
"validation.error_attributes" => step.question.errors.attribute_names.map(&:to_s).join(", "),
}

sanitized = attrs.transform_values { |v| sanitize_attribute_value(v) }
current_span.add_attributes(sanitized)
rescue StandardError => e
# Silently fail - don't break the app if telemetry has issues
Sentry.capture_exception(e) if defined?(Sentry)
end

def self.record_validation_success
return unless defined?(OpenTelemetry)

current_span.set_attribute("validation.passed", true)
rescue StandardError => e
Sentry.capture_exception(e) if defined?(Sentry)
end

# Create a custom span for wrapping important operations
# Usage: TelemetryService.trace('operation.name', attributes: {...}) { ... }
def self.trace(span_name, attributes: {}, &block)
return yield(NoOpSpan.new) unless defined?(OpenTelemetry)

# Get tracer
tracer = OpenTelemetry.tracer_provider.tracer("forms-runner")

# Sanitize attributes to ensure they're primitives
sanitized = attributes.compact.transform_values { |v| sanitize_attribute_value(v) }

tracer.in_span(span_name, attributes: sanitized, &block)
rescue StandardError => e
Sentry.capture_exception(e) if defined?(Sentry)
# If tracing fails, still execute the block with a no-op span
# This ensures business logic runs even if telemetry breaks
yield(NoOpSpan.new)
end

def self.current_span
OpenTelemetry::Trace.current_span
end
private_class_method :current_span

# Sanitize attribute values to ensure they're primitives (String, Integer, Float, Boolean)
# OpenTelemetry requires attribute values to be primitives, not complex objects
def self.sanitize_attribute_value(value)
case value
when String, Integer, Float, TrueClass, FalseClass, NilClass
value
when Array
value.join(", ")
else
value.to_s
end
end
private_class_method :sanitize_attribute_value

# No-op span that safely ignores all method calls
# Used as a fallback when tracing is disabled or fails
class NoOpSpan
def method_missing(_method_name, *_args, **_kwargs, &_block)
# Silently ignore all method calls (set_attribute, add_event, etc.)
nil
end

def respond_to_missing?(_method_name, _include_private = false)
true
end
end
end
12 changes: 12 additions & 0 deletions config/initializers/opentelemetry.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
require "opentelemetry/sdk"
require "opentelemetry/instrumentation/all"

return unless ENV["ENABLE_OTEL"] == "true"

OpenTelemetry::SDK.configure do |c|
instrumentation_config = { "OpenTelemetry::Instrumentation::Rack" => { untraced_endpoints: ["/up"] } }
c.use_all(instrumentation_config)

# Disable logging for Rake tasks to avoid cluttering output
c.logger = Logger.new(File::NULL) if Rails.const_defined?(:Rake) && Rake.application.top_level_tasks.any?
end