Skip to content

Commit b4cc929

Browse files
committed
feat(metrics): implement UTF-8 URL handling and validations for LinkClick and PageView models
1 parent cbd75ba commit b4cc929

File tree

7 files changed

+407
-11
lines changed

7 files changed

+407
-11
lines changed

app/models/better_together/metrics/link_click.rb

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,42 @@
44
module BetterTogether
55
module Metrics
66
class LinkClick < ApplicationRecord # rubocop:todo Style/Documentation
7+
include Utf8UrlHandler
8+
79
# Validations
810
VALID_URL_SCHEMES = %w[http https tel mailto].freeze
911

10-
# Regular expression to match http, https, tel, and mailto URLs
11-
VALID_URL_REGEX = /\A(http|https|tel|mailto):.+\z/
12-
13-
validates :url, presence: true,
14-
format: { with: VALID_URL_REGEX, message: 'must be a valid URL or tel/mailto link' }
15-
validates :page_url, presence: true, format: URI::DEFAULT_PARSER.make_regexp(%w[http https])
12+
validates :url, presence: true
13+
validates :page_url, presence: true
1614
validates :locale, presence: true, inclusion: { in: I18n.available_locales.map(&:to_s) }
1715
validates :clicked_at, presence: true
1816
validates :internal, inclusion: { in: [true, false] }
17+
18+
# Custom validation for UTF-8 URL support
19+
validate :url_must_be_valid
20+
validate :page_url_must_be_valid
21+
22+
private
23+
24+
# Adds an error on :url unless the value is blank (presence is validated
# separately) or passes valid_utf8_url?.
def url_must_be_valid
  return if url.blank? || valid_utf8_url?(url)

  errors.add(:url, 'must be a valid URL or tel/mailto link')
end
31+
32+
# Lenient check for :page_url. A value passes when it is blank, or when
# safe_parse_uri can parse it and the result either has no scheme
# (for example a relative path) or uses http/https.
def page_url_must_be_valid
  return if page_url.blank?

  parsed = safe_parse_uri(page_url)
  scheme = parsed&.scheme
  acceptable = parsed && (scheme.nil? || %w[http https].include?(scheme.downcase))
  return if acceptable

  errors.add(:page_url, 'must be a valid URL')
end
1943
end
2044
end
2145
end

app/models/better_together/metrics/page_view.rb

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
module BetterTogether
55
module Metrics
66
class PageView < ApplicationRecord # rubocop:todo Style/Documentation
7+
include Utf8UrlHandler
8+
79
SENSITIVE_QUERY_PARAMS = %w[token password secret].freeze
810

911
belongs_to :pageable, polymorphic: true
@@ -34,11 +36,15 @@ def set_page_url # rubocop:todo Metrics/AbcSize, Metrics/MethodLength
3436

3537
return if url.blank?
3638

37-
uri = URI.parse(url)
38-
@page_url_query = uri.query
39-
self.page_url = uri.path
40-
rescue URI::InvalidURIError
41-
errors.add(:page_url, 'is invalid')
39+
# Use our UTF-8 safe URI parser
40+
uri = safe_parse_uri(url)
41+
if uri
42+
@page_url_query = uri.query
43+
self.page_url = uri.path
44+
else
45+
# If we can't parse it at all, add an error
46+
errors.add(:page_url, 'is invalid')
47+
end
4248
end
4349

4450
def page_url_without_sensitive_parameters
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
# frozen_string_literal: true
2+
3+
module BetterTogether
4+
module Metrics
5+
# Helper module for handling UTF-8 URLs in metrics models
6+
module Utf8UrlHandler
7+
extend ActiveSupport::Concern
8+
9+
private
10+
11+
# Parse a URL that may contain UTF-8 characters.
#
# The URL is first parsed as given. If Ruby's URI parser rejects it
# (it only accepts ASCII input), the URL is percent-encoded with
# encode_utf8_url and parsed once more.
#
# Every URI::Error yields nil, not only URI::InvalidURIError: scheme-specific
# classes can raise URI::InvalidComponentError while being built (for
# example URI::MailTo on "mailto:" followed by a malformed address), and
# that error is a sibling of InvalidURIError rather than a subclass of it.
#
# @param url [String] The URL to parse
# @return [URI::Generic, nil] Parsed URI, or nil if the URL is blank or invalid
def safe_parse_uri(url)
  return if url.blank?

  begin
    URI.parse(url)
  rescue URI::InvalidURIError
    # Retry once with the non-ASCII characters percent-encoded; a failure
    # here is handled by the method-level rescue below.
    URI.parse(encode_utf8_url(url))
  end
rescue URI::Error
  nil
end
30+
31+
# Percent-encode the non-ASCII characters of a URL while leaving its
# structure (scheme, "//", "/", "?", ...) untouched.
#
# A URL that starts with "scheme://" is divided exactly once, right after
# that prefix, and the remainder is handed to encode_host_component.
# Splitting on every "//" would discard everything after a second "//"
# (such as a nested URL inside the query string).
#
# @param url [String] The URL to encode
# @return [String] The URL with non-ASCII characters percent-encoded;
#   blank input is returned unchanged
def encode_utf8_url(url)
  return url if url.blank?

  prefix = url[%r{\A[a-z]+://}i]
  # No "scheme://" prefix: there is no authority to treat separately.
  return encode_utf8_component(url) unless prefix

  "#{prefix}#{encode_host_component(url.delete_prefix(prefix))}"
end
61+
62+
# Percent-encode every non-ASCII character of a URL component.
# ASCII characters pass through untouched.
#
# @param component [String] The URL component to encode
# @return [String] Encoded component (blank input is returned as-is)
def encode_utf8_component(component)
  if component.blank?
    component
  else
    component.gsub(/[^\x00-\x7F]/) { |non_ascii| CGI.escape(non_ascii) }
  end
end
71+
72+
# Percent-encode the non-ASCII characters of a "host[/path]" string.
#
# NOTE: this does not produce Punycode. Non-ASCII host labels are
# percent-encoded, which yields an ASCII-only string but is not the IDNA
# form of the domain.
#
# The host is encoded as a whole. Encoding it label by label through
# `host.split('.')` gives the same output for ordinary hosts but silently
# drops trailing empty labels, so a host such as "bücher.example." would
# lose its final dot.
#
# @param host_component [String] Host, optionally followed by "/" and a path
# @return [String] Encoded host component (blank input is returned as-is)
def encode_host_component(host_component)
  return host_component if host_component.blank?

  # A limit of 2 keeps any further "/" inside the path part.
  host, path = host_component.split('/', 2)
  encoded_host = encode_utf8_component(host)

  path ? "#{encoded_host}/#{encode_utf8_component(path)}" : encoded_host
end
113+
114+
# Validate if a URL is structurally valid for our purposes: it must parse
# (see safe_parse_uri), use an accepted scheme, and carry something after
# the scheme.
#
# The last check matters because URI.parse accepts a bare "https:", which
# would otherwise be recorded as a valid link.
#
# @param url [String] The URL to validate
# @return [Boolean] Whether the URL is valid
def valid_utf8_url?(url)
  return false if url.blank?

  uri = safe_parse_uri(url)
  return false unless uri
  return false unless uri.scheme.present?

  # For our metrics, we accept http, https, tel, mailto
  return false unless %w[http https tel mailto].include?(uri.scheme.downcase)

  # host covers "scheme://host" URLs, opaque covers tel:/mailto: links, and
  # path covers hierarchical URLs written without an authority.
  [uri.host, uri.opaque, uri.path].any?(&:present?)
end
130+
end
131+
end
132+
end
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# frozen_string_literal: true
2+
3+
require 'rails_helper'
4+
5+
RSpec.describe BetterTogether::Metrics::TrackLinkClickJob do
6+
describe 'UTF-8 URL handling' do
7+
let(:utf8_urls) do
8+
[
9+
'https://例え.テスト', # Japanese IDN
10+
'https://тест.рф', # Cyrillic IDN
11+
'https://example.com/café', # UTF-8 path
12+
'https://example.com/页面', # Chinese characters in path
13+
'https://bücher.example.com/straße', # German umlauts
14+
'https://пример.испытание/тест' # Full Cyrillic URL
15+
]
16+
end
17+
18+
let(:valid_params) do
19+
{
20+
page_url: 'https://example.com/test',
21+
locale: 'en',
22+
internal: false
23+
}
24+
end
25+
26+
it 'creates LinkClick records for UTF-8 URLs without errors' do
27+
utf8_urls.each do |url|
28+
expect do
29+
described_class.perform_now(
30+
url,
31+
valid_params[:page_url],
32+
valid_params[:locale],
33+
valid_params[:internal]
34+
)
35+
end.to change(BetterTogether::Metrics::LinkClick, :count).by(1)
36+
37+
link_click = BetterTogether::Metrics::LinkClick.last
38+
expect(link_click.url).to eq(url)
39+
expect(link_click).to be_valid
40+
end
41+
end
42+
43+
it 'handles UTF-8 page URLs' do
44+
utf8_urls.each do |page_url|
45+
expect do
46+
described_class.perform_now(
47+
'https://example.com/link',
48+
page_url,
49+
'en',
50+
false
51+
)
52+
end.to change(BetterTogether::Metrics::LinkClick, :count).by(1)
53+
54+
link_click = BetterTogether::Metrics::LinkClick.last
55+
expect(link_click.page_url).to eq(page_url)
56+
expect(link_click).to be_valid
57+
end
58+
end
59+
end
60+
end
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# frozen_string_literal: true
2+
3+
require 'rails_helper'
4+
5+
RSpec.describe BetterTogether::Metrics::TrackPageViewJob do
6+
describe 'UTF-8 URL handling' do
7+
let(:person) { create(:better_together_person) }
8+
let(:pageable) { create(:better_together_page, content: 'Test page content') }
9+
10+
let(:utf8_urls) do
11+
[
12+
'https://example.com/café', # UTF-8 path
13+
'https://example.com/页面', # Chinese characters in path
14+
'https://bücher.example.com/straße' # German umlauts
15+
]
16+
end
17+
18+
it 'creates PageView records without errors' do
19+
# Test that the job can create page views successfully
20+
# The UTF-8 handling is tested at the model level
21+
expect do
22+
described_class.perform_now(pageable, 'en')
23+
end.to change(BetterTogether::Metrics::PageView, :count).by(1)
24+
25+
page_view = BetterTogether::Metrics::PageView.last
26+
expect(page_view).to be_valid
27+
expect(page_view.errors[:page_url]).to be_empty
28+
end
29+
30+
it 'handles pageables with UTF-8 URLs' do
31+
# Create a mock pageable that returns UTF-8 URL
32+
utf8_pageable = instance_double(Page)
33+
allow(utf8_pageable).to receive_messages(url: 'https://example.com/café', becomes: utf8_pageable,
34+
class: double(base_class: double)) # rubocop:todo RSpec/VerifiedDoubles
35+
36+
expect do
37+
described_class.perform_now(utf8_pageable, 'en')
38+
end.to change(BetterTogether::Metrics::PageView, :count).by(1)
39+
40+
page_view = BetterTogether::Metrics::PageView.last
41+
expect(page_view.errors[:page_url]).to be_empty
42+
end
43+
end
44+
end
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# frozen_string_literal: true
2+
3+
require 'rails_helper'
4+
5+
RSpec.describe BetterTogether::Metrics::LinkClick do
6+
describe 'UTF-8 URL validation' do
7+
let(:utf8_urls) do
8+
[
9+
'https://例え.テスト', # Japanese IDN
10+
'https://тест.рф', # Cyrillic IDN
11+
'https://example.com/café', # UTF-8 path
12+
'https://example.com/页面', # Chinese characters in path
13+
'https://bücher.example.com/straße', # German umlauts
14+
'https://пример.испытание/тест' # Full Cyrillic URL
15+
]
16+
end
17+
18+
let(:valid_attributes) do
19+
{
20+
page_url: 'https://example.com/test',
21+
locale: 'en',
22+
clicked_at: Time.current,
23+
internal: false
24+
}
25+
end
26+
27+
it 'accepts UTF-8 encoded URLs' do
28+
utf8_urls.each do |url|
29+
link_click = described_class.new(valid_attributes.merge(url: url))
30+
expect(link_click).to be_valid, "URL #{url} should be valid but got errors: #{link_click.errors.full_messages}"
31+
end
32+
end
33+
34+
it 'accepts UTF-8 encoded page URLs' do
35+
utf8_urls.each do |page_url|
36+
link_click = described_class.new(valid_attributes.merge(page_url: page_url, url: 'https://example.com'))
37+
expect(link_click).to be_valid,
38+
"Page URL #{page_url} should be valid but got errors: #{link_click.errors.full_messages}"
39+
end
40+
end
41+
42+
it 'handles URL encoding properly' do
43+
# Test both encoded and unencoded versions
44+
raw_url = 'https://example.com/café'
45+
encoded_url = 'https://example.com/caf%C3%A9'
46+
47+
link_click1 = described_class.new(valid_attributes.merge(url: raw_url))
48+
link_click2 = described_class.new(valid_attributes.merge(url: encoded_url))
49+
50+
expect(link_click1).to be_valid
51+
expect(link_click2).to be_valid
52+
end
53+
end
54+
end

0 commit comments

Comments
 (0)