Skip to content

Commit a204e9c

Browse files
authored
Use Addressable to heuristically parse invalid URLs and normalize them (#603)
1 parent 594f366 commit a204e9c

File tree

4 files changed

+18
-3
lines changed

4 files changed

+18
-3
lines changed

Gemfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ ruby '~> 3.3.1'
77

88
gem 'rails', '~> 8.0.1'
99

10+
gem 'addressable' # More standards-compliant URI parser
1011
gem 'bcrypt' # Use Active Model has_secure_password
1112
gem 'bootsnap', require: false # Reduces boot times through caching; required in config/boot.rb
1213
gem 'feedjira' # Parse RSS feeds

Gemfile.lock

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,7 @@ PLATFORMS
372372
ruby
373373

374374
DEPENDENCIES
375+
addressable
375376
annotaterb
376377
bcrypt
377378
bootsnap

app/models/entry.rb

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
#
2828
# fk_rails_... (subscription_id => subscriptions.id)
2929
#
30+
require 'addressable/uri'
31+
3032
class Entry < ApplicationRecord
3133
FEEDJIRA_KEYS_MAP = {
3234
author: :author,
@@ -72,8 +74,7 @@ def read?
7274
end
7375

7476
def normalize_url(input)
75-
# Some urls might contain spaces, so we replace these
76-
uri = URI(input.gsub(' ', '%20'))
77+
uri = Addressable::URI.heuristic_parse(input).normalize
7778
# Some entries might contain only an absolute or relative path, so resolve it against the URL of the page they were on
7879
uri = URI(url).merge(uri) if url.present?
7980
uri.to_s

test/models/entry_test.rb

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,22 @@ class EntryTest < ActiveSupport::TestCase
7171
end
7272

7373
# Methods
74-
test 'should be able to normalize urls found in post' do
74+
test 'should be able to normalize urls found in post when containing spaces' do
7575
entry = build(:entry, url: 'https://example.com/posts/first.html')
7676

7777
assert_equal 'https://example.com/image%201.jpg', entry.normalize_url('https://example.com/image 1.jpg')
78+
end
79+
80+
test 'should be able to normalize urls found in post when containing unicode' do
81+
entry = build(:entry, url: 'https://example.com/posts/first.html')
82+
83+
assert_equal 'https://example.com/image%F0%9F%96%A41.jpg', entry.normalize_url('https://example.com/image🖤1.jpg')
84+
assert_equal 'https://example.com/image%E2%80%941.jpg', entry.normalize_url('https://example.com/image—1.jpg')
85+
end
86+
87+
test 'should be able to normalize urls found in post when missing host' do
88+
entry = build(:entry, url: 'https://example.com/posts/first.html')
89+
7890
assert_equal 'https://example.com/image.jpg', entry.normalize_url('/image.jpg')
7991
end
8092
end

0 commit comments

Comments
 (0)