Skip to content

Commit 651bae4

Browse files
committed
generalize and test sanitization of unescaped ampersands in OAI responses
1 parent d436705 commit 651bae4

File tree

2 files changed

+24
-21
lines changed

2 files changed

+24
-21
lines changed

lib/oai/client.rb

Lines changed: 16 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ module OAI
5454
# <http://www.openarchives.org/OAI/openarchivesprotocol.html>.
5555

5656
class Client
57-
57+
UNESCAPED_AMPERSAND = /&(?!(?:amp|lt|gt|quot|apos|\#\d+);)/
5858
# The constructor which must be passed a valid base url for an oai
5959
# service:
6060
#
@@ -198,20 +198,25 @@ def list_sets(opts={})
198198
do_resumable(OAI::ListSetsResponse, 'ListSets', opts)
199199
end
200200

201-
private
202-
203-
def do_request(verb, opts = nil)
204-
# fire off the request and return appropriate DOM object
205-
uri = build_uri(verb, opts)
206-
xml = strip_invalid_utf_8_chars(get(uri))
201+
def sanitize_xml(xml)
202+
xml = strip_invalid_utf_8_chars(xml)
203+
xml = strip_invalid_xml_chars(xml)
207204
if @parser == 'libxml'
208205
# remove default namespace for oai-pmh since libxml
209206
# isn't able to use our xpaths to get at them
210207
# if you know a way around thins please let me know
211208
xml = xml.gsub(
212209
/xmlns=\"http:\/\/www.openarchives.org\/OAI\/.\..\/\"/, '')
213210
end
214-
return load_document(xml)
211+
xml
212+
end
213+
214+
private
215+
216+
def do_request(verb, opts = nil)
217+
# fire off the request and return appropriate DOM object
218+
uri = build_uri(verb, opts)
219+
return load_document(get(uri))
215220
end
216221

217222
def do_resumable(responseClass, verb, opts)
@@ -241,6 +246,7 @@ def encode(value)
241246
end
242247

243248
def load_document(xml)
249+
xml = sanitize_xml(xml)
244250
case @parser
245251
when 'libxml'
246252
begin
@@ -251,7 +257,6 @@ def load_document(xml)
251257
end
252258
when 'rexml'
253259
begin
254-
xml = strip_invalid_xml_chars(xml)
255260
return REXML::Document.new(xml)
256261
rescue REXML::ParseException => e
257262
raise OAI::Exception, 'response not well formed XML: '+e.message, caller
@@ -356,17 +361,8 @@ def strip_invalid_utf_8_chars(xml)
356361
end
357362

358363
def strip_invalid_xml_chars(xml)
359-
invalid = false
360-
361-
begin
362-
REXML::Document.new(xml)
363-
rescue REXML::ParseException => e
364-
invalid = true
365-
end
366-
367-
return xml.gsub!(/&(?!(?:amp|lt|gt|quot|apos);)/, '&amp;') if invalid
368-
return xml
364+
return xml unless xml =~ UNESCAPED_AMPERSAND
365+
xml.gsub(UNESCAPED_AMPERSAND, '&amp;')
369366
end
370-
371367
end
372368
end

test/client/tc_utf8_escaping.rb

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,19 @@
11
require 'test_helper_client'
22

33
class UTF8Test < Test::Unit::TestCase
4+
def client
5+
@client ||= OAI::Client.new 'http://localhost:3333/oai'
6+
end
47

58
def test_escaping_invalid_utf_8_characters
6-
client = OAI::Client.new 'http://localhost:3333/oai' #, :parser => 'libxml'
79
invalid_utf_8 = [2, 3, 4, 104, 5, 101, 6, 108, 66897, 108, 66535, 111, 1114112, 33, 55234123, 33].pack("U*")
810
invalid_utf_8 = invalid_utf_8.force_encoding("binary") if invalid_utf_8.respond_to? :force_encoding
911
assert_equal("hello!!", client.send(:strip_invalid_utf_8_chars, invalid_utf_8).gsub(/\?/, ''))
1012
end
1113

14+
def test_unescaped_ampersand_content_correction
15+
src = '<test>Frankie & Johnny <character>&#9829;</character></test>'
16+
expected = '<test>Frankie &amp; Johnny <character>&#9829;</character></test>'
17+
assert_equal(expected, client.sanitize_xml(src))
18+
end
1219
end

0 commit comments

Comments
 (0)