@@ -54,7 +54,7 @@ module OAI
54
54
# <http://www.openarchives.org/OAI/openarchivesprotocol.html>.
55
55
56
56
class Client
57
-
57
# Matches an ampersand that does NOT start a predefined XML entity
# (&amp; &lt; &gt; &quot; &apos;) or a numeric character reference
# (decimal such as &#38; or hexadecimal such as &#x26;). Such an ampersand
# is not well-formed XML and has to be escaped before parsing.
UNESCAPED_AMPERSAND = /&(?!(?:amp|lt|gt|quot|apos|\#(?:\d+|x[0-9a-fA-F]+));)/
58
58
# The constructor which must be passed a valid base url for an oai
59
59
# service:
60
60
#
@@ -198,20 +198,25 @@ def list_sets(opts={})
198
198
do_resumable ( OAI ::ListSetsResponse , 'ListSets' , opts )
199
199
end
200
200
201
- private
202
-
203
- def do_request ( verb , opts = nil )
204
- # fire off the request and return appropriate DOM object
205
- uri = build_uri ( verb , opts )
206
- xml = strip_invalid_utf_8_chars ( get ( uri ) )
201
# Cleans a raw response body before it is handed to the XML parser.
#
# The text is passed through strip_invalid_utf_8_chars and then
# strip_invalid_xml_chars. When the configured parser is 'libxml', the
# default OAI-PMH namespace declaration is removed as well.
#
# xml:: the response body to clean
#
# Returns the cleaned string.
def sanitize_xml(xml)
  xml = strip_invalid_utf_8_chars(xml)
  xml = strip_invalid_xml_chars(xml)
  if @parser == 'libxml'
    # remove default namespace for oai-pmh since libxml
    # isn't able to use our xpaths to get at them
    # if you know a way around this please let me know
    xml = xml.gsub(
      %r{xmlns="http://www\.openarchives\.org/OAI/.\../"}, '')
  end
  xml
end
213
+
214
+ private
215
+
216
# Performs the request for +verb+ with the given options and returns the
# document object that load_document builds from the response body.
def do_request(verb, opts = nil)
  response_body = get(build_uri(verb, opts))
  load_document(response_body)
end
216
221
217
222
def do_resumable ( responseClass , verb , opts )
@@ -241,6 +246,7 @@ def encode(value)
241
246
end
242
247
243
248
def load_document ( xml )
249
+ xml = sanitize_xml ( xml )
244
250
case @parser
245
251
when 'libxml'
246
252
begin
@@ -251,7 +257,6 @@ def load_document(xml)
251
257
end
252
258
when 'rexml'
253
259
begin
254
- xml = strip_invalid_xml_chars ( xml )
255
260
return REXML ::Document . new ( xml )
256
261
rescue REXML ::ParseException => e
257
262
raise OAI ::Exception , 'response not well formed XML: ' +e . message , caller
@@ -356,17 +361,8 @@ def strip_invalid_utf_8_chars(xml)
356
361
end
357
362
358
363
# Escapes every ampersand that does not begin a recognized entity or
# character reference (as defined by UNESCAPED_AMPERSAND) so the text can
# be parsed as XML.
#
# xml:: the XML text to repair
#
# Returns +xml+ itself when nothing needs escaping, otherwise a new string
# in which each offending "&" has been replaced by "&amp;".
def strip_invalid_xml_chars(xml)
  # Skip the copy entirely when the input is already clean.
  return xml unless xml =~ UNESCAPED_AMPERSAND
  xml.gsub(UNESCAPED_AMPERSAND, '&amp;')
end
370
-
371
367
end
372
368
end
0 commit comments