Skip to content

Commit 8384d5a

Browse files
chore(internal): protect SSE parsing pipeline from broken UTF-8 characters
1 parent 4672331 commit 8384d5a

File tree

5 files changed

+75
-11
lines changed

5 files changed

+75
-11
lines changed

lib/openai/internal/transport/pooled_net_requester.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ def execute(request)
149149
break if finished
150150

151151
rsp.read_body do |bytes|
152-
y << bytes
152+
y << bytes.force_encoding(Encoding::BINARY)
153153
break if finished
154154

155155
self.class.calibrate_socket_timeout(conn, deadline)

lib/openai/internal/util.rb

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -448,7 +448,7 @@ def initialize(src, &blk)
448448
else
449449
src
450450
end
451-
@buf = String.new.b
451+
@buf = String.new
452452
@blk = blk
453453
end
454454
end
@@ -460,7 +460,7 @@ class << self
460460
# @return [Enumerable<String>]
461461
def writable_enum(&blk)
462462
Enumerator.new do |y|
463-
buf = String.new.b
463+
buf = String.new
464464
y.define_singleton_method(:write) do
465465
self << buf.replace(_1)
466466
buf.bytesize
@@ -582,14 +582,35 @@ def encode_content(headers, body)
582582

583583
# @api private
584584
#
585+
# https://www.iana.org/assignments/character-sets/character-sets.xhtml
586+
#
587+
# @param content_type [String]
588+
# @param text [String]
589+
def force_charset!(content_type, text:)
590+
charset = /charset=([^;\s]+)/.match(content_type)&.captures&.first
591+
592+
return unless charset
593+
594+
begin
595+
encoding = Encoding.find(charset)
596+
text.force_encoding(encoding)
597+
rescue ArgumentError
598+
nil
599+
end
600+
end
601+
602+
# @api private
603+
#
604+
# Assumes each chunk in stream has `Encoding::BINARY`.
605+
#
585606
# @param headers [Hash{String=>String}, Net::HTTPHeader]
586607
# @param stream [Enumerable<String>]
587608
# @param suppress_error [Boolean]
588609
#
589610
# @raise [JSON::ParserError]
590611
# @return [Object]
591612
def decode_content(headers, stream:, suppress_error: false)
592-
case headers["content-type"]
613+
case (content_type = headers["content-type"])
593614
in %r{^application/(?:vnd\.api\+)?json}
594615
json = stream.to_a.join
595616
begin
@@ -606,11 +627,10 @@ def decode_content(headers, stream:, suppress_error: false)
606627
in %r{^text/event-stream}
607628
lines = decode_lines(stream)
608629
decode_sse(lines)
609-
in %r{^text/}
610-
stream.to_a.join
611630
else
612-
# TODO: parsing other response types
613-
StringIO.new(stream.to_a.join)
631+
text = stream.to_a.join
632+
force_charset!(content_type, text: text)
633+
StringIO.new(text)
614634
end
615635
end
616636
end
@@ -675,12 +695,17 @@ def chain_fused(enum, &blk)
675695
class << self
676696
# @api private
677697
#
698+
# Assumes Strings have been forced into having `Encoding::BINARY`.
699+
#
700+
# This decoder is responsible for reassembling lines split across multiple
701+
# fragments.
702+
#
678703
# @param enum [Enumerable<String>]
679704
#
680705
# @return [Enumerable<String>]
681706
def decode_lines(enum)
682707
re = /(\r\n|\r|\n)/
683-
buffer = String.new.b
708+
buffer = String.new
684709
cr_seen = nil
685710

686711
chain_fused(enum) do |y|
@@ -711,6 +736,8 @@ def decode_lines(enum)
711736
#
712737
# https://html.spec.whatwg.org/multipage/server-sent-events.html#parsing-an-event-stream
713738
#
739+
# Assumes that `lines` has been decoded with `#decode_lines`.
740+
#
714741
# @param lines [Enumerable<String>]
715742
#
716743
# @return [Enumerable<Hash{Symbol=>Object}>]
@@ -734,7 +761,7 @@ def decode_sse(lines)
734761
in "event"
735762
current.merge!(event: value)
736763
in "data"
737-
(current[:data] ||= String.new.b) << (value << "\n")
764+
(current[:data] ||= String.new) << (value << "\n")
738765
in "id" unless value.include?("\0")
739766
current.merge!(id: value)
740767
in "retry" if /^\d+$/ =~ value

rbi/lib/openai/internal/util.rbi

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,14 @@ module OpenAI
215215
def encode_content(headers, body); end
216216

217217
# @api private
218+
#
219+
# https://www.iana.org/assignments/character-sets/character-sets.xhtml
220+
sig { params(content_type: String, text: String).void }
221+
def force_charset!(content_type, text:); end
222+
223+
# @api private
224+
#
225+
# Assumes each chunk in stream has `Encoding::BINARY`.
218226
sig do
219227
params(
220228
headers: T.any(T::Hash[String, String], Net::HTTPHeader),
@@ -263,12 +271,19 @@ module OpenAI
263271

264272
class << self
265273
# @api private
274+
#
275+
# Assumes Strings have been forced into having `Encoding::BINARY`.
276+
#
277+
# This decoder is responsible for reassembling lines split across multiple
278+
# fragments.
266279
sig { params(enum: T::Enumerable[String]).returns(T::Enumerable[String]) }
267280
def decode_lines(enum); end
268281

269282
# @api private
270283
#
271284
# https://html.spec.whatwg.org/multipage/server-sent-events.html#parsing-an-event-stream
285+
#
286+
# Assumes that `lines` has been decoded with `#decode_lines`.
272287
sig do
273288
params(lines: T::Enumerable[String]).returns(T::Enumerable[OpenAI::Internal::Util::ServerSentEvent])
274289
end

sig/openai/internal/util.rbs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,8 @@ module OpenAI
120120
top body
121121
) -> top
122122

123+
def self?.force_charset!: (String content_type, text: String) -> void
124+
123125
def self?.decode_content: (
124126
::Hash[String, String] headers,
125127
stream: Enumerable[String],

test/openai/internal/util_test.rb

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,24 @@ def test_close_fused_sse_chain
368368
end
369369
end
370370

371+
class OpenAI::Test::UtilContentDecodingTest < Minitest::Test
372+
def test_charset
373+
cases = {
374+
"application/json" => Encoding::BINARY,
375+
"application/json; charset=utf-8" => Encoding::UTF_8,
376+
"charset=uTf-8 application/json; " => Encoding::UTF_8,
377+
"charset=UTF-8; application/json; " => Encoding::UTF_8,
378+
"charset=ISO-8859-1 ;application/json; " => Encoding::ISO_8859_1,
379+
"charset=EUC-KR ;application/json; " => Encoding::EUC_KR
380+
}
381+
text = String.new.force_encoding(Encoding::BINARY)
382+
cases.each do |content_type, encoding|
383+
OpenAI::Internal::Util.force_charset!(content_type, text: text)
384+
assert_equal(encoding, text.encoding)
385+
end
386+
end
387+
end
388+
371389
class OpenAI::Test::UtilSseTest < Minitest::Test
372390
def test_decode_lines
373391
cases = {
@@ -381,7 +399,9 @@ def test_decode_lines
381399
%W[\na b\n\n] => %W[\n ab\n \n],
382400
%W[\na b] => %W[\n ab],
383401
%W[\u1F62E\u200D\u1F4A8] => %W[\u1F62E\u200D\u1F4A8],
384-
%W[\u1F62E \u200D \u1F4A8] => %W[\u1F62E\u200D\u1F4A8]
402+
%W[\u1F62E \u200D \u1F4A8] => %W[\u1F62E\u200D\u1F4A8],
403+
["\xf0\x9f".b, "\xa5\xba".b] => ["\xf0\x9f\xa5\xba".b],
404+
["\xf0".b, "\x9f".b, "\xa5".b, "\xba".b] => ["\xf0\x9f\xa5\xba".b]
385405
}
386406
eols = %W[\n \r \r\n]
387407
cases.each do |enum, expected|

0 commit comments

Comments
 (0)