@@ -21,14 +21,16 @@ private def copy_string(str : String::Builder, iter : Iterator, count : Int) : I
2121 str << cp.chr
2222 end
2323
24- # A codepoint from the SMP counts twice
25- copied += 1 if cp > 0xFFFF
2624 copied += 1
2725 end
2826
2927 return copied
3028end
3129
# Returns the length of *content* measured in UTF-16 code units.
#
# Every codepoint at or above U+10000 is counted as two units (UTF-16
# represents those as a surrogate pair); every other codepoint is counted
# as one. An empty string yields 0.
private def utf16_length(content : String) : Int32
  units = 0
  content.each_char do |char|
    # Codepoints outside the Basic Multilingual Plane take two code units.
    units += (char.ord >= 0x10000 ? 2 : 1)
  end
  units
end
33+
3234def parse_description (desc, video_id : String ) : String ?
3335 return " " if desc.nil?
3436
@@ -40,14 +42,10 @@ def parse_description(desc, video_id : String) : String?
4042 # Slightly faster than HTML.escape, as we're only doing one pass on
4143 # the string instead of five for the standard library
4244 return String .build do |str |
43- copy_string(str, content.each_codepoint, content.size )
45+ copy_string(str, content.each_codepoint, utf16_length( content) )
4446 end
4547 end
4648
47- # Not everything is stored in UTF-8 on youtube's side. The SMP codepoints
48- # (0x10000 and above) are encoded as UTF-16 surrogate pairs, which are
49- # automatically decoded by the JSON parser. It means that we need to count
50- # copied byte in a special manner, preventing the use of regular string copy.
5149 iter = content.each_codepoint
5250
5351 index = 0
@@ -76,7 +74,7 @@ def parse_description(desc, video_id : String) : String?
7674 end
7775
7876 # Copy the end of the string (past the last command).
79- remaining_length = content.size - index
77+ remaining_length = utf16_length( content) - index
8078 copy_string(str, iter, remaining_length) if remaining_length > 0
8179 end
8280end
0 commit comments