Fix disappearing end of the comments with emoji

shiny-comic · shiny-comic · commit 490a0b7fec23 · 2026-02-27T07:01:02.000+09:00
The previous code counted characters in the UTF-8 string, whereas emoji (codepoints at or above U+10000) take up two UTF-16 code units each.
This difference led to misaligned index offsets.
diff --git a/src/invidious/videos/description.cr b/src/invidious/videos/description.cr
@@ -21,14 +21,16 @@ private def copy_string(str : String::Builder, iter : Iterator, count : Int) : I
       str << cp.chr
     end
 
-    # A codepoint from the SMP counts twice
-    copied += 1 if cp > 0xFFFF
     copied += 1
   end
 
   return copied
 end
 
+private def utf16_length(content : String) : Int32
+  content.each_char.sum { |ch| ch.ord >= 0x10000 ? 2 : 1 } # chars at U+10000 and above count as two units, all others as one
+end
+
 def parse_description(desc, video_id : String) : String?
   return "" if desc.nil?
 
@@ -40,14 +42,10 @@ def parse_description(desc, video_id : String) : String?
     # Slightly faster than HTML.escape, as we're only doing one pass on
     # the string instead of five for the standard library
     return String.build do |str|
-      copy_string(str, content.each_codepoint, content.size)
+      copy_string(str, content.each_codepoint, utf16_length(content))
     end
   end
 
-  # Not everything is stored in UTF-8 on youtube's side. The SMP codepoints
-  # (0x10000 and above) are encoded as UTF-16 surrogate pairs, which are
-  # automatically decoded by the JSON parser. It means that we need to count
-  # copied byte in a special manner, preventing the use of regular string copy.
   iter = content.each_codepoint
 
   index = 0
@@ -76,7 +74,7 @@ def parse_description(desc, video_id : String) : String?
     end
 
     # Copy the end of the string (past the last command).
-    remaining_length = content.size - index
+    remaining_length = utf16_length(content) - index
     copy_string(str, iter, remaining_length) if remaining_length > 0
   end
 end