@@ -10,8 +10,6 @@ import scala.util.Using
1010
1111object IOUtils {
1212
13- private val surrogatePattern : Pattern = Pattern .compile(" [^\u0000 -\uffff ]" )
14-
1513 private val boms : Set [Char ] = Set (
1614 '\uefbb ' , // UTF-8
1715 '\ufeff ' , // UTF-16 (BE)
@@ -39,27 +37,10 @@ object IOUtils {
3937 }
4038 }
4139
42- /** Java strings are stored as sequences of 16-bit chars, but what they represent is sequences of unicode characters.
43- * In unicode terminology, they are stored as code units, but model code points. Thus, it's somewhat meaningless to
44- * talk about removing surrogates, which don't exist in the character / code point representation (unless you have
45- * rogue single surrogates, in which case you have other problems). Rather, what you want to do is to remove any
46- * characters which will require surrogates when encoded. That means any character which lies beyond the basic
47- * multilingual plane. You can do that with a simple regular expression.
48- */
49- private def replaceUnpairedSurrogates (input : String ): String = {
50- val matches = surrogatePattern.matcher(input)
51- if (matches.find()) {
52- val size = matches.end() - matches.start()
53- matches.replaceAll(" ?" * size)
54- } else {
55- input
56- }
57- }
58-
5940 private def contentFromBufferedSource (bufferedSource : BufferedSource ): Seq [String ] = {
6041 val reader = bufferedSource.bufferedReader()
6142 skipBOMIfPresent(reader)
62- reader.lines().iterator().asScala.map(replaceUnpairedSurrogates). toSeq
43+ reader.lines().iterator().asScala.toSeq
6344 }
6445
6546 private def contentStringFromBufferedSource (bufferedSource : BufferedSource ): String = {
@@ -78,7 +59,7 @@ object IOUtils {
7859 }
7960 }
8061
81- replaceUnpairedSurrogates( stringBuilder.toString)
62+ stringBuilder.toString
8263 }
8364
8465 /** Reads a file at the given path and:
0 commit comments