[GR-34937] Add specs for TruffleString and update graal import

eregon · eregon · commit b0247d04e7d2 · 2022-05-31T11:14:03.000Z
PullRequest: truffleruby/3371
diff --git a/mx.truffleruby/suite.py b/mx.truffleruby/suite.py
@@ -7,7 +7,7 @@
             {
                 "name": "regex",
                 "subdir": True,
-                "version": "587c31f311b09ba9e398e182b8e3a6bcf832679c",
+                "version": "aeca61acf7e52a19c52c1bd019ab63f158477dfc",
                 "urls": [
                     {"url": "https://github.com/oracle/graal.git", "kind": "git"},
                     {"url": "https://curio.ssw.jku.at/nexus/content/repositories/snapshots", "kind": "binary"},
@@ -16,7 +16,7 @@
             {
                 "name": "sulong",
                 "subdir": True,
-                "version": "587c31f311b09ba9e398e182b8e3a6bcf832679c",
+                "version": "aeca61acf7e52a19c52c1bd019ab63f158477dfc",
                 "urls": [
                     {"url": "https://github.com/oracle/graal.git", "kind": "git"},
                     {"url": "https://curio.ssw.jku.at/nexus/content/repositories/snapshots", "kind": "binary"},
diff --git a/spec/ruby/core/integer/chr_spec.rb b/spec/ruby/core/integer/chr_spec.rb
@@ -223,26 +223,25 @@
 
   # #5864
   it "raises RangeError if self is invalid as a codepoint in the specified encoding" do
-    [ [0x80,   "US-ASCII"],
-      [0x0100, "BINARY"],
-      [0x0100, "EUC-JP"],
-      [0xA1A0, "EUC-JP"],
-      [0xA1,   "EUC-JP"],
-      [0x80,   "SHIFT_JIS"],
-      [0xE0,   "SHIFT_JIS"],
-      [0x0100, "ISO-8859-9"],
-      [620,    "TIS-620"],
-      [0xD800, "UTF-8"],
-      [0xDBFF, "UTF-8"],
-      [0xDC00, "UTF-8"],
-      [0xDFFF, "UTF-8"],
-      [0xD800, "UTF-16"],
-      [0xDBFF, "UTF-16"],
-      [0xDC00, "UTF-16"],
-      [0xDFFF, "UTF-16"],
-    ].each do |integer, encoding_name|
-      -> { integer.chr(encoding_name) }.should raise_error(RangeError)
-    end
+    -> { 0x80.chr("US-ASCII") }.should raise_error(RangeError)
+    -> { 0x0100.chr("BINARY") }.should raise_error(RangeError)
+    -> { 0x0100.chr("EUC-JP") }.should raise_error(RangeError)
+    -> { 0xA1A0.chr("EUC-JP") }.should raise_error(RangeError)
+    -> { 0xA1.chr("EUC-JP") }.should raise_error(RangeError)
+    -> { 0x80.chr("SHIFT_JIS") }.should raise_error(RangeError)
+    -> { 0xE0.chr("SHIFT_JIS") }.should raise_error(RangeError)
+    -> { 0x0100.chr("ISO-8859-9") }.should raise_error(RangeError)
+    -> { 620.chr("TIS-620") }.should raise_error(RangeError)
+    # UTF-16 surrogate range (these codepoints are not valid in UTF-8 either)
+    -> { 0xD800.chr("UTF-8") }.should raise_error(RangeError)
+    -> { 0xDBFF.chr("UTF-8") }.should raise_error(RangeError)
+    -> { 0xDC00.chr("UTF-8") }.should raise_error(RangeError)
+    -> { 0xDFFF.chr("UTF-8") }.should raise_error(RangeError)
+    # UTF-16 surrogate range
+    -> { 0xD800.chr("UTF-16") }.should raise_error(RangeError)
+    -> { 0xDBFF.chr("UTF-16") }.should raise_error(RangeError)
+    -> { 0xDC00.chr("UTF-16") }.should raise_error(RangeError)
+    -> { 0xDFFF.chr("UTF-16") }.should raise_error(RangeError)
   end
 
   it 'returns a String encoding self interpreted as a codepoint in the CESU-8 encoding' do
diff --git a/spec/ruby/core/regexp/shared/quote.rb b/spec/ruby/core/regexp/shared/quote.rb
@@ -12,6 +12,11 @@
     Regexp.send(@method, :symbol).should == 'symbol'
   end
 
+  it "works with substrings" do
+    str = ".+[]()"[1...-1]
+    Regexp.send(@method, str).should == '\+\[\]\('
+  end
+
   it "sets the encoding of the result to US-ASCII if there are only US-ASCII characters present in the input String" do
     str = "abc".force_encoding("euc-jp")
     Regexp.send(@method, str).encoding.should == Encoding::US_ASCII
diff --git a/spec/ruby/core/string/capitalize_spec.rb b/spec/ruby/core/string/capitalize_spec.rb
@@ -35,6 +35,10 @@
     it "does not capitalize non-ASCII characters" do
       "ßet".capitalize(:ascii).should == "ßet"
     end
+
+    it "handles non-ASCII substrings properly" do
+      "garçon"[1..-1].capitalize(:ascii).should == "Arçon"
+    end
   end
 
   describe "full Unicode case mapping adapted for Turkic languages" do
diff --git a/spec/ruby/core/string/dup_spec.rb b/spec/ruby/core/string/dup_spec.rb
@@ -49,4 +49,13 @@ class << @obj
     orig.should == "xtring"
     dup.should == "string"
   end
+
+  it "does not modify the original setbyte-mutated string when changing dupped string" do
+    orig = "a"
+    orig.setbyte 0, "b".ord
+    copy = orig.dup
+    orig.setbyte 0, "c".ord
+    orig.should == "c"
+    copy.should == "b"
+  end
 end
diff --git a/spec/ruby/core/string/lstrip_spec.rb b/spec/ruby/core/string/lstrip_spec.rb
@@ -50,4 +50,10 @@
     -> { "hello".freeze.lstrip! }.should raise_error(FrozenError)
     -> { "".freeze.lstrip!      }.should raise_error(FrozenError)
   end
+
+  it "raises an ArgumentError if the first codepoint is invalid" do
+    s = "\xDFabc".force_encoding(Encoding::UTF_8)
+    s.valid_encoding?.should be_false
+    -> { s.lstrip! }.should raise_error(ArgumentError)
+  end
 end
diff --git a/spec/ruby/core/string/rstrip_spec.rb b/spec/ruby/core/string/rstrip_spec.rb
@@ -46,4 +46,10 @@
     -> { "hello".freeze.rstrip! }.should raise_error(FrozenError)
     -> { "".freeze.rstrip!      }.should raise_error(FrozenError)
   end
+
+  it "raises an ArgumentError if the last codepoint is invalid" do
+    s = "abc\xDF".force_encoding(Encoding::UTF_8)
+    s.valid_encoding?.should be_false
+    -> { s.rstrip! }.should raise_error(ArgumentError)
+  end
 end
diff --git a/spec/ruby/core/string/scrub_spec.rb b/spec/ruby/core/string/scrub_spec.rb
@@ -14,6 +14,11 @@
     "abc\u3042#{x81}".scrub.should == "abc\u3042\uFFFD"
   end
 
+  it "replaces invalid byte sequences in lazy substrings" do
+    x81 = [0x81].pack('C').force_encoding('utf-8')
+    "abc\u3042#{x81}def"[1...-1].scrub.should == "bc\u3042\uFFFDde"
+  end
+
   it "returns a copy of self when the input encoding is BINARY" do
     input = "foo".encode('BINARY')
 
diff --git a/spec/ruby/core/string/split_spec.rb b/spec/ruby/core/string/split_spec.rb
@@ -3,12 +3,17 @@
 require_relative 'fixtures/classes'
 
 describe "String#split with String" do
+  it "throws an ArgumentError if the string is not valid" do
+    s = "\xDF".force_encoding(Encoding::UTF_8)
+
+    -> { s.split }.should raise_error(ArgumentError)
+    -> { s.split(':') }.should raise_error(ArgumentError)
+  end
+
   it "throws an ArgumentError if the pattern is not a valid string" do
     str = 'проверка'
-    broken_str = 'проверка'
-    broken_str.force_encoding('binary')
-    broken_str.chop!
-    broken_str.force_encoding('utf-8')
+    broken_str = "\xDF".force_encoding(Encoding::UTF_8)
+
     -> { str.split(broken_str) }.should raise_error(ArgumentError)
   end
 
@@ -218,6 +223,12 @@
 end
 
 describe "String#split with Regexp" do
+  it "throws an ArgumentError if the string is not valid" do
+    s = "\xDF".force_encoding(Encoding::UTF_8)
+
+    -> { s.split(/./) }.should raise_error(ArgumentError)
+  end
+
   it "divides self on regexp matches" do
     " now's  the time".split(/ /).should == ["", "now's", "", "the", "time"]
     " x\ny ".split(/ /).should == ["", "x\ny"]