Skip to content

Commit d6465a4

Browse files
committed
[GR-19220] Implement rb_enc_strlen (#2707)
PullRequest: truffleruby/3464
2 parents 00454d0 + f4a19e0 commit d6465a4

File tree

6 files changed

+79
-3
lines changed

6 files changed

+79
-3
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ Compatibility:
2828
* Fix `Regexp.new` to coerce non-String arguments (#2705, @andrykonchin).
2929
* Fix `Kernel#sprintf` formatting for `%c` when used with a non-ASCII encoding (#2369, @andrykonchin).
3030
* Fix `Kernel#sprintf` argument casting for `%c` (@andrykonchin).
31+
* Implement the `rb_enc_strlen` function for use by native extensions (@nirvdrum).
3132

3233
Performance:
3334

lib/cext/ABI_version.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
8
1+
9

spec/ruby/optional/capi/encoding_spec.rb

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,48 @@
6363
end
6464
end
6565

# Specs for the C-API function rb_enc_strlen, called through the @s.rb_enc_strlen
# wrapper with (string, number of bytes to consider, encoding to count in).
66+
describe "rb_enc_strlen" do
67+
before :each do
68+
@str = 'こにちわ' # Each codepoint in this string is 3 bytes in UTF-8
69+
end
70+
71+
it "returns the correct string length for the encoding" do
# The full 12 bytes are 4 characters in UTF-8 and 12 characters in BINARY.
72+
@s.rb_enc_strlen(@str, @str.bytesize, Encoding::UTF_8).should == 4
73+
@s.rb_enc_strlen(@str, @str.bytesize, Encoding::BINARY).should == 12
74+
end
75+
76+
it "returns the string length based on a fixed-width encoding's character length, even if the encoding is incompatible" do
# The same 12 bytes are reinterpreted in the requested encoding:
# 12 / 2 == 6 for the UTF-16 expectations and 12 / 4 == 3 for the UTF-32 ones.
77+
@s.rb_enc_strlen(@str, @str.bytesize, Encoding::UTF_16BE).should == 6
78+
@s.rb_enc_strlen(@str, @str.bytesize, Encoding::UTF_16LE).should == 6
79+
@s.rb_enc_strlen(@str, @str.bytesize, Encoding::UTF_32BE).should == 3
80+
@s.rb_enc_strlen(@str, @str.bytesize, Encoding::UTF_32LE).should == 3
81+
end
82+
83+
it "does not consider strings to be NUL-terminated" do
# The embedded "\0" counts as a character: 3 + 1 + 3 == 7.
84+
s = "abc\0def"
85+
@s.rb_enc_strlen(s, s.bytesize, Encoding::US_ASCII).should == 7
86+
@s.rb_enc_strlen(s, s.bytesize, Encoding::UTF_8).should == 7
87+
end
88+
# The examples below pass a byte length that cuts the string in the middle of a character.
89+
describe "handles broken strings" do
90+
it "combines valid character and invalid character counts in UTF-8" do
91+
# The result is 3 because `rb_enc_strlen` counts the first valid character and then adds
92+
# the byte count for the invalid character that follows for 1 + 2.
93+
@s.rb_enc_strlen(@str, 5, Encoding::UTF_8).should == 3
94+
end
95+
96+
it "combines valid character and invalid character counts in UTF-16" do
# NOTE(review): presumably two complete 2-byte units plus one leftover byte give 2 + 1 - confirm.
97+
@s.rb_enc_strlen(@str, 5, Encoding::UTF_16BE).should == 3
98+
end
99+
100+
it "rounds up for fixed-width encodings" do
# 7 bytes in 4-byte units is one full unit plus a partial one, expected to count as 2;
# BINARY counts every one of the 5 bytes.
101+
@s.rb_enc_strlen(@str, 7, Encoding::UTF_32BE).should == 2
102+
@s.rb_enc_strlen(@str, 7, Encoding::UTF_32LE).should == 2
103+
@s.rb_enc_strlen(@str, 5, Encoding::BINARY).should == 5
104+
end
105+
end
106+
end
107+
66108
describe "rb_enc_find" do
67109
it "returns the encoding of an Encoding" do
68110
@s.rb_enc_find("UTF-8").should == "UTF-8"

spec/ruby/optional/capi/ext/encoding_spec.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,14 @@ static VALUE encoding_spec_rb_enc_codelen(VALUE self, VALUE code, VALUE encoding
301301
return INT2FIX(rb_enc_codelen(c, enc));
302302
}
303303

// Spec wrapper exposing rb_enc_strlen to Ruby.
//   str      - Ruby String whose bytes are measured
//   length   - Fixnum number of bytes, counted from the start of str, to consider
//   encoding - encoding object, converted with rb_to_encoding
// Returns the result of rb_enc_strlen as a Fixnum.
// Note: length is not checked against the actual size of str here.
304+
static VALUE encoding_spec_rb_enc_strlen(VALUE self, VALUE str, VALUE length, VALUE encoding) {
305+
int l = FIX2INT(length);
306+
char *p = RSTRING_PTR(str);
// e is the end pointer of the measured range: l bytes past the start.
307+
char *e = p + l;
308+
309+
return LONG2FIX(rb_enc_strlen(p, e, rb_to_encoding(encoding)));
310+
}
311+
304312
void Init_encoding_spec(void) {
305313
VALUE cls;
306314
native_rb_encoding_pointer = (rb_encoding**) malloc(sizeof(rb_encoding*));
@@ -335,6 +343,7 @@ void Init_encoding_spec(void) {
335343
rb_define_method(cls, "rb_enc_compatible", encoding_spec_rb_enc_compatible, 2);
336344
rb_define_method(cls, "rb_enc_copy", encoding_spec_rb_enc_copy, 2);
337345
rb_define_method(cls, "rb_enc_codelen", encoding_spec_rb_enc_codelen, 2);
346+
rb_define_method(cls, "rb_enc_strlen", encoding_spec_rb_enc_strlen, 3);
338347
rb_define_method(cls, "rb_enc_find", encoding_spec_rb_enc_find, 1);
339348
rb_define_method(cls, "rb_enc_find_index", encoding_spec_rb_enc_find_index, 1);
340349
rb_define_method(cls, "rb_enc_isalnum", encoding_spec_rb_enc_isalnum, 2);

src/main/c/cext/encoding.c

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,17 @@ int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc) {
246246
return polyglot_as_i32(RUBY_CEXT_INVOKE_NO_WRAP("rb_enc_precise_mbclen", rb_tr_temporary_native_string(p, length, enc)));
247247
}
248248

// Returns the length, in characters of encoding enc, of the byte range [p, e).
249+
long rb_enc_strlen(const char *p, const char *e, rb_encoding *enc) {
250+
long length = e - p;
251+
int minlen = rb_enc_mbminlen(enc);
252+
// Fixed-width encoding (minimum and maximum character length are equal): the count is
// computed directly from the byte length, and a trailing partial character adds one.
253+
if (minlen == rb_enc_mbmaxlen(enc)) {
254+
return length / minlen + !!(length % minlen);
255+
}
256+
// Otherwise wrap the bytes in a temporary native string and delegate to the
// "rb_enc_strlen" method invoked through RUBY_CEXT_INVOKE_NO_WRAP.
257+
return polyglot_as_i64(RUBY_CEXT_INVOKE_NO_WRAP("rb_enc_strlen", rb_tr_temporary_native_string(p, length, enc)));
258+
}
259+
// Returns, as an int, the result of calling `dummy?` on the Encoding object for enc.
249260
int rb_enc_dummy_p(rb_encoding *enc) {
250261
return polyglot_as_i32(RUBY_INVOKE_NO_WRAP(rb_enc_from_encoding(enc), "dummy?"));
251262
}
@@ -433,5 +444,3 @@ int enc_is_unicode(const OnigEncodingType *enc) {
433444
const char *name = rb_enc_name(enc);
434445
return !strncmp(name,"UTF", 3);
435446
}
436-
437-

src/main/java/org/truffleruby/cext/CExtNodes.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1481,6 +1481,21 @@ protected int rbEncPreciseMbclen(Object string,
14811481
}
14821482
}
14831483

// Core method "rb_enc_strlen" (singleton, one required argument): the target that the C
// implementation of rb_enc_strlen invokes by this name.
1484+
@CoreMethod(names = "rb_enc_strlen", onSingleton = true, required = 1)
1485+
public abstract static class RbEncStrlen extends CoreMethodArrayArgumentsNode {
1486+
// Only applies to Ruby strings: returns the code point length of the string's
// TruffleString, evaluated in the string's own encoding.
1487+
@Specialization(guards = "strings.isRubyString(string)", limit = "1")
1488+
protected int rbEncStrlen(Object string,
1489+
@Cached RubyStringLibrary strings,
1490+
@Cached TruffleString.CodePointLengthNode codePointLengthNode) {
1491+
var tstring = strings.getTString(string);
1492+
var tencoding = strings.getTEncoding(string);
1493+
1494+
return codePointLengthNode.execute(tstring, tencoding);
1495+
}
1496+
1497+
}
1498+
14841499
@CoreMethod(names = "rb_enc_left_char_head", onSingleton = true, required = 3, lowerFixnum = 3)
14851500
public abstract static class RbEncLeftCharHeadNode extends CoreMethodArrayArgumentsNode {
14861501

0 commit comments

Comments
 (0)