Skip to content

Commit 23f2b07

Browse files
committed
Merge pull request #310 from thirdwing/master
Add encoding in Rcpp::String class
2 parents ed68d86 + e0718b4 commit 23f2b07

File tree

3 files changed

+120
-26
lines changed

3 files changed

+120
-26
lines changed

inst/include/Rcpp/String.h

Lines changed: 82 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -53,54 +53,74 @@ namespace Rcpp {
5353
typedef internal::const_string_proxy<STRSXP> const_StringProxy;
5454

5555
/** default constructor */
56-
String( ): data( Rf_mkChar("") ), buffer(), valid(true), buffer_ready(true) {
56+
String( ): data( Rf_mkChar("") ), buffer(), valid(true), buffer_ready(true), enc(CE_NATIVE) {
5757
RCPP_STRING_DEBUG( "String()" ) ;
5858
}
5959

6060
/** copy constructor */
61-
String( const String& other) : data( other.get_sexp()), valid(true), buffer_ready(false) {
61+
String( const String& other) : data( other.get_sexp()), valid(true), buffer_ready(false), enc(Rf_getCharCE(other.get_sexp())) {
62+
RCPP_STRING_DEBUG( "String(const String&)" ) ;
63+
}
64+
65+
String( const String& other, const std::string& enc) : data( other.get_sexp()), valid(true), buffer_ready(false) {
66+
set_encoding(enc);
6267
RCPP_STRING_DEBUG( "String(const String&)" ) ;
6368
}
6469

6570
/** construct a string from a single CHARSXP SEXP */
66-
String(SEXP charsxp) : data(charsxp), valid(true), buffer_ready(false) {
71+
String(SEXP charsxp) : data(charsxp), valid(true), buffer_ready(false), enc(Rf_getCharCE(charsxp)) {
72+
RCPP_STRING_DEBUG( "String(SEXP)" ) ;
73+
}
74+
75+
/**
 * construct a string from a single CHARSXP SEXP and an explicit encoding name
 *
 * The CHARSXP is stored as-is and the named encoding is then applied through
 * set_encoding(). The argument `enc` (a name) hides the `enc` data member
 * inside this constructor.
 */
String(SEXP charsxp, const std::string& enc)
    : data( charsxp ),
      valid( true ),
      buffer_ready( false )
{
    this->set_encoding( enc ) ;
    RCPP_STRING_DEBUG( "String(SEXP)" ) ;
}
6979

7080
/** from string proxy */
71-
String( const StringProxy& proxy ): data( proxy.get() ), valid(true), buffer_ready(false){
81+
String( const StringProxy& proxy ): data( proxy.get() ), valid(true), buffer_ready(false), enc(Rf_getCharCE(proxy.get())){
82+
RCPP_STRING_DEBUG( "String( const StringProxy&)" ) ;
83+
}
84+
85+
String( const StringProxy& proxy, const std::string& enc ): data( proxy.get() ), valid(true), buffer_ready(false) {
86+
set_encoding(enc);
7287
RCPP_STRING_DEBUG( "String( const StringProxy&)" ) ;
7388
}
89+
7490
/** from string proxy */
75-
String( const const_StringProxy& proxy ): data( proxy.get() ), valid(true), buffer_ready(false){
91+
String( const const_StringProxy& proxy ): data( proxy.get() ), valid(true), buffer_ready(false), enc(Rf_getCharCE(proxy.get())){
92+
RCPP_STRING_DEBUG( "String( const const_StringProxy&)" ) ;
93+
}
94+
95+
String( const const_StringProxy& proxy, const std::string& enc ): data( proxy.get() ), valid(true), buffer_ready(false) {
96+
set_encoding(enc);
7697
RCPP_STRING_DEBUG( "String( const const_StringProxy&)" ) ;
7798
}
7899

79100
/** from a std::string */
80-
String( const std::string& s) : buffer(s), valid(false), buffer_ready(true) {
101+
String( const std::string& s) : buffer(s), valid(false), buffer_ready(true), enc(CE_NATIVE) {
81102
RCPP_STRING_DEBUG( "String(const std::string& )" ) ;
82103
}
83104

84-
String( const std::wstring& s) : data(internal::make_charsexp(s)), valid(true), buffer_ready(false) {
105+
String( const std::wstring& s) : data(internal::make_charsexp(s)), valid(true), buffer_ready(false), enc(CE_NATIVE) {
85106
RCPP_STRING_DEBUG( "String(const std::wstring& )" ) ;
86107
}
87108

88109
/** from a const char* */
89-
String( const char* s) : buffer(s), valid(false), buffer_ready(true){
110+
String( const char* s) : buffer(s), valid(false), buffer_ready(true), enc(CE_NATIVE){
90111
RCPP_STRING_DEBUG( "String(const char*)" ) ;
91112
}
92113

93-
String( const wchar_t* s) : data(internal::make_charsexp(s)), valid(true), buffer_ready(false) {
114+
String( const wchar_t* s) : data(internal::make_charsexp(s)), valid(true), buffer_ready(false), enc(CE_NATIVE) {
94115
RCPP_STRING_DEBUG( "String(const wchar_t* s)" ) ;
95116
}
96117

97-
98118
/** constructors from R primitives */
99-
String( int x ) : data( internal::r_coerce<INTSXP,STRSXP>(x) ), valid(true), buffer_ready(false) {}
100-
String( double x ) : data( internal::r_coerce<REALSXP,STRSXP>(x) ), valid(true), buffer_ready(false){}
101-
String( bool x ) : data( internal::r_coerce<LGLSXP,STRSXP>(x) ), valid( true ) , buffer_ready(false){}
102-
String( Rcomplex x ) : data( internal::r_coerce<CPLXSXP,STRSXP>(x) ), valid( true ), buffer_ready(false){}
103-
String( Rbyte x ) : data( internal::r_coerce<RAWSXP,STRSXP>(x) ), valid(true), buffer_ready(false){}
119+
String( int x ) : data( internal::r_coerce<INTSXP,STRSXP>(x) ), valid(true), buffer_ready(false), enc(CE_NATIVE){}
120+
String( double x ) : data( internal::r_coerce<REALSXP,STRSXP>(x) ), valid(true), buffer_ready(false), enc(CE_NATIVE){}
121+
String( bool x ) : data( internal::r_coerce<LGLSXP,STRSXP>(x) ), valid( true ) , buffer_ready(false), enc(CE_NATIVE){}
122+
String( Rcomplex x ) : data( internal::r_coerce<CPLXSXP,STRSXP>(x) ), valid( true ), buffer_ready(false), enc(CE_NATIVE){}
123+
String( Rbyte x ) : data( internal::r_coerce<RAWSXP,STRSXP>(x) ), valid(true), buffer_ready(false), enc(CE_NATIVE){}
104124

105125

106126
inline String& operator=( int x ){ data = internal::r_coerce<INTSXP ,STRSXP>( x ) ; valid = true ; buffer_ready = false ; return *this ; }
@@ -127,13 +147,13 @@ namespace Rcpp {
127147
inline String& operator=( const std::wstring& s){ return assign_wide_string(s) ; }
128148
inline String& operator=( const wchar_t* s){ return assign_wide_string(s) ; }
129149

130-
131150
inline String& operator+=( const std::string& s){
132151
RCPP_STRING_DEBUG( "String::operator+=( std::string )" ) ;
133152
if( is_na() ) return *this ;
134153
setBuffer() ; buffer += s ; valid = false ;
135154
return *this ;
136155
}
156+
137157
inline String& operator+=( const char* s){
138158
RCPP_STRING_DEBUG( "String::operator+=( const char*)" ) ;
139159
if( is_na() ) return *this ;
@@ -157,8 +177,8 @@ namespace Rcpp {
157177

158178
public:
159179

160-
inline String& operator+=( const std::wstring& s){ return append_wide_string( s ); }
161-
inline String& operator+=( const wchar_t* s){ return append_wide_string( s ); }
180+
inline String& operator+=( const std::wstring& s){ return append_wide_string( s ); }
181+
inline String& operator+=( const wchar_t* s){ return append_wide_string( s ); }
162182

163183
inline String& operator+=( const String& other ){
164184
RCPP_STRING_DEBUG( "String::operator+=( const char*)" ) ;
@@ -214,8 +234,6 @@ namespace Rcpp {
214234
return replace_first( s.get_cstring(), news.get_cstring() ) ;
215235
}
216236

217-
218-
219237
inline String& replace_last( const char* s, const char* news ){
220238
RCPP_STRING_DEBUG_2( "String::replace_last( const char* = '%s' , const char* = '%s')", s, news ) ;
221239
if( is_na() ) return *this ;
@@ -312,7 +330,7 @@ namespace Rcpp {
312330

313331
inline SEXP get_sexp() const {
314332
RCPP_STRING_DEBUG_1( "String::get_sexp const ( valid = %d) ", valid ) ;
315-
return valid ? data : Rf_mkChar( buffer.c_str() ) ;
333+
return valid ? data : Rf_mkCharCE( buffer.c_str(), enc ) ;
316334
}
317335

318336
inline SEXP get_sexp() {
@@ -329,11 +347,46 @@ namespace Rcpp {
329347
return std::wstring( s, s + strlen(s) );
330348
}
331349

332-
333350
inline const char* get_cstring() const {
334351
return buffer_ready ? buffer.c_str() : CHAR(data) ;
335352
}
336353

354+
inline const std::string get_encoding() const {
355+
switch (enc) {
356+
case CE_BYTES:
357+
return "bytes";
358+
case CE_LATIN1:
359+
return "latin1";
360+
case CE_UTF8:
361+
return "UTF-8";
362+
default:
363+
return "unknown";
364+
}
365+
}
366+
367+
/**
 * change the encoding of this string
 *
 * Records `encoding` in `enc` and rebuilds `data` with Rf_mkCharCE so that
 * the CHARSXP carries the requested encoding mark. The string is valid
 * (i.e. `data` is authoritative) when this returns.
 *
 * A valid NA string is left untouched apart from the recorded `enc`.
 *
 * @param encoding encoding to record and to pass to Rf_mkCharCE
 */
inline void set_encoding( cetype_t encoding ) {
    enc = encoding;

    if (valid) {
        // Re-creating the CHARSXP through Rf_mkCharCE would replace
        // NA_STRING with an ordinary (non-NA) string, so keep NA as it is,
        // like the other mutators of this class do.
        if (is_na()) return;

        // NOTE(review): the bytes are always translated to UTF-8 before being
        // marked with `encoding`; confirm this is intended when `encoding`
        // is not CE_UTF8.
        data = Rf_mkCharCE(Rf_translateCharUTF8(data), encoding);

        // `buffer` may hold a copy of the previous CHARSXP; force it to be
        // refreshed from the new `data` on next use.
        buffer_ready = false;
    } else {
        // the buffer is authoritative: materialise it with the new mark
        data = Rf_mkCharCE(buffer.c_str(), encoding) ;
        valid = true ;
    }
}
377+
378+
inline void set_encoding(const std::string & encoding) {
379+
if ( encoding == "bytes" ) {
380+
set_encoding( CE_BYTES );
381+
} else if ( encoding == "latin1" ) {
382+
set_encoding( CE_LATIN1 );
383+
} else if ( encoding == "UTF-8" ) {
384+
set_encoding( CE_UTF8 );
385+
} else {
386+
set_encoding( CE_ANY );
387+
}
388+
}
389+
337390
bool operator<( const Rcpp::String& other ) const {
338391
return strcmp( get_cstring(), other.get_cstring() ) < 0;
339392
}
@@ -363,6 +416,9 @@ namespace Rcpp {
363416
/** is the buffer initialized */
364417
bool buffer_ready ;
365418

419+
/** the encoding of encapsulated CHARSXP */
420+
cetype_t enc;
421+
366422
inline bool is_na() const { return data == NA_STRING ; }
367423
inline void setBuffer(){
368424
if( !buffer_ready){
@@ -373,7 +429,7 @@ namespace Rcpp {
373429
inline void setData(){
374430
RCPP_STRING_DEBUG( "setData" ) ;
375431
if(!valid) {
376-
data = Rf_mkChar(buffer.c_str()) ;
432+
data = Rf_mkCharCE(buffer.c_str(), enc) ;
377433
valid = true ;
378434
}
379435
}
@@ -403,8 +459,8 @@ namespace Rcpp {
403459
return s.get_sexp() ;
404460
}
405461

406-
template <int RTYPE>
407-
template <typename T>
462+
template <int RTYPE>
463+
template <typename T>
408464
string_proxy<RTYPE>& string_proxy<RTYPE>::operator+=(const T& rhs) {
409465
String tmp = get() ;
410466
tmp += rhs ;
@@ -415,7 +471,7 @@ namespace Rcpp {
415471
}
416472

417473

418-
template <>
474+
template <>
419475
inline SEXP wrap<Rcpp::String>( const Rcpp::String& object) {
420476
RCPP_STRING_DEBUG( "wrap<String>()" ) ;
421477
Shield<SEXP> res( Rf_allocVector( STRSXP, 1 ) ) ;

inst/unitTests/cpp/String.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,3 +61,29 @@ String test_push_front(String x) {
6161
x.push_front("abc");
6262
return x;
6363
}
64+
65+
// [[Rcpp::export]]
66+
String test_String_encoding(String x) {
67+
return x.get_encoding();
68+
}
69+
70+
// [[Rcpp::export]]
71+
String test_String_set_encoding(String x) {
72+
x.set_encoding("UTF-8");
73+
return x;
74+
}
75+
76+
// [[Rcpp::export]]
77+
String test_String_ctor_encoding(String x) {
78+
String y(x);
79+
y.set_encoding("UTF-8");
80+
return y;
81+
}
82+
83+
84+
// [[Rcpp::export]]
85+
String test_String_ctor_encoding2() {
86+
String y("å");
87+
y.set_encoding("UTF-8");
88+
return y;
89+
}

inst/unitTests/runit.String.R

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,4 +54,16 @@ if (.runThisTest) {
5454
res <- test_push_front("def")
5555
checkIdentical(res, "abcdef")
5656
}
57+
58+
test.String.encoding <- function() {
    # the same character twice: once without a declared encoding,
    # once explicitly marked as UTF-8
    native <- "å"
    utf8 <- native
    Encoding(native) <- "unknown"
    Encoding(utf8) <- "UTF-8"

    # the encoding reported from C++ must match the mark set in R
    checkEquals(test_String_encoding(native), "unknown")
    checkEquals(test_String_encoding(utf8), "UTF-8")

    # setting the encoding in C++ must be visible from R, whether the
    # String came from an argument, a copy, or a C string literal
    checkEquals(Encoding(test_String_set_encoding(native)), "UTF-8")
    checkEquals(Encoding(test_String_ctor_encoding(native)), "UTF-8")
    checkEquals(Encoding(test_String_ctor_encoding2()), "UTF-8")
}
68+
5769
}

0 commit comments

Comments
 (0)