Skip to content

Commit 486dd7a

Browse files
committed
chmatchMain(): replace TRUELENGTH marks with hash
1 parent 86511ab commit 486dd7a

File tree

1 file changed

+10
-35
lines changed

1 file changed

+10
-35
lines changed

src/chmatch.c

Lines changed: 10 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,7 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch
3636
}
3737
// Since non-ASCII strings may be marked with different encodings, it only makes sense to compare
3838
// the bytes under the same encoding (UTF-8) #3844 #3850.
39-
// Not 'const' because we might SET_TRUELENGTH() below.
40-
SEXP *xd;
39+
const SEXP *xd;
4140
if (isSymbol(x)) {
4241
xd = &sym;
4342
} else {
@@ -56,33 +55,14 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch
5655
return ans;
5756
}
5857
// else xlen>1; nprotect is const above since no more R allocations should occur after this point
59-
savetl_init();
60-
for (int i=0; i<xlen; i++) {
61-
SEXP s = xd[i];
62-
const int tl = TRUELENGTH(s);
63-
if (tl>0) {
64-
savetl(s); // R's internal hash (which is positive); save it
65-
SET_TRUELENGTH(s,0);
66-
} else if (tl<0) {
67-
// R 2.14.0+ initializes truelength to 0 (before that it was uninitialized/random).
68-
// Now that data.table depends on R 3.1.0+, that is after 2.14.0 too.
69-
// We rely on that 0-initialization, and that R's internal hash is positive.
70-
// # nocov start
71-
savetl_end();
72-
internal_error(__func__, "CHARSXP '%s' has a negative truelength (%d)", CHAR(s), tl); // # nocov
73-
// # nocov end
74-
}
75-
}
58+
hashtab * marks = hash_create(tablelen);
7659
int nuniq=0;
7760
for (int i=0; i<tablelen; ++i) {
7861
const SEXP s = td[i];
79-
int tl = TRUELENGTH(s);
80-
if (tl>0) { savetl(s); tl=0; }
81-
if (tl==0) SET_TRUELENGTH(s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table
62+
int tl = hash_lookup(marks, s, 0);
63+
if (tl==0) hash_set(marks, s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table
8264
}
8365
// in future if we need NAs in x not to be matched to NAs in table ...
84-
// if (!matchNAtoNA && TRUELENGTH(NA_STRING)<0)
85-
// SET_TRUELENGTH(NA_STRING, 0);
8666
if (chmatchdup) {
8767
// chmatchdup() is basically base::pmatch() but without the partial matching part. For example :
8868
// chmatchdup(c("a", "a"), c("a", "a")) # 1,2 - the second 'a' in 'x' has a 2nd match in 'table'
@@ -101,21 +81,19 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch
10181
if (!counts || !map) {
10282
// # nocov start
10383
free(counts); free(map);
104-
for (int i=0; i<tablelen; i++) SET_TRUELENGTH(td[i], 0);
105-
savetl_end();
10684
error(_("Failed to allocate %"PRIu64" bytes working memory in chmatchdup: length(table)=%d length(unique(table))=%d"), ((uint64_t)tablelen*2+nuniq)*sizeof(int), tablelen, nuniq);
10785
// # nocov end
10886
}
109-
for (int i=0; i<tablelen; ++i) counts[-TRUELENGTH(td[i])-1]++;
87+
for (int i=0; i<tablelen; ++i) counts[-hash_lookup(marks, td[i], 0)-1]++;
11088
for (int i=0, sum=0; i<nuniq; ++i) { int tt=counts[i]; counts[i]=sum; sum+=tt+1; }
111-
for (int i=0; i<tablelen; ++i) map[counts[-TRUELENGTH(td[i])-1]++] = i+1; // 0 is left ending each group thanks to the calloc
89+
for (int i=0; i<tablelen; ++i) map[counts[-hash_lookup(marks, td[i], 0)-1]++] = i+1; // 0 is left ending each group thanks to the calloc
11290
for (int i=0, last=0; i<nuniq; ++i) {int tt=counts[i]+1; counts[i]=last; last=tt;} // rewind counts to the beginning of each group
11391
for (int i=0; i<xlen; ++i) {
114-
int u = TRUELENGTH(xd[i]);
92+
int u = hash_lookup(marks, xd[i], 0);
11593
if (u<0) {
11694
const int w = counts[-u-1]++;
11795
if (map[w]) { ansd[i]=map[w]; continue; }
118-
SET_TRUELENGTH(xd[i],0); // w falls on ending 0 marker: dups used up; any more dups should return nomatch
96+
hash_set(marks,xd[i],0); // w falls on ending 0 marker: dups used up; any more dups should return nomatch
11997
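// often there will be some values in table that are not matched to at all; the 0-setting loop that used to clear their TRUELENGTH marks at the end of this function is removed by this change, since the marks are now kept in the 'marks' hash table.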
12098
}
12199
ansd[i] = nomatch;
@@ -124,17 +102,14 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch
124102
free(map);
125103
} else if (chin) {
126104
for (int i=0; i<xlen; i++) {
127-
ansd[i] = TRUELENGTH(xd[i])<0;
105+
ansd[i] = hash_lookup(marks,xd[i],0)<0;
128106
}
129107
} else {
130108
for (int i=0; i<xlen; i++) {
131-
const int m = TRUELENGTH(xd[i]);
109+
const int m = hash_lookup(marks,xd[i],0);
132110
ansd[i] = (m<0) ? -m : nomatch;
133111
}
134112
}
135-
for (int i=0; i<tablelen; i++)
136-
SET_TRUELENGTH(td[i], 0); // reinstate 0 rather than leave the -i-1
137-
savetl_end();
138113
UNPROTECT(nprotect); // ans, xd, td
139114
return ans;
140115
}

0 commit comments

Comments
 (0)