@@ -89,110 +89,101 @@ xor 80 then << 1 then |
89
89
11 -> 01 -> 1
90
90
=#
91
91
92
- @inline _widen_mask (msk:: UInt ) = ((msk% BigChunk) << (8 * sizeof (UInt))) | msk
93
-
94
92
const hi_mask = CHUNKSZ == 4 ? 0x8080_8080 : 0x8080_8080_8080_8080
95
93
const big_hi_mask = _widen_mask (hi_mask)
96
94
97
- @inline _count_cont (v, msk) = (v = xor (v, msk); count_ones (xor (((v << 1 ) | v), msk) & msk))
95
+ @inline get_high_mask (:: UInt ) = hi_mask
96
+ @inline get_high_mask (:: BigChunk ) = big_hi_mask
97
+
98
98
@inline msk_lead (v, msk) = (v = xor (v, msk); xor (xor (((v << 1 ) | v), msk) & msk, msk))
99
99
100
- @inline _count_cont (v:: UInt ) = _count_cont (v, hi_mask)
101
- @inline msk_lead (v:: UInt ) = msk_lead (v, hi_mask)
100
+ @inline msk_lead (v) = msk_lead (v, get_high_mask (v))
102
101
103
- @inline _count_cont (v:: BigChunk ) = _count_cont (v, big_hi_mask)
104
- @inline _msk_lead (v:: BigChunk ) = _msk_lead (v, big_hi_mask)
102
+ @inline get_lead (T, ptr) = msk_lead (unsafe_load (reinterpret (Ptr{T}, ptr)))
105
103
106
- @inline function _align_len_utf8 (pnt, cnt, v)
107
- len = 0
108
- fin = pnt + cnt
109
- v = msk_lead (v)
110
- while (pnt += CHUNKSZ) < fin
104
+ @inline count_masked (v, cnt) = count_ones (_mask_bytes (v, cnt))
105
+
106
+ function _length_al (:: MultiCU , :: Type{UTF8CSE} , beg:: Ptr{UInt8} , cnt:: Int )
107
+ # First check very frequent cases of short strings
108
+ # (on 64-bit machines, 1-8 bytes, 9-16 bytes, and 17-24)
109
+ # taking advantage of the knowledge of how String types are stored in Julia,
110
+ # i.e. UInt length, immediate followed by the string data, aligned on sizeof(UInt)*2
111
+ if cnt <= BIGCHUNKSZ
112
+ return (cnt <= CHUNKSZ
113
+ ? count_masked (get_lead (UInt, beg), cnt)
114
+ : count_masked (get_lead (BigChunk, beg), cnt))
115
+ end
116
+ len = count_ones (get_lead (UInt, beg))
117
+ cnt -= CHUNKSZ
118
+ pnt = _pntbigchunk (beg + CHUNKSZ)
119
+ v = get_lead (BigChunk, pnt)
120
+ cnt <= BIGCHUNKSZ && return len + count_masked (v, cnt)
121
+ fin = _pntbigchunk (beg + CHUNKSZ + cnt)
122
+ while (pnt += BIGCHUNKSZ) < fin
111
123
len += count_ones (v)
112
- v = msk_lead ( unsafe_load ( pnt) )
124
+ v = get_lead (BigChunk, pnt)
113
125
end
114
- len + count_ones (cnt & CHUNKMSK == 0 ? v : (v & _mask_bytes ( cnt)) )
126
+ len + count_masked (v, cnt)
115
127
end
116
128
117
- _length_al (:: MultiCU , :: Type{UTF8CSE} , beg:: Ptr{UInt8} , cnt:: Int ) =
118
- (pnt = reinterpret (Ptr{UInt}, beg); _align_len_utf8 (pnt, cnt, unsafe_load (pnt)))
119
-
120
- function _length (:: MultiCU , :: Type{UTF8CSE} , beg:: Ptr{UInt8} , cnt:: Int )
129
+ function _length_ul (:: MultiCU , :: Type{UTF8CSE} , beg:: Ptr{UInt8} , cnt:: Int )
121
130
align = reinterpret (UInt, beg)
122
- pnt = reinterpret (Ptr{UInt }, align & ~ CHUNKMSK )
131
+ pnt = reinterpret (Ptr{BigChunk }, align & ~ BIGCHUNKMSK )
123
132
v = unsafe_load (pnt)
124
- if (align &= CHUNKMSK ) != 0
125
- msk = _mask_bytes (align)
126
- v = (v & ~ msk) | (msk & hi_mask )
133
+ if (align &= BIGCHUNKMSK ) != 0
134
+ msk = _big_mask_bytes (align)
135
+ v = (v & ~ msk) | (msk & big_hi_mask )
127
136
cnt += align
128
137
end
129
- _align_len_utf8 (pnt, cnt, v)
130
- end
131
-
132
- @inline function _check_mask_al (pnt, cnt, msk, v)
138
+ len = 0
133
139
fin = pnt + cnt
134
- while (pnt += CHUNKSZ) < fin
135
- (v & msk) == 0 || return false
136
- v = unsafe_load (pnt)
137
- end
138
- (cnt & CHUNKMSK == 0 ? v : (v & _mask_bytes (cnt))) & msk == 0
139
- end
140
- @inline _check_mask_al (pnt, cnt, msk) = _check_mask_al (pnt, cnt, msk, unsafe_load (pnt))
141
-
142
- @inline function _check_mask_ul (beg, cnt, msk)
143
- align = reinterpret (UInt, beg)
144
- pnt = reinterpret (Ptr{UInt}, align & ~ CHUNKMSK)
145
- v = unsafe_load (pnt)
146
- if (align &= CHUNKMSK) != 0
147
- v &= ~ _mask_bytes (align)
148
- cnt += align
140
+ v = msk_lead (v)
141
+ while (pnt += BIGCHUNKSZ) < fin
142
+ len += count_ones (v)
143
+ v = msk_lead (unsafe_load (pnt))
149
144
end
150
- _check_mask_al (pnt, cnt, msk, v )
145
+ len + count_masked (v, cnt )
151
146
end
152
147
153
- @inline _mask_bytes (v:: T , cnt) where {T} =
154
- ifelse ((cnt & (sizeof (T)- 1 )% UInt) == 0 ,
155
- v, T (v & (one (T) << ((cnt & (sizeof (T)- 1 )% UInt) << 3 )) - 1 ))
156
-
157
- @inline chk_chunk (ptr, msk:: T , cnt) where {T} =
158
- iszero (_mask_bytes (unsafe_load (reinterpret (Ptr{T}, ptr)) & msk, cnt))
148
+ @inline get_chunk (ptr, msk:: T , cnt) where {T} =
149
+ _mask_bytes (unsafe_load (reinterpret (Ptr{T}, ptr)) & msk, cnt)
159
150
160
- @inline function _check_block_al (ptr, cnt, msk)
151
+ @inline function _check_mask_al (ptr, cnt, msk)
161
152
# First check very frequent cases of short strings
162
153
# (on 64-bit machines, 1-8 bytes, 9-16 bytes, and 17-24)
163
154
# taking advantage of the knowledge of how String types are stored in Julia,
164
155
# i.e. UInt length, immediate followed by the string data, aligned on sizeof(UInt)*2
165
- cnt <= CHUNKSZ && return chk_chunk (ptr, msk, cnt)
156
+ cnt <= CHUNKSZ && return get_chunk (ptr, msk, cnt) == 0
166
157
bigmsk = _widen_mask (msk)
167
- cnt <= BIGCHUNKSZ && return chk_chunk (ptr, bigmsk, cnt)
158
+ cnt <= BIGCHUNKSZ && return get_chunk (ptr, bigmsk, cnt) == 0
168
159
(unsafe_load (_pntchunk (ptr)) & msk) == 0 || return false
169
160
cnt -= CHUNKSZ
170
- cnt <= BIGCHUNKSZ && return chk_chunk (ptr, bigmsk, cnt)
161
+ cnt <= BIGCHUNKSZ && return get_chunk (ptr, bigmsk, cnt) == 0
171
162
pnt = _pntbigchunk (ptr + CHUNKSZ)
172
163
fin = _pntbigchunk (ptr + CHUNKSZ + cnt)
173
- v = unsafe_load (pnt) & bigmsk
164
+ v = unsafe_load (pnt)
174
165
while (pnt += BIGCHUNKSZ) < fin
175
- v == 0 || return false
176
- v = unsafe_load (pnt) & bigmsk
166
+ (v & bigmsk) == 0 || return false
167
+ v = unsafe_load (pnt)
177
168
end
178
- iszero ( _mask_bytes (v, cnt))
169
+ _mask_bytes (v & bigmsk , cnt) == 0
179
170
end
180
171
181
- @inline function _check_block_ul (beg, cnt, msk)
172
+ @inline function _check_mask_ul (beg, cnt, msk)
173
+ bigmsk = _widen_mask (msk)
182
174
align = reinterpret (UInt, beg)
183
175
pnt = _pntbigchunk (align & ~ BIGCHUNKMSK)
184
176
v = unsafe_load (pnt)
185
177
if (align &= BIGCHUNKMSK) != 0
186
178
v &= ~ _big_mask_bytes (align)
187
179
cnt += align
188
180
end
189
- fin = _pntbigchunk (pnt + cnt)
190
- bigmsk = _widen_mask (msk)
181
+ fin = pnt + cnt
191
182
while (pnt += BIGCHUNKSZ) < fin
192
183
(v & bigmsk) == 0 || return false
193
184
v = unsafe_load (pnt)
194
185
end
195
- ((cnt & BIGCHUNKMSK) == 0 ? v : (v & _big_mask_bytes ( cnt))) & bigmsk == 0
186
+ _mask_bytes (v & bigmsk, cnt) == 0
196
187
end
197
188
198
189
_ascii_mask (:: Type{UInt8} ) = hi_mask
@@ -226,88 +217,99 @@ is_ascii(vec::Vector{T}) where {T<:CodeUnitTypes} =
226
217
227
218
is_ascii (str:: Str{C} ) where {C<: ASCII_Union } =
228
219
(cnt = sizeof (str)) == 0 ||
229
- (@preserve str _check_block_al (pointer (str), cnt, _ascii_mask (codeunit (C))))
220
+ (@preserve str _check_mask_al (pointer (str), cnt, _ascii_mask (codeunit (C))))
230
221
231
222
# Todo! Here you need to see that 0b11yyyyxx at least 1 y must be set,
232
223
# which indicates a non-Latin1 character
233
- _all_latin (val) = ((val & (val<< 1 ) & (val<< 2 | (val<< 3 ) | (val<< 4 ) | (val<< 5 ))) & hi_mask) == 0
224
+ _all_latin (val) =
225
+ ((val & (val<< 1 ) & (val<< 2 | (val<< 3 ) | (val<< 4 ) | (val<< 5 ))) & get_high_mask (val)) == 0
234
226
235
- @inline function _check_latin_utf8_al (pnt, cnt, v)
227
+ @inline function _check_latin_utf8_al (beg, cnt)
228
+ pnt = reinterpret (Ptr{UInt}, beg)
236
229
fin = pnt + cnt
230
+ v = unsafe_load (pnt)
237
231
while (pnt += CHUNKSZ) < fin
238
232
_all_latin (v) || return false
239
233
v = unsafe_load (pnt)
240
234
end
241
- _all_latin (cnt & CHUNKMSK == 0 ? v : (v & _mask_bytes ( cnt) ))
235
+ _all_latin (_mask_bytes (v, cnt))
242
236
end
243
- @inline _check_latin_utf8_al (pnt, cnt) = _check_latin_utf8_al (pnt, cnt, unsafe_load (pnt))
244
237
245
238
@inline function _check_latin_utf8_ul (beg, cnt)
246
239
align = reinterpret (UInt, beg)
247
- pnt = reinterpret (Ptr{UInt }, align & ~ CHUNKMSK )
240
+ pnt = reinterpret (Ptr{BigChunk }, align & ~ BIGCHUNKMSK )
248
241
v = unsafe_load (pnt)
249
- if (align &= CHUNKMSK ) != 0
250
- v &= ~ _mask_bytes (align)
242
+ if (align &= BIGCHUNKMSK ) != 0
243
+ v &= ~ _big_mask_bytes (align)
251
244
cnt += align
252
245
end
253
- _check_latin_utf8_al (pnt, cnt, v)
246
+ fin = pnt + cnt
247
+ while (pnt += BIGCHUNKSZ) < fin
248
+ _all_latin (v) || return false
249
+ v = unsafe_load (pnt)
250
+ end
251
+ _all_latin (_mask_bytes (v, cnt))
254
252
end
255
253
256
254
is_latin (str:: Str{UTF8CSE} ) =
257
- (siz = sizeof (str)) == 0 ? true :
258
- @preserve str _check_latin_utf8_al (reinterpret (Ptr{UInt}, pointer (str)), siz)
255
+ (siz = sizeof (str)) == 0 || @preserve str _check_latin_utf8_al (pointer (str), siz)
259
256
260
257
is_latin (str:: SubString{<:Str{UTF8CSE}} ) =
261
- (cnt = sizeof (str)) == 0 ? true : @preserve str _check_latin_utf8_ul (pointer (str), cnt)
258
+ (cnt = sizeof (str)) == 0 || @preserve str _check_latin_utf8_ul (pointer (str), cnt)
262
259
263
260
is_latin (vec:: Vector{T} ) where {T<: Union{UInt16,UInt32} } =
264
- (cnt = sizeof (vec)) == 0 ? true :
261
+ (cnt = sizeof (vec)) == 0 ||
265
262
@preserve vec _check_mask_ul (pointer (vec), cnt, _latin_mask (T))
266
263
267
264
is_latin (str:: SubString{<:Str{C}} ) where {C<: Union{Word_CSEs,Quad_CSEs} } =
268
- (cnt = sizeof (str)) == 0 ? true :
265
+ (cnt = sizeof (str)) == 0 ||
269
266
@preserve str _check_mask_ul (pointer (str), cnt, _latin_mask (codeunit (C)))
270
267
271
268
is_latin (str:: Str{C} ) where {C<: Union{Word_CSEs,Quad_CSEs} } =
272
- (cnt = sizeof (str)) == 0 ? true :
269
+ (cnt = sizeof (str)) == 0 ||
273
270
@preserve str _check_mask_al (pointer (str), cnt, _latin_mask (codeunit (C)))
274
271
275
272
# All 4 top bits must be 1 (i.e. 0xfx) for this to be non-BMP
276
- _all_bmp (val) = ((val | (val<< 1 ) | (val<< 2 ) | (val<< 3 )) & hi_mask ) == 0
273
+ _all_bmp (val) = ((val | (val<< 1 ) | (val<< 2 ) | (val<< 3 )) & get_high_mask (val) ) == 0
277
274
278
- @inline function _check_bmp_utf8_al (pnt, cnt, v)
275
+ @inline function _check_bmp_utf8_al (beg, cnt)
276
+ pnt = reinterpret (Ptr{UInt}, beg)
279
277
fin = pnt + cnt
278
+ v = unsafe_load (pnt)
280
279
while (pnt += CHUNKSZ) < fin
281
280
_all_bmp (v) || return false
282
281
v = unsafe_load (pnt)
283
282
end
284
- _all_bmp (cnt & CHUNKMSK == 0 ? v : (v & _mask_bytes ( cnt) ))
283
+ _all_bmp (_mask_bytes (v, cnt))
285
284
end
286
- @inline _check_bmp_utf8_al (pnt, cnt) = _check_bmp_utf8_al (pnt, cnt, unsafe_load (pnt))
287
285
288
286
@inline function _check_bmp_utf8_ul (beg, cnt)
289
287
align = reinterpret (UInt, beg)
290
- pnt = reinterpret (Ptr{UInt }, align & ~ CHUNKMSK )
288
+ pnt = reinterpret (Ptr{BigChunk }, align & ~ BIGCHUNKMSK )
291
289
v = unsafe_load (pnt)
292
- if (align &= CHUNKMSK ) != 0
293
- v &= ~ _mask_bytes (align)
290
+ if (align &= BIGCHUNKMSK ) != 0
291
+ v &= ~ _big_mask_bytes (align)
294
292
cnt += align
295
293
end
296
- _check_bmp_utf8_al (pnt, cnt, v)
294
+ fin = pnt + cnt
295
+ while (pnt += BIGCHUNKSZ) < fin
296
+ _all_bmp (v) || return false
297
+ v = unsafe_load (pnt)
298
+ end
299
+ _all_bmp (_mask_bytes (v, cnt))
297
300
end
298
301
299
302
is_bmp (str:: Str{UTF8CSE} ) =
300
- (cnt = sizeof (str)) == 0 ? true :
301
- @preserve str _check_bmp_utf8_al (reinterpret (Ptr{UInt}, pointer (str)), cnt)
303
+ (cnt = sizeof (str)) == 0 || @preserve str _check_bmp_utf8_al (pointer (str), cnt)
302
304
303
305
is_bmp (str:: SubString{<:Str{UTF8CSE}} ) =
304
- (cnt = sizeof (str)) == 0 ? true : @preserve str _check_bmp_utf8_ul (pointer (str), cnt)
306
+ (cnt = sizeof (str)) == 0 || @preserve str _check_bmp_utf8_ul (pointer (str), cnt)
305
307
306
308
is_bmp (str:: SubString{<:Str{<:Union{Text4CSE,UTF32CSE}}} ) =
307
- (cnt = sizeof (str)) == 0 ? true : @preserve str _check_mask_ul (pointer (str), cnt, _bmp_mask_32)
309
+ (cnt = sizeof (str)) == 0 || @preserve str _check_mask_ul (pointer (str), cnt, _bmp_mask_32)
308
310
309
311
is_bmp (str:: Str{<:Union{Text4CSE,UTF32CSE}} ) =
310
- (cnt = sizeof (str)) == 0 ? true : @preserve str _check_mask_al (pointer (str), cnt, _bmp_mask_32)
312
+ (cnt = sizeof (str)) == 0 || @preserve str _check_mask_al (pointer (str), cnt, _bmp_mask_32)
311
313
312
314
is_unicode (str:: MS_UTF8 ) = true
313
315
0 commit comments