@@ -219,6 +219,74 @@ private static unsafe ulong ToBase64MaskUrl(Block64* b, ref bool error)
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     private static unsafe ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
     {
+        // If mask is a power of two, we can use a simpler version.
+        if ((mask & (mask - 1)) == 0)
+        {
+            int pos64 = ArmBase.Arm64.LeadingZeroCount(mask);
+            int pos = pos64 & 0xf;
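+            // With a single bit set at position k, LeadingZeroCount returns 63 - k, so
+            // pos64 >> 4 picks the 16-byte chunk holding the byte to drop (3 => chunk0,
+            // 2 => chunk1, 1 => chunk2, 0 => chunk3) and pos locates it within that chunk,
+            // counted from the high end. E.g. mask = 1UL << 37 gives pos64 = 26: case 1
+            // below is taken and byte 5 of chunk2 is dropped.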
+            Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+            Vector128<byte> v0 = Vector128.Create((byte)(0xe - pos));
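+            // v0 broadcasts 0xe - pos, i.e. one less than the in-chunk index of the byte to
+            // drop. Comparing v1 > v0 (as signed bytes) yields 0xFF in every lane at or past
+            // that index; subtracting 0xFF (-1) from v1 bumps those indices by one, so the
+            // table lookup below shifts the tail of the affected chunk left by one byte.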
+            switch (pos64 >> 4)
+            {
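+                // Each case rebuilds only the chunk containing the dropped byte and copies the
+                // other three as-is; chunks after the dropped byte are stored one byte earlier
+                // so the 63 surviving bytes stay contiguous in the output.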
+                case 3:
+                {
+                    Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                    Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
+                    Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk0, sh);
+                    Vector128.Store(compressed, output + 0 * 16);
+                    Vector128.Store(b.chunk1, output + 1 * 16 - 1);
+                    Vector128.Store(b.chunk2, output + 2 * 16 - 1);
+                    Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+                }
+                break;
+
+                case 2:
+                {
+                    Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                    Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
+                    Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk1, sh);
+                    Vector128.Store(b.chunk0, output + 0 * 16);
+                    Vector128.Store(compressed, output + 1 * 16);
+                    Vector128.Store(b.chunk2, output + 2 * 16 - 1);
+                    Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+                }
+                break;
+
+                case 1:
+                {
+                    Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                    Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
+                    Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk2, sh);
+                    Vector128.Store(b.chunk0, output + 0 * 16);
+                    Vector128.Store(b.chunk1, output + 1 * 16);
+                    Vector128.Store(compressed, output + 2 * 16);
+                    Vector128.Store(b.chunk3, output + 3 * 16 - 1);
+                }
+                break;
+
+                case 0:
+                {
+                    Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
+                    Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
+                    // The dropped byte lives in chunk3 here, so chunk3 is the chunk that gets shuffled.
+                    Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk3, sh);
+                    Vector128.Store(b.chunk0, output + 0 * 16);
+                    Vector128.Store(b.chunk1, output + 1 * 16);
+                    Vector128.Store(b.chunk2, output + 2 * 16);
+                    Vector128.Store(compressed, output + 3 * 16);
+                }
+                break;
+            }
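+
+            // Exactly one byte was removed from the 64-byte block, so 63 output bytes are valid.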
+            return 63;
+        }
         ulong nmask = ~mask;
         Compress(b.chunk0, (ushort)mask, output, tablePtr);
         Compress(b.chunk1, (ushort)(mask >> 16), output + UInt64.PopCount(nmask & 0xFFFF), tablePtr);
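
For reference, a scalar sketch of what the single-bit fast path computes: with exactly one mask bit set at position k, the 64-byte block is copied with byte k dropped, producing 63 bytes. The method name, the raw `byte* input` view of the block, and the use of `System.Numerics.BitOperations` below are illustrative assumptions, not part of the patch.

// Illustrative scalar equivalent of the fast path above (not part of the patch).
private static unsafe ulong CompressBlockScalar(byte* input, ulong mask, byte* output)
{
    int k = System.Numerics.BitOperations.TrailingZeroCount(mask); // index of the byte to drop
    int written = 0;
    for (int i = 0; i < 64; i++)
    {
        if (i != k)
        {
            output[written++] = input[i]; // copy every byte except the flagged one
        }
    }
    return (ulong)written; // 63 when exactly one bit is set
}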