|
| 1 | +/* |
| 2 | + * the API for FSST compression -- (c) Peter Boncz, Viktor Leis and Thomas Neumann (CWI, TU Munich), 2018-2019 |
| 3 | + * |
| 4 | + * =================================================================================================================================== |
| 5 | + * this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): |
| 6 | + * |
| 7 | + * Copyright 2018-2020, CWI, TU Munich, FSU Jena |
| 8 | + * |
| 9 | + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files |
| 10 | + * (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, |
| 11 | + * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is |
| 12 | + * furnished to do so, subject to the following conditions: |
| 13 | + * |
| 14 | + * - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. |
| 15 | + * |
| 16 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
| 17 | + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE |
| 18 | + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR |
| 19 | + * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| 20 | + * |
| 21 | + * You can contact the authors via the FSST source repository : https://github.com/cwida/fsst |
| 22 | + * =================================================================================================================================== |
| 23 | + * |
| 24 | + * FSST: Fast Static Symbol Table compression |
| 25 | + * see the paper https://github.com/cwida/fsst/raw/master/fsstcompression.pdf |
| 26 | + * |
| 27 | + * FSST is a compression scheme focused on string/text data: it can compress strings from distributions with many different values (i.e. |
| 28 | + * where dictionary compression will not work well). It allows *random-access* to compressed data: it is not block-based, so individual |
| 29 | + * strings can be decompressed without touching the surrounding data in a compressed block. When compared to e.g. lz4 (which is |
| 30 | + * block-based), FSST achieves similar decompression speed, (2x) better compression speed and 30% better compression ratio on text. |
| 31 | + * |
| 32 | + * FSST encodes strings also using a symbol table -- but it works on pieces of the string, as it maps "symbols" (1-8 byte sequences) |
| 33 | + * onto "codes" (single-bytes). FSST can also represent a byte as an exception (255 followed by the original byte). Hence, compression |
| 34 | + * transforms a sequence of bytes into a (supposedly shorter) sequence of codes or escaped bytes. These shorter byte-sequences could |
| 35 | + * be seen as strings again and fit in whatever your program is that manipulates strings. |
| 36 | + * |
| 37 | + * useful property: FSST ensures that strings that are equal, are also equal in their compressed form. |
| 38 | + * |
| 39 | + * In this API, strings are considered byte-arrays (byte = unsigned char) and a batch of strings is represented as an array of |
| 40 | + * unsigned char* pointers to their starts. A separate length array (of size_t) denotes how many bytes each string consists of. |
| 41 | + * |
| 42 | + * This representation as unsigned char* pointers tries to assume as little as possible on the memory management of the program |
| 43 | + * that calls this API, and is also intended to allow passing strings into this API without copying (even if you use C++ strings). |
| 44 | + * |
| 45 | + * We optionally support C-style zero-terminated strings (zero appearing only at the end). In this case, the compressed strings are |
| 46 | + * also zero-terminated strings. In zero-terminated mode, the zero-byte at the end *is* counted in the string byte-length. |
| 47 | + */ |
| 48 | +#ifndef FSST_INCLUDED_H |
| 49 | +#define FSST_INCLUDED_H |
| 50 | + |
#ifdef _MSC_VER
/* MSVC compatibility: provide the GCC/Clang extensions that the code below relies on. */
#define __restrict__
#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
#define __ORDER_LITTLE_ENDIAN__ 2
#include <intrin.h>
/*
 * Index of the least-significant set bit of x (count of trailing zero bits).
 * The result is meaningless for x == 0; fsst_decompress() only calls this with a non-zero escape mask.
 */
static inline int __builtin_ctzl(unsigned long long x) {
   unsigned long ret = 0; /* the intrinsics leave the index untouched when no bit is set */
#if defined(_WIN64)
   _BitScanForward64(&ret, x);
   return (int)ret;
#else
   /* _BitScanForward64 is only available on 64-bit targets: scan the low half, then the high half. */
   if (_BitScanForward(&ret, (unsigned long)(x & 0xFFFFFFFFull)))
      return (int)ret;
   _BitScanForward(&ret, (unsigned long)(x >> 32));
   return (int)ret + 32;
#endif
}
#endif
| 62 | + |
#ifdef __cplusplus
#define FSST_FALLTHROUGH [[fallthrough]]
#include <cstring>
extern "C" {
#else
#define FSST_FALLTHROUGH
#include <string.h> /* memcpy(), used by fsst_decompress() */
#endif

#include <stddef.h>
| 72 | + |
/* A compressed string is simply a string of 1-byte codes; except for code 255, which is followed by an uncompressed byte. */
#define FSST_ESC 255

/* Data structure needed for compressing strings - use fsst_duplicate() to create thread-local copies. Use fsst_destroy() to free. */
typedef void* fsst_encoder_t; /* opaque type - it wraps around a rather large (~900KB) C++ object */

/* Data structure needed for decompressing strings - read-only and thus can be shared between multiple decompressing threads. */
typedef struct {
   unsigned long long version;     /* version id */
   unsigned char zeroTerminated;   /* terminator is a single-byte code that does not appear in longer symbols */
   unsigned char len[255];         /* len[x] is the byte-length of the symbol x (1 <= len[x] <= 8). */
   unsigned long long symbol[255]; /* symbol[x] contains in LITTLE_ENDIAN the bytesequence that code x represents (0 <= x < 255). */
} fsst_decoder_t;

/* Calibrate a FSST symboltable from a batch of strings (it is best to provide at least 16KB of data). */
fsst_encoder_t*
fsst_create(
   size_t n,                     /* IN: number of strings in batch to sample from. */
   const size_t lenIn[],         /* IN: byte-lengths of the inputs */
   const unsigned char *strIn[], /* IN: string start pointers. */
   int zeroTerminated            /* IN: whether input strings are zero-terminated. If so, encoded strings are as well (i.e. symbol[0]=""). */
);

/* Create another encoder instance, necessary to do multi-threaded encoding using the same symbol table. */
fsst_encoder_t*
fsst_duplicate(
   fsst_encoder_t *encoder /* IN: the symbol table to duplicate. */
);

#define FSST_MAXHEADER (8+1+8+2048+1) /* maxlen of serialized fsst header, produced/consumed by fsst_export() resp. fsst_import() */

/* Space-efficient symbol table serialization (smaller than sizeof(fsst_decoder_t) - by saving on the unused bytes in symbols of len < 8). */
unsigned int /* OUT: number of bytes written in buf, at most sizeof(fsst_decoder_t) */
fsst_export(
   fsst_encoder_t *encoder, /* IN: the symbol table to dump. */
   unsigned char *buf       /* OUT: pointer to a byte-buffer where to serialize this symbol table. */
);

/* Deallocate encoder. */
void
fsst_destroy(fsst_encoder_t*);

/* Return a decoder structure from serialized format (typically used in a block-, file- or row-group header). */
unsigned int /* OUT: number of bytes consumed in buf (0 on failure). */
fsst_import(
   fsst_decoder_t *decoder,  /* IN: this symbol table will be overwritten. */
   unsigned char const *buf  /* IN: pointer to a byte-buffer where fsst_export() serialized this symbol table. */
);

/* Return a decoder structure from an encoder. */
fsst_decoder_t
fsst_decoder(
   fsst_encoder_t *encoder
);

/* Compress a batch of strings (on AVX512 machines best performance is obtained by compressing more than 32KB of string volume). */
/* The output buffer must be large; at least "conservative space" (7+2*inputlength) for the first string for something to happen. */
size_t /* OUT: the number of compressed strings (<=n) that fit the output buffer. */
fsst_compress(
   fsst_encoder_t *encoder,      /* IN: encoder obtained from fsst_create(). */
   size_t nstrings,              /* IN: number of strings in batch to compress. */
   const size_t lenIn[],         /* IN: byte-lengths of the inputs */
   const unsigned char *strIn[], /* IN: input string start pointers. */
   size_t outsize,               /* IN: byte-length of output buffer. */
   unsigned char *output,        /* OUT: memory buffer to put the compressed strings in (one after the other). */
   size_t lenOut[],              /* OUT: byte-lengths of the compressed strings. */
   unsigned char *strOut[]       /* OUT: output string start pointers. Will all point into [output,output+outsize). */
);

/*
 * Decompress a single string, inlined for speed.
 *
 * The function is 'static inline' so that it also links when included from plain C: a bare 'inline' definition
 * in C does not provide an external definition, and any call the compiler chooses not to inline would be unresolved.
 *
 * Decoding runs in two phases:
 *  - a fast phase that stores a whole 8-byte symbol per code regardless of its real length; it only runs while
 *    there is enough slack in the output buffer for those over-long stores.
 *  - a careful tail phase that copies byte-by-byte and never writes at or beyond output+size.
 *
 * The input is not validated: every FSST_ESC must be followed by the escaped byte inside [strIn,strIn+lenIn),
 * otherwise bytes outside the input (and, for a trailing escape in the fast phase, outside the symbol table) are read.
 *
 * In zero-terminated mode a truncated result is forced to end in a zero byte (only if size > 0).
 */
static inline size_t /* OUT: bytesize of the decompressed string. If > size, the decoded output is truncated to size. */
fsst_decompress(
   const fsst_decoder_t *decoder, /* IN: use this symbol table for decompression. */
   size_t lenIn,                  /* IN: byte-length of compressed string. */
   const unsigned char *strIn,    /* IN: compressed string. */
   size_t size,                   /* IN: byte-length of output buffer. */
   unsigned char *output          /* OUT: memory buffer to put the decompressed string in. */
) {
   const unsigned char *__restrict__ len = decoder->len;
   unsigned char *__restrict__ strOut = output;
   const unsigned long long *__restrict__ symbol = decoder->symbol;
   size_t code, posOut = 0, posIn = 0;
#ifndef FSST_MUST_ALIGN /* defining on platforms that require aligned memory access may help their performance */
#define FSST_UNALIGNED_STORE(dst,src) memcpy((unsigned long long*) (dst), &(src), sizeof(unsigned long long))
#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
   /* four codes per iteration; 4 x 8-byte stores need at most 32 bytes of room in the output buffer */
   while (posOut+32 <= size && posIn+4 <= lenIn) {
      unsigned int nextBlock, escapeMask;
      memcpy(&nextBlock, strIn+posIn, sizeof(unsigned int));
      /* escapeMask gets bit 7 set in every byte lane of nextBlock that equals 0xFF (FSST_ESC) */
      escapeMask = (nextBlock&0x80808080u)&((((~nextBlock)&0x7F7F7F7Fu)+0x7F7F7F7Fu)^0x80808080u);
      if (escapeMask == 0) {
         code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
         code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
         code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
         code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
      } else {
         /* byte index (0..3) of the first escape: decode the codes in front of it, then the escaped byte */
         unsigned long firstEscapePos=__builtin_ctzl((unsigned long long) escapeMask)>>3;
         switch(firstEscapePos) { /* Duff's device */
         case 3: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
                 // fall through
         case 2: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
                 // fall through
         case 1: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
                 // fall through
         case 0: posIn+=2; strOut[posOut++] = strIn[posIn-1]; /* decompress an escaped byte */
         }
      }
   }
   if (posOut+32 <= size) { // handle the possibly 3 last bytes without a loop
      if (posIn+2 <= lenIn) {
         strOut[posOut] = strIn[posIn+1]; /* speculative: correct if strIn[posIn] is an escape, overwritten otherwise */
         if (strIn[posIn] != FSST_ESC) {
            code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
            if (strIn[posIn] != FSST_ESC) {
               code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
            } else {
               posIn += 2; strOut[posOut++] = strIn[posIn-1];
            }
         } else {
            posIn += 2; posOut++;
         }
      }
      if (posIn < lenIn) { // last code cannot be an escape
         code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
      }
   }
#else
   while (posOut+8 <= size && posIn < lenIn)
      if ((code = strIn[posIn++]) < FSST_ESC) { /* symbol compressed as code? */
         FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); /* unaligned memory write */
         posOut += len[code];
      } else {
         strOut[posOut] = strIn[posIn]; /* decompress an escaped byte */
         posIn++; posOut++;
      }
#endif
#endif
   /* careful tail: keeps counting the decoded size in posOut, but only writes while there is room */
   while (posIn < lenIn) {
      code = strIn[posIn++];
      if (code < FSST_ESC) {
         const unsigned char *symbolBytes = (const unsigned char*) &symbol[code];
         size_t symLen = len[code];
         size_t room = (posOut < size) ? (size - posOut) : 0;
         size_t ncopy = (symLen < room) ? symLen : room;
         size_t i;
         for (i = 0; i < ncopy; i++)
            strOut[posOut+i] = symbolBytes[i];
         posOut += symLen;
      } else {
         if (posOut < size) strOut[posOut] = strIn[posIn]; /* escaped byte, only written if there is room */
         posIn++; posOut++;
      }
   }
   /* size > 0 is required: with an empty output buffer there is no last byte to terminate */
   if (size > 0 && posOut >= size && (decoder->zeroTerminated&1)) strOut[size-1] = 0;
   return posOut; /* full size of decompressed string (could be >size, then the actually decompressed part) */
}
| 223 | + |
| 224 | +#ifdef __cplusplus |
| 225 | +} |
| 226 | +#endif |
| 227 | +#endif /* FSST_INCLUDED_H */ |
0 commit comments