Skip to content

Commit c49f19c

Browse files
committed
vendor fsst
1 parent f9b43c4 commit c49f19c

File tree

9 files changed

+2072
-36
lines changed

9 files changed

+2072
-36
lines changed

cpp/cmake_modules/ThirdpartyToolchain.cmake

Lines changed: 3 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -643,16 +643,6 @@ else()
643643
)
644644
endif()
645645

646-
set(FSST_SOURCE_URL "")
647-
set(FSST_GIT_REPOSITORY "")
648-
if(DEFINED ENV{ARROW_FSST_URL})
649-
set(FSST_SOURCE_URL "$ENV{ARROW_FSST_URL}")
650-
elseif(DEFINED ENV{ARROW_FSST_GIT_REPOSITORY})
651-
set(FSST_GIT_REPOSITORY "$ENV{ARROW_FSST_GIT_REPOSITORY}")
652-
else()
653-
set(FSST_GIT_REPOSITORY "https://github.com/cwida/fsst.git")
654-
endif()
655-
656646
if(DEFINED ENV{ARROW_GBENCHMARK_URL})
657647
set(GBENCHMARK_SOURCE_URL "$ENV{ARROW_GBENCHMARK_URL}")
658648
else()
@@ -2621,34 +2611,11 @@ if(ARROW_USE_XSIMD)
26212611
endif()
26222612

26232613
function(build_fsst)
2624-
message(STATUS "Building FSST from source using FetchContent")
2625-
2626-
if(FSST_SOURCE_URL)
2627-
fetchcontent_declare(fsst
2628-
${FC_DECLARE_COMMON_OPTIONS}
2629-
URL ${FSST_SOURCE_URL}
2630-
URL_HASH "SHA256=${ARROW_FSST_BUILD_SHA256_CHECKSUM}")
2631-
else()
2632-
if(NOT FSST_GIT_REPOSITORY)
2633-
message(FATAL_ERROR "FSST_GIT_REPOSITORY is not set and no FSST_SOURCE_URL override was provided.")
2634-
endif()
2635-
fetchcontent_declare(fsst
2636-
${FC_DECLARE_COMMON_OPTIONS}
2637-
GIT_REPOSITORY ${FSST_GIT_REPOSITORY}
2638-
GIT_TAG ${ARROW_FSST_BUILD_VERSION}
2639-
GIT_SHALLOW TRUE
2640-
GIT_PROGRESS TRUE)
2641-
endif()
2642-
2643-
prepare_fetchcontent()
2644-
fetchcontent_getproperties(fsst)
2645-
if(NOT fsst_POPULATED)
2646-
fetchcontent_populate(fsst)
2647-
endif()
2614+
message(STATUS "Configuring vendored FSST sources")
26482615

2649-
set(ARROW_FSST_INCLUDE_DIR "${fsst_SOURCE_DIR}" PARENT_SCOPE)
2616+
set(ARROW_FSST_INCLUDE_DIR "${ARROW_SOURCE_DIR}/thirdparty/fsst" PARENT_SCOPE)
26502617
set(ARROW_FSST_SOURCES
2651-
"${fsst_SOURCE_DIR}/libfsst.cpp;${fsst_SOURCE_DIR}/fsst_avx512.cpp"
2618+
"${ARROW_SOURCE_DIR}/thirdparty/fsst/libfsst.cpp;${ARROW_SOURCE_DIR}/thirdparty/fsst/fsst_avx512.cpp"
26522619
PARENT_SCOPE)
26532620
set(FSST_VENDORED TRUE PARENT_SCOPE)
26542621
endfunction()

cpp/thirdparty/fsst/fsst.h

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
/*
2+
* the API for FSST compression -- (c) Peter Boncz, Viktor Leis and Thomas Neumann (CWI, TU Munich), 2018-2019
3+
*
4+
* ===================================================================================================================================
5+
* this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT):
6+
*
7+
* Copyright 2018-2020, CWI, TU Munich, FSU Jena
8+
*
9+
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
10+
* (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify,
11+
* merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
12+
* furnished to do so, subject to the following conditions:
13+
*
14+
* - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17+
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18+
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
19+
* IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20+
*
21+
* You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
22+
* ===================================================================================================================================
23+
*
24+
* FSST: Fast Static Symbol Table compression
25+
* see the paper https://github.com/cwida/fsst/raw/master/fsstcompression.pdf
26+
*
27+
* FSST is a compression scheme focused on string/text data: it can compress strings from distributions with many different values (i.e.
28+
* where dictionary compression will not work well). It allows *random-access* to compressed data: it is not block-based, so individual
29+
* strings can be decompressed without touching the surrounding data in a compressed block. When compared to e.g. lz4 (which is
30+
* block-based), FSST achieves similar decompression speed, (2x) better compression speed and 30% better compression ratio on text.
31+
*
32+
* FSST encodes strings also using a symbol table -- but it works on pieces of the string, as it maps "symbols" (1-8 byte sequences)
33+
* onto "codes" (single-bytes). FSST can also represent a byte as an exception (255 followed by the original byte). Hence, compression
34+
* transforms a sequence of bytes into a (supposedly shorter) sequence of codes or escaped bytes. These shorter byte-sequences could
35+
* be seen as strings again and fit into whatever part of your program manipulates strings.
36+
*
37+
* useful property: FSST ensures that strings that are equal, are also equal in their compressed form.
38+
*
39+
* In this API, strings are considered byte-arrays (byte = unsigned char) and a batch of strings is represented as an array of
40+
* unsigned char* pointers to their starts. A separate length array (of size_t) denotes how many bytes each string consists of.
41+
*
42+
* This representation as unsigned char* pointers tries to assume as little as possible on the memory management of the program
43+
* that calls this API, and is also intended to allow passing strings into this API without copying (even if you use C++ strings).
44+
*
45+
* We optionally support C-style zero-terminated strings (zero appearing only at the end). In this case, the compressed strings are
46+
* also zero-terminated strings. In zero-terminated mode, the zero-byte at the end *is* counted in the string byte-length.
47+
*/
48+
#ifndef FSST_INCLUDED_H
49+
#define FSST_INCLUDED_H
50+
51+
#ifdef _MSC_VER
52+
#define __restrict__
53+
#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
54+
#define __ORDER_LITTLE_ENDIAN__ 2
55+
#include <intrin.h>
56+
static inline int __builtin_ctzl(unsigned long long x) {
57+
unsigned long ret;
58+
_BitScanForward64(&ret, x);
59+
return (int)ret;
60+
}
61+
#endif
62+
63+
#ifdef __cplusplus
64+
#define FSST_FALLTHROUGH [[fallthrough]]
65+
#include <cstring>
66+
extern "C" {
67+
#else
68+
#define FSST_FALLTHROUGH
69+
#endif
70+
71+
#include <stddef.h>
72+
73+
/* A compressed string is simply a string of 1-byte codes; except for code 255, which is followed by an uncompressed byte. */
74+
#define FSST_ESC 255
75+
76+
/* Data structure needed for compressing strings - use fsst_duplicate() to create thread-local copies. Use fsst_destroy() to free. */
77+
typedef void* fsst_encoder_t; /* opaque type - it wraps around a rather large (~900KB) C++ object */
78+
79+
/* Data structure needed for decompressing strings - read-only and thus can be shared between multiple decompressing threads. */
80+
typedef struct {
81+
unsigned long long version; /* version id */
82+
unsigned char zeroTerminated; /* terminator is a single-byte code that does not appear in longer symbols */
83+
unsigned char len[255]; /* len[x] is the byte-length of the symbol x (1 <= len[x] <= 8). */
84+
unsigned long long symbol[255]; /* symbol[x] contains in LITTLE_ENDIAN the bytesequence that code x represents (0 <= x < 255). */
85+
} fsst_decoder_t;
86+
87+
/* Calibrate a FSST symboltable from a batch of strings (it is best to provide at least 16KB of data). */
88+
fsst_encoder_t*
89+
fsst_create(
90+
size_t n, /* IN: number of strings in batch to sample from. */
91+
const size_t lenIn[], /* IN: byte-lengths of the inputs */
92+
const unsigned char *strIn[], /* IN: string start pointers. */
93+
int zeroTerminated /* IN: whether input strings are zero-terminated. If so, encoded strings are as well (i.e. symbol[0]=""). */
94+
);
95+
96+
/* Create another encoder instance, necessary to do multi-threaded encoding using the same symbol table. */
97+
fsst_encoder_t*
98+
fsst_duplicate(
99+
fsst_encoder_t *encoder /* IN: the symbol table to duplicate. */
100+
);
101+
102+
#define FSST_MAXHEADER (8+1+8+2048+1) /* maxlen of the serialized fsst header, produced/consumed by fsst_export() resp. fsst_import() */
103+
104+
/* Space-efficient symbol table serialization (smaller than sizeof(fsst_decoder_t) - by saving on the unused bytes in symbols of len < 8). */
105+
unsigned int /* OUT: number of bytes written in buf, at most sizeof(fsst_decoder_t) */
106+
fsst_export(
107+
fsst_encoder_t *encoder, /* IN: the symbol table to dump. */
108+
unsigned char *buf /* OUT: pointer to a byte-buffer where to serialize this symbol table. */
109+
);
110+
111+
/* Deallocate encoder. */
112+
void
113+
fsst_destroy(fsst_encoder_t*);
114+
115+
/* Return a decoder structure from serialized format (typically used in a block-, file- or row-group header). */
116+
unsigned int /* OUT: number of bytes consumed in buf (0 on failure). */
117+
fsst_import(
118+
fsst_decoder_t *decoder, /* IN: this symbol table will be overwritten. */
119+
unsigned char const *buf /* IN: pointer to a byte-buffer where fsst_export() serialized this symbol table. */
120+
);
121+
122+
/* Return a decoder structure from an encoder. */
123+
fsst_decoder_t
124+
fsst_decoder(
125+
fsst_encoder_t *encoder
126+
);
127+
128+
/* Compress a batch of strings (on AVX512 machines best performance is obtained by compressing more than 32KB of string volume). */
129+
/* The output buffer must be large; at least "conservative space" (7+2*inputlength) for the first string for something to happen. */
130+
size_t /* OUT: the number of compressed strings (<=nstrings) that fit the output buffer. */
131+
fsst_compress(
132+
fsst_encoder_t *encoder, /* IN: encoder obtained from fsst_create(). */
133+
size_t nstrings, /* IN: number of strings in batch to compress. */
134+
const size_t lenIn[], /* IN: byte-lengths of the inputs */
135+
const unsigned char *strIn[], /* IN: input string start pointers. */
136+
size_t outsize, /* IN: byte-length of output buffer. */
137+
unsigned char *output, /* OUT: memory buffer to put the compressed strings in (one after the other). */
138+
size_t lenOut[], /* OUT: byte-lengths of the compressed strings. */
139+
unsigned char *strOut[] /* OUT: output string start pointers. Will all point into [output,output+outsize). */
140+
);
141+
142+
/* Decompress a single string, inlined for speed. */
143+
inline size_t /* OUT: bytesize of the decompressed string. If > size, the decoded output is truncated to size. */
144+
fsst_decompress(
145+
const fsst_decoder_t *decoder, /* IN: use this symbol table for decompression. */
146+
size_t lenIn, /* IN: byte-length of compressed string. */
147+
const unsigned char *strIn, /* IN: compressed string. */
148+
size_t size, /* IN: byte-length of output buffer. */
149+
unsigned char *output /* OUT: memory buffer to put the decompressed string in. */
150+
) {
151+
unsigned char*__restrict__ len = (unsigned char* __restrict__) decoder->len;
152+
unsigned char*__restrict__ strOut = (unsigned char* __restrict__) output;
153+
unsigned long long*__restrict__ symbol = (unsigned long long* __restrict__) decoder->symbol;
154+
size_t code, posOut = 0, posIn = 0;
155+
#ifndef FSST_MUST_ALIGN /* defining on platforms that require aligned memory access may help their performance */
156+
#define FSST_UNALIGNED_STORE(dst,src) memcpy((unsigned long long*) (dst), &(src), sizeof(unsigned long long))
157+
#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
158+
while (posOut+32 <= size && posIn+4 <= lenIn) {
159+
unsigned int nextBlock, escapeMask;
160+
memcpy(&nextBlock, strIn+posIn, sizeof(unsigned int));
161+
escapeMask = (nextBlock&0x80808080u)&((((~nextBlock)&0x7F7F7F7Fu)+0x7F7F7F7Fu)^0x80808080u);
162+
if (escapeMask == 0) {
163+
code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
164+
code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
165+
code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
166+
code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
167+
} else {
168+
unsigned long firstEscapePos=__builtin_ctzl((unsigned long long) escapeMask)>>3;
169+
switch(firstEscapePos) { /* Duff's device */
170+
case 3: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
171+
// fall through
172+
case 2: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
173+
// fall through
174+
case 1: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
175+
// fall through
176+
case 0: posIn+=2; strOut[posOut++] = strIn[posIn-1]; /* decompress an escaped byte */
177+
}
178+
}
179+
}
180+
if (posOut+32 <= size) { // handle the possibly 3 last bytes without a loop
181+
if (posIn+2 <= lenIn) {
182+
strOut[posOut] = strIn[posIn+1];
183+
if (strIn[posIn] != FSST_ESC) {
184+
code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
185+
if (strIn[posIn] != FSST_ESC) {
186+
code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
187+
} else {
188+
posIn += 2; strOut[posOut++] = strIn[posIn-1];
189+
}
190+
} else {
191+
posIn += 2; posOut++;
192+
}
193+
}
194+
if (posIn < lenIn) { // last code cannot be an escape
195+
code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
196+
}
197+
}
198+
#else
199+
while (posOut+8 <= size && posIn < lenIn)
200+
if ((code = strIn[posIn++]) < FSST_ESC) { /* symbol compressed as code? */
201+
FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); /* unaligned memory write */
202+
posOut += len[code];
203+
} else {
204+
strOut[posOut] = strIn[posIn]; /* decompress an escaped byte */
205+
posIn++; posOut++;
206+
}
207+
#endif
208+
#endif
209+
while (posIn < lenIn)
210+
if ((code = strIn[posIn++]) < FSST_ESC) {
211+
size_t posWrite = posOut, endWrite = posOut + len[code];
212+
unsigned char* __restrict__ symbolPointer = ((unsigned char* __restrict__) &symbol[code]) - posWrite;
213+
if ((posOut = endWrite) > size) endWrite = size;
214+
for(; posWrite < endWrite; posWrite++) /* only write if there is room */
215+
strOut[posWrite] = symbolPointer[posWrite];
216+
} else {
217+
if (posOut < size) strOut[posOut] = strIn[posIn]; /* idem */
218+
posIn++; posOut++;
219+
}
220+
if (posOut >= size && (decoder->zeroTerminated&1)) strOut[size-1] = 0;
221+
return posOut; /* full size of the decompressed string (may be > size, in which case only the first size bytes were written) */
222+
}
223+
224+
#ifdef __cplusplus
225+
}
226+
#endif
227+
#endif /* FSST_INCLUDED_H */

0 commit comments

Comments
 (0)