44#include < DataTypes/DataTypeString.h>
55#include < DataTypes/DataTypesNumber.h>
66#include < Functions/FunctionFactory.h>
7- #include < Functions/stringBytes.h>
87#include < Functions/IFunction.h>
8+ #include < Functions/stringBytes.h>
99#include < Common/BitHelpers.h>
1010#include < Common/PODArray.h>
1111
1414namespace DB
1515{
1616
/// Histogram of byte values for the string currently being processed.
///
/// Usage: call add() for every byte of a string, read the results with get() /
/// getTotalCount(), then call nextString() before feeding the next string.
///
/// The histogram is cleared eagerly in nextString(). An earlier variant tagged each
/// counter with a single "generation" bit that flipped between two values; a counter
/// left untouched for one string then looked current again two strings later and
/// leaked its stale count. Clearing 256 counters is cheap and cannot go stale, and it
/// also frees the top bit so counts are no longer limited to 2^31 - 1.
class ByteCounters
{
private:
    /// One counter per possible byte value.
    static constexpr size_t COUNTERS_SIZE = 256;
    UInt32 counters[COUNTERS_SIZE] = {0};
    /// Number of add() calls since construction or the last nextString().
    size_t total_count = 0;

public:
    /// Records one occurrence of `byte` in the current string.
    void add(UInt8 byte)
    {
        ++counters[byte];
        ++total_count;
    }

    /// Starts a new string: forgets every count accumulated so far.
    void nextString()
    {
        /// Plain loop instead of std::fill / memset so that no extra header is required.
        for (UInt32 & counter : counters)
            counter = 0;
        total_count = 0;
    }

    /// Returns how many times `byte` was added to the current string (0 if never).
    UInt32 get(UInt8 byte) const
    {
        return counters[byte];
    }

    /// Returns the total number of bytes added to the current string.
    size_t getTotalCount() const { return total_count; }
};
52-
53-
5417struct StringBytesEntropyImpl
5518{
5619 using ResultType = Float64;
@@ -60,21 +23,20 @@ struct StringBytesEntropyImpl
6023 if (size == 0 )
6124 return 0 ;
6225
63- ByteCounters counters;
26+ std::array<UInt32, 256 > counters{} ;
6427 const UInt8 * end = data + size;
6528
6629 for (; data < end; ++data)
67- counters. add ( *data) ;
30+ counters[ *data]++ ;
6831
6932 Float64 entropy = 0.0 ;
70- size_t total = counters.getTotalCount ();
7133
7234 for (size_t byte = 0 ; byte < 256 ; ++byte)
7335 {
74- UInt32 count = counters. get ( byte) ;
36+ UInt32 count = counters[ byte] ;
7537 if (count > 0 )
7638 {
77- Float64 p = static_cast <Float64>(count) / total ;
39+ Float64 p = static_cast <Float64>(count) / size ;
7840 entropy -= p * std::log2 (p);
7941 }
8042 }
0 commit comments