Skip to content

Commit 44b7e9f

Browse files
Merge pull request ClickHouse#79350 from sachinkumarsingh092/string-bytes-functions
feat: Add stringBytesUniq and stringBytesEntropy functions
2 parents e4b0ea7 + 7e26ad4 commit 44b7e9f

File tree

8 files changed

+462
-0
lines changed

8 files changed

+462
-0
lines changed

ci/jobs/scripts/check_style/aspell-ignore/en/aspell-dict.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2836,6 +2836,8 @@ stochasticlinearregression
28362836
stochasticlogisticregression
28372837
storages
28382838
storig
2839+
stringBytesEntropy
2840+
stringBytesUniq
28392841
stringCompare
28402842
stringJaccardIndex
28412843
stringJaccardIndexUTF

docs/en/sql-reference/functions/string-functions.md

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2637,3 +2637,67 @@ Result:
26372637
1. │ [417784657,728683856,3071092609] │
26382638
└──────────────────────────────────┘
26392639
```
2640+
2641+
## stringBytesUniq {#stringbytesuniq}
2642+
2643+
Counts the number of distinct bytes in a string.
2644+
2645+
**Syntax**
2646+
2647+
```sql
2648+
stringBytesUniq(s)
2649+
```
2650+
2651+
**Arguments**
2652+
2653+
- `s` — The string to analyze. [String](../data-types/string.md).
2654+
2655+
**Returned value**
2656+
2657+
- The number of distinct bytes in the string. [UInt16](../data-types/int-uint.md).
2658+
2659+
**Example**
2660+
2661+
```sql
2662+
SELECT stringBytesUniq('Hello');
2663+
```
2664+
2665+
Result:
2666+
2667+
```result
2668+
┌─stringBytesUniq('Hello')─┐
2669+
│ 4 │
2670+
└──────────────────────────┘
2671+
```
2672+
2673+
## stringBytesEntropy {#stringbytesentropy}
2674+
2675+
Calculates the Shannon entropy of the byte distribution in a string, in bits (base-2 logarithm).
2676+
2677+
**Syntax**
2678+
2679+
```sql
2680+
stringBytesEntropy(s)
2681+
```
2682+
2683+
**Arguments**
2684+
2685+
- `s` — The string to analyze. [String](../data-types/string.md).
2686+
2687+
**Returned value**
2688+
2689+
- Shannon's entropy of byte distribution in the string. [Float64](../data-types/float.md).
2690+
2691+
**Example**
2692+
2693+
```sql
2694+
SELECT stringBytesEntropy('Hello, world!');
2695+
```
2696+
2697+
Result:
2698+
2699+
```result
2700+
┌─stringBytesEntropy('Hello, world!')─┐
2701+
│                   3.180832987205441 │
2702+
└─────────────────────────────────────┘
2703+
```

src/Functions/stringBytes.h

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#pragma once
2+
3+
#include <Columns/ColumnString.h>
4+
#include <Columns/ColumnVector.h>
5+
#include <DataTypes/DataTypeString.h>
6+
#include <DataTypes/DataTypesNumber.h>
7+
#include <Functions/IFunction.h>
8+
#include <Common/BitHelpers.h>
9+
10+
#include <cmath>
11+
12+
namespace DB
13+
{
14+
15+
namespace ErrorCodes
16+
{
17+
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
18+
extern const int ILLEGAL_COLUMN;
19+
}
20+
21+
template <typename Impl, typename Name>
22+
class FunctionStringBytes : public IFunction
23+
{
24+
public:
25+
static constexpr auto name = Name::name;
26+
using ResultType = typename Impl::ResultType;
27+
28+
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionStringBytes>(); }
29+
30+
String getName() const override { return name; }
31+
32+
size_t getNumberOfArguments() const override { return 1; }
33+
34+
bool useDefaultImplementationForConstants() const override { return true; }
35+
36+
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo &) const override { return true; }
37+
38+
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
39+
{
40+
if (!isString(arguments[0].type))
41+
throw Exception(
42+
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
43+
"Illegal type {} of argument of function {}",
44+
arguments[0].type->getName(),
45+
getName());
46+
47+
if constexpr (std::is_same_v<ResultType, UInt16>)
48+
return std::make_shared<DataTypeUInt16>();
49+
else if constexpr (std::is_same_v<ResultType, Float64>)
50+
return std::make_shared<DataTypeFloat64>();
51+
}
52+
53+
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
54+
{
55+
const ColumnPtr column = arguments[0].column;
56+
const ColumnString * col_str = checkAndGetColumn<ColumnString>(column.get());
57+
58+
if (!col_str)
59+
throw Exception(
60+
ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", arguments[0].column->getName(), getName());
61+
62+
auto col_res = ColumnVector<ResultType>::create();
63+
auto & vec_res = col_res->getData();
64+
vec_res.resize(input_rows_count);
65+
66+
const ColumnString::Chars & data = col_str->getChars();
67+
const ColumnString::Offsets & offsets = col_str->getOffsets();
68+
69+
size_t prev_offset = 0;
70+
for (size_t i = 0; i < input_rows_count; ++i)
71+
{
72+
const UInt8 * data_ptr = data.data() + prev_offset;
73+
const size_t size = offsets[i] - prev_offset - 1;
74+
75+
vec_res[i] = Impl::process(data_ptr, size);
76+
prev_offset = offsets[i];
77+
}
78+
79+
return col_res;
80+
}
81+
};
82+
83+
}
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
#include <Columns/ColumnConst.h>
2+
#include <Columns/ColumnString.h>
3+
#include <Columns/ColumnVector.h>
4+
#include <DataTypes/DataTypeString.h>
5+
#include <DataTypes/DataTypesNumber.h>
6+
#include <Functions/FunctionFactory.h>
7+
#include <Functions/IFunction.h>
8+
#include <Functions/stringBytes.h>
9+
#include <Common/BitHelpers.h>
10+
#include <Common/PODArray.h>
11+
12+
#include <cmath>
13+
14+
namespace DB
15+
{
16+
17+
struct StringBytesEntropyImpl
18+
{
19+
using ResultType = Float64;
20+
21+
static ResultType process(const UInt8 * data, size_t size)
22+
{
23+
if (size == 0)
24+
return 0;
25+
26+
std::array<UInt32, 256> counters{};
27+
const UInt8 * end = data + size;
28+
29+
for (; data < end; ++data)
30+
counters[*data]++;
31+
32+
Float64 entropy = 0.0;
33+
34+
for (size_t byte = 0; byte < 256; ++byte)
35+
{
36+
UInt32 count = counters[byte];
37+
if (count > 0)
38+
{
39+
Float64 p = static_cast<Float64>(count) / size;
40+
entropy -= p * std::log2(p);
41+
}
42+
}
43+
44+
return entropy;
45+
}
46+
};
47+
48+
/// Compile-time carrier of the function name used for the entropy variant of FunctionStringBytes.
struct NameStringBytesEntropy
{
    static constexpr const char * name = "stringBytesEntropy";
};
52+
53+
using FunctionStringBytesEntropy = FunctionStringBytes<StringBytesEntropyImpl, NameStringBytesEntropy>;
54+
55+
/// Registers stringBytesEntropy in the function factory together with its embedded documentation.
REGISTER_FUNCTION(StringBytesEntropy)
{
    FunctionDocumentation::Description description = "Calculates Shannon's entropy of byte distribution in a string.";
    FunctionDocumentation::Syntax syntax = "stringBytesEntropy(s);";
    FunctionDocumentation::Arguments arguments = {
        /// The Markdown link must end with exactly one ')' to be well-formed.
        {"s", "The string to analyze. [String](../../sql-reference/data-types/string.md)"}
    };
    FunctionDocumentation::ReturnedValue returned_value = "The Shannon entropy of the byte distribution. [Float64](../../sql-reference/data-types/float.md).";
    FunctionDocumentation::Examples examples = {
        /// 'Hello, world!' has 13 bytes: 'l' x3, 'o' x2 and eight bytes that occur once.
        {"Example",
         "SELECT stringBytesEntropy('Hello, world!');",
         "3.180832987205441"}
    };
    FunctionDocumentation::Category category = FunctionDocumentation::Category::String;

    FunctionDocumentation function_documentation = {
        .description = description,
        .syntax = syntax,
        .arguments = arguments,
        .returned_value = returned_value,
        .examples = examples,
        .category = category
    };

    factory.registerFunction<FunctionStringBytesEntropy>(function_documentation);
}
81+
82+
}

src/Functions/stringBytesUniq.cpp

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#include <Columns/ColumnConst.h>
2+
#include <Columns/ColumnString.h>
3+
#include <Columns/ColumnVector.h>
4+
#include <DataTypes/DataTypeString.h>
5+
#include <DataTypes/DataTypesNumber.h>
6+
#include <Functions/FunctionFactory.h>
7+
#include <Functions/stringBytes.h>
8+
#include <Functions/IFunction.h>
9+
#include <Common/BitHelpers.h>
10+
#include <Common/PODArray.h>
11+
12+
#include <bit>
13+
#include <cmath>
14+
15+
namespace DB
16+
{
17+
18+
struct StringBytesUniqImpl
19+
{
20+
using ResultType = UInt16;
21+
22+
static ResultType process(const UInt8 * data, size_t size)
23+
{
24+
UInt64 mask[4] = {0};
25+
const UInt8 * end = data + size;
26+
27+
for (; data < end; ++data)
28+
{
29+
UInt8 byte = *data;
30+
mask[byte >> 6] |= (1ULL << (byte & 0x3F));
31+
}
32+
33+
return std::popcount(mask[0]) + std::popcount(mask[1]) + std::popcount(mask[2]) + std::popcount(mask[3]);
34+
}
35+
};
36+
37+
38+
/// Compile-time carrier of the function name used for the distinct-bytes variant of FunctionStringBytes.
struct NameStringBytesUniq
{
    static constexpr const char * name = "stringBytesUniq";
};
42+
43+
44+
using FunctionStringBytesUniq = FunctionStringBytes<StringBytesUniqImpl, NameStringBytesUniq>;
45+
46+
/// Registers stringBytesUniq in the function factory together with its embedded documentation.
REGISTER_FUNCTION(StringBytesUniq)
{
    /// All documentation fields are filled in a single designated-initializer expression.
    FunctionDocumentation uniq_documentation = {
        .description = "Counts the number of distinct bytes in a string.",
        .syntax = "stringBytesUniq(s);",
        .arguments = {
            {"s", "The string to analyze. [String](../../sql-reference/data-types/string.md)"}
        },
        .returned_value = "The number of distinct bytes in the string. [UInt16](../../sql-reference/data-types/int-uint.md).",
        .examples = {
            {"Example",
             "SELECT stringBytesUniq('Hello, world!');",
             "10"}
        },
        .category = FunctionDocumentation::Category::String
    };

    factory.registerFunction<FunctionStringBytesUniq>(uniq_documentation);
}
72+
73+
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
<test>
2+
<create_query>DROP TABLE IF EXISTS test_string_bytes</create_query>
3+
<create_query>CREATE TABLE test_string_bytes(s String) ENGINE Memory</create_query>
4+
5+
<fill_query>INSERT INTO test_string_bytes SELECT reinterpretAsString(arrayJoin(range(0, 256))) FROM numbers(1000000)</fill_query>
6+
7+
<query>SELECT sum(stringBytesUniq(s)) FROM test_string_bytes FORMAT Null</query>
8+
<query>SELECT sum(stringBytesEntropy(s)) FROM test_string_bytes FORMAT Null</query>
9+
10+
<drop_query>DROP TABLE IF EXISTS test_string_bytes</drop_query>
11+
</test>

0 commit comments

Comments
 (0)