Skip to content

Commit 567df36

Browse files
authored
Merge pull request ClickHouse#80246 from Blargian/array_functions_part_6
Docs: array functions source code documentation - part 6
2 parents 91c76ab + c850d03 commit 567df36

File tree

6 files changed

+165
-18
lines changed

6 files changed

+165
-18
lines changed

src/Functions/array/arrayAUC.cpp

Lines changed: 73 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -492,11 +492,82 @@ class FunctionArrayAUC : public IFunction
492492
REGISTER_FUNCTION(ArrayAUC)
493493
{
494494
/// ROC AUC
495-
factory.registerFunction<FunctionArrayAUC<false>>();
495+
FunctionDocumentation::Description description_roc = R"(
496+
Calculates the area under the receiver operating characteristic (ROC) curve.
497+
A ROC curve is created by plotting True Positive Rate (TPR) on the y-axis and False Positive Rate (FPR) on the x-axis across all thresholds.
498+
The resulting value ranges from zero to one, with a higher value indicating better model performance.
499+
500+
The ROC AUC (also known as simply AUC) is a concept in machine learning.
501+
For more details, please see [here](https://developers.google.com/machine-learning/glossary#pr-auc-area-under-the-pr-curve), [here](https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc#expandable-1) and [here](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve).
502+
)";
503+
FunctionDocumentation::Syntax syntax_roc = "arrayROCAUC(scores, labels[, scale[, partial_offsets]])";
504+
FunctionDocumentation::Arguments arguments_roc = {
505+
{"scores", "Scores prediction model gives. [`Array(T)`](/sql-reference/data-types/array) of [Integers](../data-types/int-uint.md) or [Floats](../data-types/float.md)."},
506+
{"labels", "Labels of samples, usually 1 for positive sample and 0 for negative sample. [Array](/sql-reference/data-types/array) of [Integers](../data-types/int-uint.md) or [Enums](../data-types/enum.md)."},
507+
{"scale", "Decides whether to return the normalized area. If false, returns the area under the TP (true positives) x FP (false positives) curve instead. Default value: true. [Bool](../data-types/boolean.md). Optional."},
508+
{"partial_offsets", R"(
509+
- An array of four non-negative integers for calculating a partial area under the ROC curve (equivalent to a vertical band of the ROC space) instead of the whole AUC. This option is useful for distributed computation of the ROC AUC. The array must contain the following elements [`higher_partitions_tp`, `higher_partitions_fp`, `total_positives`, `total_negatives`]. [Array](/sql-reference/data-types/array) of non-negative [Integers](../data-types/int-uint.md). Optional.
510+
- `higher_partitions_tp`: The number of positive labels in the higher-scored partitions.
511+
- `higher_partitions_fp`: The number of negative labels in the higher-scored partitions.
512+
- `total_positives`: The total number of positive samples in the entire dataset.
513+
- `total_negatives`: The total number of negative samples in the entire dataset.
514+
515+
::::note
516+
When `partial_offsets` is used, `scores` and `labels` should contain only a partition of the entire dataset, covering an interval of scores.
517+
The dataset should be divided into contiguous partitions, where each partition contains the subset of the data whose scores fall within a specific range.
518+
For example:
519+
- One partition could contain all scores in the range [0, 0.5).
520+
- Another partition could contain scores in the range [0.5, 1.0].
521+
::::
522+
)"}
523+
};
524+
FunctionDocumentation::ReturnedValue returned_value_roc = "Returns area under the receiver operating characteristic (ROC) curve. [Float64](../data-types/float.md).";
525+
FunctionDocumentation::Examples examples_roc = {{"Usage example", "SELECT arrayROCAUC([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]);", "0.75"}};
526+
FunctionDocumentation::IntroducedIn introduced_in_roc = {20, 4};
527+
FunctionDocumentation::Category category_roc = FunctionDocumentation::Category::Array;
528+
FunctionDocumentation documentation_roc = {description_roc, syntax_roc, arguments_roc, returned_value_roc, examples_roc, introduced_in_roc, category_roc};
529+
530+
factory.registerFunction<FunctionArrayAUC<false>>(documentation_roc);
496531
factory.registerAlias("arrayAUC", "arrayROCAUC"); /// Backward compatibility, also ROC AUC is often shortened to just AUC
497532

498533
/// PR AUC
499-
factory.registerFunction<FunctionArrayAUC<true>>();
534+
FunctionDocumentation::Description description_pr = R"(
535+
Calculates the area under the precision-recall (PR) curve.
536+
A precision-recall curve is created by plotting precision on the y-axis and recall on the x-axis across all thresholds.
537+
The resulting value ranges from 0 to 1, with a higher value indicating better model performance.
538+
The PR AUC is particularly useful for imbalanced datasets, providing a clearer comparison of performance compared to ROC AUC on those cases.
539+
For more details, please see [here](https://developers.google.com/machine-learning/glossary#pr-auc-area-under-the-pr-curve), [here](https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc#expandable-1) and [here](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve).
540+
)";
541+
FunctionDocumentation::Syntax syntax_pr = "arrayAUCPR(scores, labels[, partial_offsets])";
542+
FunctionDocumentation::Arguments arguments_pr = {
543+
{"scores", "Scores prediction model gives. [Array](/sql-reference/data-types/array) of [Integers](../data-types/int-uint.md) or [Floats](../data-types/float.md)."},
544+
{"labels", "Labels of samples, usually 1 for positive sample and 0 for negative sample. [Array](/sql-reference/data-types/array) of [Integers](../data-types/int-uint.md) or [Enums](../data-types/enum.md)."},
545+
{"partial_offsets", R"(
546+
- An array of three non-negative integers for calculating a partial area under the PR curve (equivalent to a vertical band of the PR space) instead of the whole AUC. This option is useful for distributed computation of the PR AUC. The array must contain the following elements [`higher_partitions_tp`, `higher_partitions_fp`, `total_positives`]. [Array](/sql-reference/data-types/array) of non-negative [Integers](../data-types/int-uint.md). Optional.
547+
- `higher_partitions_tp`: The number of positive labels in the higher-scored partitions.
548+
- `higher_partitions_fp`: The number of negative labels in the higher-scored partitions.
549+
- `total_positives`: The total number of positive samples in the entire dataset.
550+
551+
::::note
552+
When `partial_offsets` is used, `scores` and `labels` should contain only a partition of the entire dataset, covering an interval of scores.
553+
The dataset should be divided into contiguous partitions, where each partition contains the subset of the data whose scores fall within a specific range.
554+
For example:
555+
- One partition could contain all scores in the range [0, 0.5).
556+
- Another partition could contain scores in the range [0.5, 1.0].
557+
::::
558+
)"}
559+
};
560+
FunctionDocumentation::ReturnedValue returned_value_pr = "Returns area under the precision-recall (PR) curve. [Float64](../data-types/float.md).";
561+
FunctionDocumentation::Examples examples_pr = {{"Usage example", "SELECT arrayAUCPR([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]);", R"(
562+
┌─arrayAUCPR([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1])─┐
563+
│ 0.8333333333333333 │
564+
└─────────────────────────────────────────────────┘
565+
)"}};
566+
FunctionDocumentation::IntroducedIn introduced_in_pr = {20, 4};
567+
FunctionDocumentation::Category category_pr = FunctionDocumentation::Category::Array;
568+
FunctionDocumentation documentation_pr = {description_pr, syntax_pr, arguments_pr, returned_value_pr, examples_pr, introduced_in_pr, category_pr};
569+
570+
factory.registerFunction<FunctionArrayAUC<true>>(documentation_pr);
500571
factory.registerAlias("arrayPRAUC", "arrayAUCPR");
501572
}
502573

src/Functions/array/arrayFill.cpp

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,8 +128,55 @@ using FunctionArrayReverseFill = FunctionArrayMapped<ArrayFillImpl<true>, NameAr
128128

129129
REGISTER_FUNCTION(ArrayFill)
130130
{
131-
factory.registerFunction<FunctionArrayFill>();
132-
factory.registerFunction<FunctionArrayReverseFill>();
131+
FunctionDocumentation::Description description = R"(
132+
The `arrayFill` function sequentially processes a source array from the first element
133+
to the last, evaluating a lambda condition at each position using elements from
134+
the source and condition arrays. When the lambda function evaluates to false at
135+
position i, the function replaces that element with the element at position i-1
136+
from the current state of the array. The first element is always preserved
137+
regardless of any condition.
138+
)";
139+
FunctionDocumentation::Syntax syntax = "arrayFill(func(x [, y1, ..., yN]), source [, cond1, ... , condN])";
140+
FunctionDocumentation::Arguments arguments = {
141+
{"func(x [, y1, ..., yN])", "A lambda function `func(x [, y1, y2, ... yN]) → F(x [, y1, y2, ... yN])` which operates on elements of the source array (`x`) and condition arrays (`y`). [Lambda function](/sql-reference/functions/overview#arrow-operator-and-lambda)."},
142+
{"source", "The source array to process [`Array(T)`](/sql-reference/data-types/array)."},
143+
{"[, cond1, ... , condN]", "Optional. N condition arrays providing additional arguments to the lambda function. [`Array(T)`](/sql-reference/data-types/array)."},
144+
};
145+
FunctionDocumentation::ReturnedValue returned_value = "Returns an array. [`Array(T)`](/sql-reference/data-types/array).";
146+
FunctionDocumentation::Examples examples = {
147+
{"Example with single array", "SELECT arrayFill(x -> not isNull(x), [1, null, 2, null]) AS res", "[1,1,2,2]"},
148+
{"Example with two arrays", "SELECT arrayFill((x, y, z) -> x > y AND x < z, [5, 3, 6, 2], [4, 7, 1, 3], [10, 2, 8, 5]) AS res", "[5,5,6,6]"}
149+
};
150+
FunctionDocumentation::IntroducedIn introduced_in = {20, 1};
151+
FunctionDocumentation::Category category = FunctionDocumentation::Category::Array;
152+
FunctionDocumentation documentation = {description, syntax, arguments, returned_value, examples, introduced_in, category};
153+
154+
factory.registerFunction<FunctionArrayFill>(documentation);
155+
156+
FunctionDocumentation::Description description_reverse = R"(
157+
The `arrayReverseFill` function sequentially processes a source array from the last
158+
element to the first, evaluating a lambda condition at each position using elements
159+
from the source and condition arrays. When the condition evaluates to false at
160+
position i, the function replaces that element with the element at position i+1
161+
from the current state of the array. The last element is always preserved
162+
regardless of any condition.
163+
)";
164+
FunctionDocumentation::Syntax syntax_reverse = "arrayReverseFill(func(x[, y1, ..., yN]), source[, cond1, ... , condN])";
165+
FunctionDocumentation::Arguments arguments_reverse = {
166+
{"func(x[, y1, ..., yN])", "A lambda function which operates on elements of the source array (`x`) and condition arrays (`y`). [Lambda function](/sql-reference/functions/overview#arrow-operator-and-lambda)."},
167+
{"source", "The source array to process [`Array(T)`](/sql-reference/data-types/array)."},
168+
{"[, cond1, ... , condN]", "Optional. N condition arrays providing additional arguments to the lambda function. [`Array(T)`](/sql-reference/data-types/array)."},
169+
};
170+
FunctionDocumentation::ReturnedValue returned_value_reverse = "Returns an array in which each element of the source array for which the lambda returns false is replaced by the element that follows it. [`Array(T)`](/sql-reference/data-types/array).";
171+
FunctionDocumentation::Examples examples_reverse = {
172+
{"Example with a single array", "SELECT arrayReverseFill(x -> not isNull(x), [1, null, 2, null]) AS res", "[1,2,2,NULL]"},
173+
{"Example with two arrays", "SELECT arrayReverseFill((x, y, z) -> x > y AND x < z, [5, 3, 6, 2], [4, 7, 1, 3], [10, 2, 8, 5]) AS res;", "[5,6,6,2]"}
174+
};
175+
FunctionDocumentation::IntroducedIn introduced_in_reverse = {20, 1};
176+
FunctionDocumentation::Category category_reverse = FunctionDocumentation::Category::Array;
177+
FunctionDocumentation documentation_reverse = {description_reverse, syntax_reverse, arguments_reverse, returned_value_reverse, examples_reverse, introduced_in_reverse, category_reverse};
178+
179+
factory.registerFunction<FunctionArrayReverseFill>(documentation_reverse);
133180
}
134181

135182
}

src/Functions/array/arrayFilter.cpp

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,31 @@ ColumnPtr ArrayFilterImpl::execute(const ColumnArray & array, ColumnPtr mapped)
4949

5050
REGISTER_FUNCTION(ArrayFilter)
5151
{
52-
factory.registerFunction<FunctionArrayFilter>();
52+
FunctionDocumentation::Description description = "Returns an array containing only the elements in the source array for which a lambda function returns something other than `0`.";
53+
FunctionDocumentation::Syntax syntax = "arrayFilter(func(x[, y1, ..., yN]), source[, cond1, ... , condN])";
54+
FunctionDocumentation::Arguments arguments = {
55+
{"func(x[, y1, ..., yN])", "A lambda function which operates on elements of the source array (`x`) and condition arrays (`y`). [Lambda function](/sql-reference/functions/overview#arrow-operator-and-lambda)."},
56+
{"source", "The source array to process [`Array(T)`](/sql-reference/data-types/array)."},
57+
{"[, cond1, ... , condN]", "Optional. N condition arrays providing additional arguments to the lambda function. [`Array(T)`](/sql-reference/data-types/array)."},
58+
};
59+
FunctionDocumentation::ReturnedValue returned_value = "Returns a subset of the source array. [`Array(T)`](/sql-reference/data-types/array).";
60+
FunctionDocumentation::Examples examples = {
61+
{"Example 1", "SELECT arrayFilter(x -> x LIKE '%World%', ['Hello', 'abc World']) AS res", "['abc World']"},
62+
{"Example 2", R"(
63+
SELECT
64+
arrayFilter(
65+
(i, x) -> x LIKE '%World%',
66+
arrayEnumerate(arr),
67+
['Hello', 'abc World'] AS arr)
68+
AS res
69+
)",
70+
"[2]"}
71+
};
72+
FunctionDocumentation::IntroducedIn introduced_in = {1, 1};
73+
FunctionDocumentation::Category category = FunctionDocumentation::Category::Array;
74+
FunctionDocumentation documentation = {description, syntax, arguments, returned_value, examples, introduced_in, category};
75+
76+
factory.registerFunction<FunctionArrayFilter>(documentation);
5377
}
5478

5579
}

src/Functions/array/arrayMap.cpp

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,24 @@ namespace DB
66

77
REGISTER_FUNCTION(ArrayMap)
88
{
9-
factory.registerFunction<FunctionArrayMap>();
9+
FunctionDocumentation::Description description = R"(
10+
Returns an array obtained from the original arrays by applying a lambda function to each element.
11+
)";
12+
FunctionDocumentation::Syntax syntax = "arrayMap(func, arr)";
13+
FunctionDocumentation::Arguments arguments = {
14+
{"func", "A lambda function which operates on elements of the source array (`x`) and condition arrays (`y`). [Lambda function](/sql-reference/functions/overview#arrow-operator-and-lambda)."},
15+
{"arr", "N arrays to process. [Array(T)](/sql-reference/data-types/array)."},
16+
};
17+
FunctionDocumentation::ReturnedValue returned_value = "Returns an array from the lambda results. [`Array(T)`](/sql-reference/data-types/array).";
18+
FunctionDocumentation::Examples examples = {
19+
{"Usage example", "SELECT arrayMap(x -> (x + 2), [1, 2, 3]) as res;", "[3,4,5]"},
20+
{"Creating a tuple of elements from different arrays", "SELECT arrayMap((x, y) -> (x, y), [1, 2, 3], [4, 5, 6]) AS res", "[(1,4),(2,5),(3,6)]"}
21+
};
22+
FunctionDocumentation::IntroducedIn introduced_in = {1, 1};
23+
FunctionDocumentation::Category category = FunctionDocumentation::Category::Array;
24+
FunctionDocumentation documentation = {description, syntax, arguments, returned_value, examples, introduced_in, category};
25+
26+
factory.registerFunction<FunctionArrayMap>(documentation);
1027
}
1128

1229
}

tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@ age
8888
alphaTokens
8989
and
9090
appendTrailingCharIfAbsent
91-
arrayAUCPR
9291
arrayAll
9392
arrayAvg
9493
arrayConcat
@@ -101,21 +100,16 @@ arrayEnumerateDense
101100
arrayEnumerateDenseRanked
102101
arrayEnumerateUniq
103102
arrayExists
104-
arrayFill
105-
arrayFilter
106103
arrayFirst
107104
arrayFirstIndex
108105
arrayFirstOrNull
109106
arrayLast
110107
arrayLastIndex
111108
arrayLastOrNull
112-
arrayMap
113109
arrayMax
114110
arrayMin
115111
arrayProduct
116-
arrayROCAUC
117112
arrayRandomSample
118-
arrayReverseFill
119113
arrayReverseSplit
120114
arraySplit
121115
arrayStringConcat

tests/queries/0_stateless/02415_all_new_functions_must_have_version_information.reference

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,6 @@ age
104104
alphaTokens
105105
and
106106
appendTrailingCharIfAbsent
107-
arrayAUCPR
108107
arrayAll
109108
arrayAvg
110109
arrayConcat
@@ -117,8 +116,6 @@ arrayEnumerateDense
117116
arrayEnumerateDenseRanked
118117
arrayEnumerateUniq
119118
arrayExists
120-
arrayFill
121-
arrayFilter
122119
arrayFirst
123120
arrayFirstIndex
124121
arrayFirstOrNull
@@ -127,13 +124,10 @@ arrayLastIndex
127124
arrayLastOrNull
128125
arrayLevenshteinDistance
129126
arrayLevenshteinDistanceWeighted
130-
arrayMap
131127
arrayMax
132128
arrayMin
133129
arrayProduct
134-
arrayROCAUC
135130
arrayRandomSample
136-
arrayReverseFill
137131
arrayReverseSplit
138132
arrayRotateLeft
139133
arrayRotateRight

0 commit comments

Comments
 (0)