Merge pull request ClickHouse#80246 from Blargian/array_functions_part_6

rschu1ze · web-flow · commit 567df364929b · 2025-05-19T15:10:21.000Z
Docs: array functions source code documentation - part 6
diff --git a/src/Functions/array/arrayAUC.cpp b/src/Functions/array/arrayAUC.cpp
@@ -492,11 +492,82 @@ class FunctionArrayAUC : public IFunction
 REGISTER_FUNCTION(ArrayAUC)
 {
     /// ROC AUC
-    factory.registerFunction<FunctionArrayAUC<false>>();
+    FunctionDocumentation::Description description_roc = R"(
+Calculates the area under the receiver operating characteristic (ROC) curve.
+A ROC curve is created by plotting True Positive Rate (TPR) on the y-axis and False Positive Rate (FPR) on the x-axis across all thresholds.
+The resulting value ranges from zero to one, with a higher value indicating better model performance.
+
+The ROC AUC (also known as simply AUC) is a concept in machine learning.
+For more details, please see [here](https://developers.google.com/machine-learning/glossary#auc-area-under-the-roc-curve), [here](https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc#expandable-1) and [here](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve).
+)";
+    FunctionDocumentation::Syntax syntax_roc = "arrayROCAUC(scores, labels[, scale[, partial_offsets]])";
+    FunctionDocumentation::Arguments arguments_roc = {
+        {"scores", "Scores produced by the prediction model. [`Array(T)`](/sql-reference/data-types/array) of [Integers](../data-types/int-uint.md) or [Floats](../data-types/float.md)."},
+        {"labels", "Labels of samples, usually 1 for positive sample and 0 for negative sample. [Array](/sql-reference/data-types/array) of [Integers](../data-types/int-uint.md) or [Enums](../data-types/enum.md)."},
+        {"scale", "Decides whether to return the normalized area. If false, returns the area under the TP (true positives) x FP (false positives) curve instead. Default value: true. [Bool](../data-types/boolean.md). Optional."},
+        {"partial_offsets", R"(
+- An array of four non-negative integers for calculating a partial area under the ROC curve (equivalent to a vertical band of the ROC space) instead of the whole AUC. This option is useful for distributed computation of the ROC AUC. The array must contain the following elements [`higher_partitions_tp`, `higher_partitions_fp`, `total_positives`, `total_negatives`]. [Array](/sql-reference/data-types/array) of non-negative [Integers](../data-types/int-uint.md). Optional.
+    - `higher_partitions_tp`: The number of positive labels in the higher-scored partitions.
+    - `higher_partitions_fp`: The number of negative labels in the higher-scored partitions.
+    - `total_positives`: The total number of positive samples in the entire dataset.
+    - `total_negatives`: The total number of negative samples in the entire dataset.
+
+::::note
+When `partial_offsets` is used, `scores` and `labels` should contain only a partition of the entire dataset, covering an interval of scores.
+The dataset should be divided into contiguous partitions, where each partition contains the subset of the data whose scores fall within a specific range.
+For example:
+- One partition could contain all scores in the range [0, 0.5).
+- Another partition could contain scores in the range [0.5, 1.0].
+::::
+)"}
+    };
+    FunctionDocumentation::ReturnedValue returned_value_roc = "Returns area under the receiver operating characteristic (ROC) curve. [Float64](../data-types/float.md).";
+    FunctionDocumentation::Examples examples_roc = {{"Usage example", "SELECT arrayROCAUC([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]);", "0.75"}};
+    FunctionDocumentation::IntroducedIn introduced_in_roc = {20, 4};
+    FunctionDocumentation::Category category_roc = FunctionDocumentation::Category::Array;
+    FunctionDocumentation documentation_roc = {description_roc, syntax_roc, arguments_roc, returned_value_roc, examples_roc, introduced_in_roc, category_roc};
+
+    factory.registerFunction<FunctionArrayAUC<false>>(documentation_roc);
     factory.registerAlias("arrayAUC", "arrayROCAUC"); /// Backward compatibility, also ROC AUC is often shorted to just AUC
 
     /// PR AUC
-    factory.registerFunction<FunctionArrayAUC<true>>();
+    FunctionDocumentation::Description description_pr = R"(
+Calculates the area under the precision-recall (PR) curve.
+A precision-recall curve is created by plotting precision on the y-axis and recall on the x-axis across all thresholds.
+The resulting value ranges from 0 to 1, with a higher value indicating better model performance.
+The PR AUC is particularly useful for imbalanced datasets, where it provides a clearer comparison of performance than ROC AUC.
+For more details, please see [here](https://developers.google.com/machine-learning/glossary#pr-auc-area-under-the-pr-curve), [here](https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc#expandable-1) and [here](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve).
+)";
+    FunctionDocumentation::Syntax syntax_pr = "arrayAUCPR(scores, labels[, partial_offsets])";
+    FunctionDocumentation::Arguments arguments_pr = {
+        {"scores", "Scores produced by the prediction model. [Array](/sql-reference/data-types/array) of [Integers](../data-types/int-uint.md) or [Floats](../data-types/float.md)."},
+        {"labels", "Labels of samples, usually 1 for positive sample and 0 for negative sample. [Array](/sql-reference/data-types/array) of [Integers](../data-types/int-uint.md) or [Enums](../data-types/enum.md)."},
+        {"partial_offsets", R"(
+- An array of three non-negative integers for calculating a partial area under the PR curve (equivalent to a vertical band of the PR space) instead of the whole AUC. This option is useful for distributed computation of the PR AUC. The array must contain the following elements [`higher_partitions_tp`, `higher_partitions_fp`, `total_positives`]. [Array](/sql-reference/data-types/array) of non-negative [Integers](../data-types/int-uint.md). Optional.
+    - `higher_partitions_tp`: The number of positive labels in the higher-scored partitions.
+    - `higher_partitions_fp`: The number of negative labels in the higher-scored partitions.
+    - `total_positives`: The total number of positive samples in the entire dataset.
+
+::::note
+When `partial_offsets` is used, `scores` and `labels` should contain only a partition of the entire dataset, covering an interval of scores.
+The dataset should be divided into contiguous partitions, where each partition contains the subset of the data whose scores fall within a specific range.
+For example:
+- One partition could contain all scores in the range [0, 0.5).
+- Another partition could contain scores in the range [0.5, 1.0].
+::::
+)"}
+    };
+    FunctionDocumentation::ReturnedValue returned_value_pr = "Returns area under the precision-recall (PR) curve. [Float64](../data-types/float.md).";
+    FunctionDocumentation::Examples examples_pr = {{"Usage example", "SELECT arrayAUCPR([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]);", R"(
+┌─arrayAUCPR([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1])─┐
+│                              0.8333333333333333 │
+└─────────────────────────────────────────────────┘
+)"}};
+    FunctionDocumentation::IntroducedIn introduced_in_pr = {20, 4};
+    FunctionDocumentation::Category category_pr = FunctionDocumentation::Category::Array;
+    FunctionDocumentation documentation_pr = {description_pr, syntax_pr, arguments_pr, returned_value_pr, examples_pr, introduced_in_pr, category_pr};
+
+    factory.registerFunction<FunctionArrayAUC<true>>(documentation_pr);
     factory.registerAlias("arrayPRAUC", "arrayAUCPR");
 }
 
diff --git a/src/Functions/array/arrayFill.cpp b/src/Functions/array/arrayFill.cpp
@@ -128,8 +128,55 @@ using FunctionArrayReverseFill = FunctionArrayMapped<ArrayFillImpl<true>, NameAr
 
 REGISTER_FUNCTION(ArrayFill)
 {
-    factory.registerFunction<FunctionArrayFill>();
-    factory.registerFunction<FunctionArrayReverseFill>();
+    FunctionDocumentation::Description description = R"(
+The `arrayFill` function sequentially processes a source array from the first element
+to the last, evaluating a lambda condition at each position using elements from
+the source and condition arrays. When the lambda function evaluates to false at
+position i, the function replaces that element with the element at position i-1
+from the current state of the array. The first element is always preserved
+regardless of any condition.
+)";
+    FunctionDocumentation::Syntax syntax = "arrayFill(func(x [, y1, ..., yN]), source [, cond1, ... , condN])";
+    FunctionDocumentation::Arguments arguments = {
+        {"func(x [, y1, ..., yN])", "A lambda function `func(x [, y1, y2, ... yN]) → F(x [, y1, y2, ... yN])` which operates on elements of the source array (`x`) and condition arrays (`y`). [Lambda function](/sql-reference/functions/overview#arrow-operator-and-lambda)."},
+        {"source", "The source array to process [`Array(T)`](/sql-reference/data-types/array)."},
+        {"[, cond1, ... , condN]", "Optional. N condition arrays providing additional arguments to the lambda function. [`Array(T)`](/sql-reference/data-types/array)."},
+    };
+    FunctionDocumentation::ReturnedValue returned_value = "Returns an array. [`Array(T)`](/sql-reference/data-types/array).";
+    FunctionDocumentation::Examples examples = {
+        {"Example with single array", "SELECT arrayFill(x -> not isNull(x), [1, null, 2, null]) AS res", "[1,1,2,2]"},
+        {"Example with two arrays", "SELECT arrayFill(x, y, z -> x > y AND x < z, [5, 3, 6, 2], [4, 7, 1, 3], [10, 2, 8, 5]) AS res", "[5,5,6,6]"}
+    };
+    FunctionDocumentation::IntroducedIn introduced_in = {20, 1};
+    FunctionDocumentation::Category category = FunctionDocumentation::Category::Array;
+    FunctionDocumentation documentation = {description, syntax, arguments, returned_value, examples, introduced_in, category};
+
+    factory.registerFunction<FunctionArrayFill>(documentation);
+
+    FunctionDocumentation::Description description_reverse = R"(
+The `arrayReverseFill` function sequentially processes a source array from the last
+element to the first, evaluating a lambda condition at each position using elements
+from the source and condition arrays. When the condition evaluates to false at
+position i, the function replaces that element with the element at position i+1
+from the current state of the array. The last element is always preserved
+regardless of any condition.
+    )";
+    FunctionDocumentation::Syntax syntax_reverse = "arrayReverseFill(func(x[, y1, ..., yN]), source[, cond1, ... , condN])";
+    FunctionDocumentation::Arguments arguments_reverse = {
+        {"func(x[, y1, ..., yN])", "A lambda function which operates on elements of the source array (`x`) and condition arrays (`y`). [Lambda function](/sql-reference/functions/overview#arrow-operator-and-lambda)."},
+        {"source", "The source array to process [`Array(T)`](/sql-reference/data-types/array)."},
+        {"[, cond1, ... , condN]", "Optional. N condition arrays providing additional arguments to the lambda function. [`Array(T)`](/sql-reference/data-types/array)."},
+    };
+    FunctionDocumentation::ReturnedValue returned_value_reverse = "Returns an array in which every element for which the lambda evaluates to false is replaced by the element that follows it. [`Array(T)`](/sql-reference/data-types/array).";
+    FunctionDocumentation::Examples examples_reverse = {
+        {"Example with a single array", "SELECT arrayReverseFill(x -> not isNull(x), [1, null, 2, null]) AS res", "[1,2,2,NULL]"},
+        {"Example with two arrays", "SELECT arrayReverseFill(x, y, z -> x > y AND x < z, [5, 3, 6, 2], [4, 7, 1, 3], [10, 2, 8, 5]) AS res;", "[5,6,6,2]"}
+    };
+    FunctionDocumentation::IntroducedIn introduced_in_reverse = {20, 1};
+    FunctionDocumentation::Category category_reverse = FunctionDocumentation::Category::Array;
+    FunctionDocumentation documentation_reverse = {description_reverse, syntax_reverse, arguments_reverse, returned_value_reverse, examples_reverse, introduced_in_reverse, category_reverse};
+
+    factory.registerFunction<FunctionArrayReverseFill>(documentation_reverse);
 }
 
 }
diff --git a/src/Functions/array/arrayFilter.cpp b/src/Functions/array/arrayFilter.cpp
@@ -49,7 +49,31 @@ ColumnPtr ArrayFilterImpl::execute(const ColumnArray & array, ColumnPtr mapped)
 
 REGISTER_FUNCTION(ArrayFilter)
 {
-    factory.registerFunction<FunctionArrayFilter>();
+    FunctionDocumentation::Description description = "Returns an array containing only the elements in the source array for which a lambda function returns something other than `0`.";
+    FunctionDocumentation::Syntax syntax = "arrayFilter(func(x[, y1, ..., yN]), source[, cond1, ... , condN])";
+    FunctionDocumentation::Arguments arguments = {
+        {"func(x[, y1, ..., yN])", "A lambda function which operates on elements of the source array (`x`) and condition arrays (`y`). [Lambda function](/sql-reference/functions/overview#arrow-operator-and-lambda)."},
+        {"source", "The source array to process [`Array(T)`](/sql-reference/data-types/array)."},
+        {"[, cond1, ... , condN]", "Optional. N condition arrays providing additional arguments to the lambda function. [`Array(T)`](/sql-reference/data-types/array)."},
+    };
+    FunctionDocumentation::ReturnedValue returned_value = "Returns a subset of the source array. [`Array(T)`](/sql-reference/data-types/array).";
+    FunctionDocumentation::Examples examples = {
+        {"Example 1", "SELECT arrayFilter(x -> x LIKE '%World%', ['Hello', 'abc World']) AS res", "['abc World']"},
+        {"Example 2", R"(
+SELECT
+    arrayFilter(
+        (i, x) -> x LIKE '%World%',
+        arrayEnumerate(arr),
+        ['Hello', 'abc World'] AS arr)
+    AS res
+)",
+"[2]"}
+    };
+    FunctionDocumentation::IntroducedIn introduced_in = {1, 1};
+    FunctionDocumentation::Category category = FunctionDocumentation::Category::Array;
+    FunctionDocumentation documentation = {description, syntax, arguments, returned_value, examples, introduced_in, category};
+
+    factory.registerFunction<FunctionArrayFilter>(documentation);
 }
 
 }
diff --git a/src/Functions/array/arrayMap.cpp b/src/Functions/array/arrayMap.cpp
@@ -6,7 +6,24 @@ namespace DB
 
 REGISTER_FUNCTION(ArrayMap)
 {
-    factory.registerFunction<FunctionArrayMap>();
+    FunctionDocumentation::Description description = R"(
+Returns an array obtained from the original arrays by applying a lambda function to each element.
+)";
+    FunctionDocumentation::Syntax syntax = "arrayMap(func, arr)";
+    FunctionDocumentation::Arguments arguments = {
+        {"func", "A lambda function which is applied to the corresponding elements of the source array(s); it takes one argument per array. [Lambda function](/sql-reference/functions/overview#arrow-operator-and-lambda)."},
+        {"arr", "N arrays to process. [Array(T)](/sql-reference/data-types/array)."},
+    };
+    FunctionDocumentation::ReturnedValue returned_value = "Returns an array from the lambda results. [`Array(T)`](/sql-reference/data-types/array).";
+    FunctionDocumentation::Examples examples = {
+        {"Usage example", "SELECT arrayMap(x -> (x + 2), [1, 2, 3]) as res;", "[3,4,5]"},
+        {"Creating a tuple of elements from different arrays", "SELECT arrayMap((x, y) -> (x, y), [1, 2, 3], [4, 5, 6]) AS res", "[(1,4),(2,5),(3,6)]"}
+    };
+    FunctionDocumentation::IntroducedIn introduced_in = {1, 1};
+    FunctionDocumentation::Category category = FunctionDocumentation::Category::Array;
+    FunctionDocumentation documentation = {description, syntax, arguments, returned_value, examples, introduced_in, category};
+
+    factory.registerFunction<FunctionArrayMap>(documentation);
 }
 
 }
diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference
@@ -88,7 +88,6 @@ age
 alphaTokens
 and
 appendTrailingCharIfAbsent
-arrayAUCPR
 arrayAll
 arrayAvg
 arrayConcat
@@ -101,21 +100,16 @@ arrayEnumerateDense
 arrayEnumerateDenseRanked
 arrayEnumerateUniq
 arrayExists
-arrayFill
-arrayFilter
 arrayFirst
 arrayFirstIndex
 arrayFirstOrNull
 arrayLast
 arrayLastIndex
 arrayLastOrNull
-arrayMap
 arrayMax
 arrayMin
 arrayProduct
-arrayROCAUC
 arrayRandomSample
-arrayReverseFill
 arrayReverseSplit
 arraySplit
 arrayStringConcat
diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_have_version_information.reference b/tests/queries/0_stateless/02415_all_new_functions_must_have_version_information.reference
@@ -104,7 +104,6 @@ age
 alphaTokens
 and
 appendTrailingCharIfAbsent
-arrayAUCPR
 arrayAll
 arrayAvg
 arrayConcat
@@ -117,8 +116,6 @@ arrayEnumerateDense
 arrayEnumerateDenseRanked
 arrayEnumerateUniq
 arrayExists
-arrayFill
-arrayFilter
 arrayFirst
 arrayFirstIndex
 arrayFirstOrNull
@@ -127,13 +124,10 @@ arrayLastIndex
 arrayLastOrNull
 arrayLevenshteinDistance
 arrayLevenshteinDistanceWeighted
-arrayMap
 arrayMax
 arrayMin
 arrayProduct
-arrayROCAUC
 arrayRandomSample
-arrayReverseFill
 arrayReverseSplit
 arrayRotateLeft
 arrayRotateRight