Skip to content

Commit 1c90b24

Browse files
committed
Add documentation to source code
1 parent 0b909b0 commit 1c90b24

File tree

3 files changed

+73
-6
lines changed

3 files changed

+73
-6
lines changed

src/Functions/array/arrayAUC.cpp

Lines changed: 73 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -492,11 +492,82 @@ class FunctionArrayAUC : public IFunction
492492
/// Registers the two AUC array functions together with their embedded documentation:
///   - arrayROCAUC (FunctionArrayAUC<false>), also reachable through the alias arrayAUC;
///   - arrayAUCPR  (FunctionArrayAUC<true>), also reachable through the alias arrayPRAUC.
/// Argument names used inside the documentation strings must match the names in the
/// corresponding syntax string (scores, labels, scale, partial_offsets).
REGISTER_FUNCTION(ArrayAUC)
{
    /// ROC AUC
    FunctionDocumentation::Description description_roc = R"(
Calculates the area under the receiver operating characteristic (ROC) curve.
A ROC curve is created by plotting True Positive Rate (TPR) on the y-axis and False Positive Rate (FPR) on the x-axis across all thresholds.
The resulting value ranges from zero to one, with a higher value indicating better model performance.

The ROC AUC (also known as simply AUC) is a concept in machine learning.
For more details, please see [here](https://developers.google.com/machine-learning/glossary#auc-area-under-the-roc-curve), [here](https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc#expandable-1) and [here](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve).
)";
    FunctionDocumentation::Syntax syntax_roc = "arrayROCAUC(scores, labels[, scale[, partial_offsets]])";
    FunctionDocumentation::Arguments arguments_roc = {
        {"scores", "Scores prediction model gives. [Array](/sql-reference/data-types/array) of [Integers](../data-types/int-uint.md) or [Floats](../data-types/float.md)."},
        {"labels", "Labels of samples, usually 1 for positive sample and 0 for negative sample. [Array](/sql-reference/data-types/array) of [Integers](../data-types/int-uint.md) or [Enums](../data-types/enum.md)."},
        {"scale", "Decides whether to return the normalized area. If false, returns the area under the TP (true positives) x FP (false positives) curve instead. Default value: true. [Bool](../data-types/boolean.md). Optional."},
        {"partial_offsets", R"(
- An array of four non-negative integers for calculating a partial area under the ROC curve (equivalent to a vertical band of the ROC space) instead of the whole AUC. This option is useful for distributed computation of the ROC AUC. The array must contain the following elements [`higher_partitions_tp`, `higher_partitions_fp`, `total_positives`, `total_negatives`]. [Array](/sql-reference/data-types/array) of non-negative [Integers](../data-types/int-uint.md). Optional.
- `higher_partitions_tp`: The number of positive labels in the higher-scored partitions.
- `higher_partitions_fp`: The number of negative labels in the higher-scored partitions.
- `total_positives`: The total number of positive samples in the entire dataset.
- `total_negatives`: The total number of negative samples in the entire dataset.

::::note
When `partial_offsets` is used, the `scores` and `labels` should be only a partition of the entire dataset, containing an interval of scores.
The dataset should be divided into contiguous partitions, where each partition contains the subset of the data whose scores fall within a specific range.
For example:
- One partition could contain all scores in the range [0, 0.5).
- Another partition could contain scores in the range [0.5, 1.0].
::::
)"}
    };
    FunctionDocumentation::ReturnedValue returned_value_roc = "Returns area under the receiver operating characteristic (ROC) curve. [Float64](../data-types/float.md).";
    FunctionDocumentation::Examples examples_roc = {{"Usage example", "SELECT arrayROCAUC([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]);", "0.75"}};
    FunctionDocumentation::IntroducedIn introduced_in_roc = {20, 4};
    FunctionDocumentation::Category category_roc = FunctionDocumentation::Category::Array;
    FunctionDocumentation documentation_roc = {description_roc, syntax_roc, arguments_roc, returned_value_roc, examples_roc, introduced_in_roc, category_roc};

    factory.registerFunction<FunctionArrayAUC<false>>(documentation_roc);
    factory.registerAlias("arrayAUC", "arrayROCAUC"); /// Backward compatibility, also ROC AUC is often shortened to just AUC

    /// PR AUC
    FunctionDocumentation::Description description_pr = R"(
Calculates the area under the precision-recall (PR) curve.
A precision-recall curve is created by plotting precision on the y-axis and recall on the x-axis across all thresholds.
The resulting value ranges from 0 to 1, with a higher value indicating better model performance.
The PR AUC is particularly useful for imbalanced datasets, providing a clearer comparison of performance compared to ROC AUC on those cases.
For more details, please see [here](https://developers.google.com/machine-learning/glossary#pr-auc-area-under-the-pr-curve), [here](https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc#expandable-1) and [here](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve).
)";
    FunctionDocumentation::Syntax syntax_pr = "arrayAUCPR(scores, labels[, partial_offsets])";
    FunctionDocumentation::Arguments arguments_pr = {
        {"scores", "Scores prediction model gives. [Array](/sql-reference/data-types/array) of [Integers](../data-types/int-uint.md) or [Floats](../data-types/float.md)."},
        {"labels", "Labels of samples, usually 1 for positive sample and 0 for negative sample. [Array](/sql-reference/data-types/array) of [Integers](../data-types/int-uint.md) or [Enums](../data-types/enum.md)."},
        {"partial_offsets", R"(
- An array of three non-negative integers for calculating a partial area under the PR curve (equivalent to a vertical band of the PR space) instead of the whole AUC. This option is useful for distributed computation of the PR AUC. The array must contain the following elements [`higher_partitions_tp`, `higher_partitions_fp`, `total_positives`]. [Array](/sql-reference/data-types/array) of non-negative [Integers](../data-types/int-uint.md). Optional.
- `higher_partitions_tp`: The number of positive labels in the higher-scored partitions.
- `higher_partitions_fp`: The number of negative labels in the higher-scored partitions.
- `total_positives`: The total number of positive samples in the entire dataset.

::::note
When `partial_offsets` is used, the `scores` and `labels` should be only a partition of the entire dataset, containing an interval of scores.
The dataset should be divided into contiguous partitions, where each partition contains the subset of the data whose scores fall within a specific range.
For example:
- One partition could contain all scores in the range [0, 0.5).
- Another partition could contain scores in the range [0.5, 1.0].
::::
)"}
    };
    FunctionDocumentation::ReturnedValue returned_value_pr = "Returns area under the precision-recall (PR) curve. [Float64](../data-types/float.md).";
    FunctionDocumentation::Examples examples_pr = {{"Usage example", "SELECT arrayAUCPR([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]);", R"(
┌─arrayAUCPR([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1])─┐
│                              0.8333333333333333 │
└─────────────────────────────────────────────────┘
)"}};
    FunctionDocumentation::IntroducedIn introduced_in_pr = {20, 4};
    FunctionDocumentation::Category category_pr = FunctionDocumentation::Category::Array;
    FunctionDocumentation documentation_pr = {description_pr, syntax_pr, arguments_pr, returned_value_pr, examples_pr, introduced_in_pr, category_pr};

    factory.registerFunction<FunctionArrayAUC<true>>(documentation_pr);
    factory.registerAlias("arrayPRAUC", "arrayAUCPR");
}
502573

tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,6 @@ alphaTokens
8989
and
9090
appendTrailingCharIfAbsent
9191
array
92-
arrayAUCPR
9392
arrayAll
9493
arrayAvg
9594
arrayCompact
@@ -125,7 +124,6 @@ arrayPopFront
125124
arrayProduct
126125
arrayPushBack
127126
arrayPushFront
128-
arrayROCAUC
129127
arrayRandomSample
130128
arrayReduceInRanges
131129
arrayResize

tests/queries/0_stateless/02415_all_new_functions_must_have_version_information.reference

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,6 @@ alphaTokens
105105
and
106106
appendTrailingCharIfAbsent
107107
array
108-
arrayAUCPR
109108
arrayAll
110109
arrayAvg
111110
arrayCompact
@@ -149,7 +148,6 @@ arrayPopFront
149148
arrayProduct
150149
arrayPushBack
151150
arrayPushFront
152-
arrayROCAUC
153151
arrayRandomSample
154152
arrayReduceInRanges
155153
arrayResize

0 commit comments

Comments
 (0)