9
9
)
10
10
from package_parser .processing .api .model import API , Parameter , ParameterAssignment
11
11
from package_parser .processing .usages .model import UsageCountStore
12
+ from scipy .stats import binom
12
13
13
14
from ._constants import autogen_author
14
15
@@ -84,45 +85,12 @@ def _generate_required_or_optional_annotation(
84
85
85
86
# Compute metrics
86
87
most_common_value_count = usages .n_value_usages (parameter .id , most_common_values [0 ])
87
-
88
- # We deliberately don't ensure this is a literal. Otherwise, we might make a parameter optional even though there is
89
- # a tie between the most common value and the second most common value if the latter is not a literal. This would
90
- # also mean different annotations would be generated depending on the order the values were inserted into the
91
- # UsageCountStore since the counts are identical.
92
- #
93
- # Example:
94
- # Values: "1" used 5 times, "call()" used 5 times
95
- #
96
- # If we now treat "1" as the most common value, we would make the parameter optional:
97
- # - most_common_value_count = 5
98
- # - second_most_common_value_count = 0 ("call()" is not a literal, so we would skip it and default to 0)
99
- # - total_literal_value_count = 5
100
- # - n_different_literal_values = 1
101
- # - (5 - 0) >= (5 / 1)
102
- #
103
- # However, if we treat "call()" as the most common value, we would make the parameter required since it is not a
104
- # literal.
105
88
second_most_common_value_count = usages .n_value_usages (
106
89
parameter .id , most_common_values [1 ]
107
90
)
108
91
109
- literal_values = [
110
- stringified_value
111
- for stringified_value in most_common_values
112
- if _is_stringified_literal (stringified_value )
113
- ]
114
- total_literal_value_count = sum (
115
- [usages .n_value_usages (parameter .id , value ) for value in literal_values ]
116
- )
117
- n_different_literal_values = len (literal_values )
118
-
119
92
# Add appropriate annotation
120
- if _should_be_required (
121
- most_common_value_count ,
122
- second_most_common_value_count ,
123
- total_literal_value_count ,
124
- n_different_literal_values ,
125
- ):
93
+ if _should_be_required (most_common_value_count , second_most_common_value_count ):
126
94
annotations .valueAnnotations .append (
127
95
RequiredAnnotation (
128
96
target = parameter .id , authors = [autogen_author ], reviewers = []
@@ -132,7 +100,7 @@ def _generate_required_or_optional_annotation(
132
100
(
133
101
default_value_type ,
134
102
default_value ,
135
- ) = _get_type_and_value_for_stringified_value (literal_values [0 ])
103
+ ) = _get_type_and_value_for_stringified_value (most_common_values [0 ])
136
104
if default_value_type is not None : # Just for mypy, always true
137
105
annotations .valueAnnotations .append (
138
106
OptionalAnnotation (
@@ -146,27 +114,38 @@ def _generate_required_or_optional_annotation(
146
114
147
115
148
116
def _should_be_required (
149
- most_common_value_count : int ,
150
- second_most_common_value_count : int ,
151
- total_literal_value_count : int ,
152
- n_different_literal_values : int ,
117
+ most_common_value_count : int , second_most_common_value_count : int
153
118
) -> bool :
154
119
"""
155
120
This function determines how to differentiate between an optional and a required parameter
156
- :param most_common_value_count: The number of times the most common value is used
157
- :param second_most_common_value_count: The number of times the second most common value is used
158
- :param total_literal_value_count: The total number of times the parameter is set to a literal value
159
- :param n_different_literal_values: The number of different literal values that are used
121
+ :param most_common_value_count: How often the most common value is used
122
+ :param second_most_common_value_count: How often the second most common value is used
160
123
:return: True means the parameter should be required, False means it should be optional
161
124
"""
162
125
163
- # Most common value is the only literal value
164
- if n_different_literal_values == 1 :
165
- return False
126
+ # Shortcut to speed up the check
127
+ if most_common_value_count == second_most_common_value_count :
128
+ return True
166
129
130
+ # Precaution to ensure proper order of most_common_value_count and second_most_common_value_count
131
+ if second_most_common_value_count > most_common_value_count :
132
+ most_common_value_count , second_most_common_value_count = (
133
+ second_most_common_value_count ,
134
+ most_common_value_count ,
135
+ )
136
+
137
+ total = most_common_value_count + second_most_common_value_count
138
+
139
+ # Our null hypothesis is that the user chooses between the most common and second most common value by a fair coin
140
+ # toss. Unless this hypothesis is rejected, we make the parameter required. We reject the hypothesis if the p-value
141
+ # is less than or equal to 5%. The p-value is the probability that we observe results that are at least as extreme
142
+ # as the values we observed, assuming the null hypothesis is true.
167
143
return (
168
- most_common_value_count - second_most_common_value_count
169
- < total_literal_value_count / n_different_literal_values
144
+ 2
145
+ * sum (
146
+ binom .pmf (i , total , 0.5 ) for i in range (most_common_value_count , total + 1 )
147
+ )
148
+ > 0.05
170
149
)
171
150
172
151
0 commit comments