Skip to content
This repository was archived by the owner on Nov 23, 2024. It is now read-only.

Commit 6e1fd68

Browse files
authored
feat(parser): differentiate between required and optional parameters using statistical hypothesis testing (#913)
* test(parser): more test cases
* feat(parser): differentiate between required and optional parameters using statistical hypothesis testing
* chore(data): regenerate annotations
* test(parser): update tests to new approach
* build(parser): fix required python version
* style: apply automatic fixes of linters

Co-authored-by: lars-reimann <[email protected]>
1 parent 4cbcd55 commit 6e1fd68

File tree

6 files changed

+303 -165 lines changed

package-parser/package_parser/processing/annotations/_generate_value_annotations.py

Lines changed: 27 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
)
1010
from package_parser.processing.api.model import API, Parameter, ParameterAssignment
1111
from package_parser.processing.usages.model import UsageCountStore
12+
from scipy.stats import binom
1213

1314
from ._constants import autogen_author
1415

@@ -84,45 +85,12 @@ def _generate_required_or_optional_annotation(
8485

8586
# Compute metrics
8687
most_common_value_count = usages.n_value_usages(parameter.id, most_common_values[0])
87-
88-
# We deliberately don't ensure this is a literal. Otherwise, we might make a parameter optional even though there is
89-
# a tie between the most common value and the second most common value if the latter is not a literal. This would
90-
# also mean different annotations would be generated depending on the order the values were inserted into the
91-
# UsageCountStore since the counts are identical.
92-
#
93-
# Example:
94-
# Values: "1" used 5 times, "call()" used 5 times
95-
#
96-
# If we now treat "1" as the most common value, we would make the parameter optional:
97-
# - most_common_value_count = 5
98-
# - second_most_common_value_count = 0 ("call()" is not a literal, so we would skip it and default to 0)
99-
# - total_literal_value_count = 5
100-
# - n_different_literal_values = 1
101-
# - (5 - 0) >= (5 / 1)
102-
#
103-
# However, if we treat "call()" as the most common value, we would make the parameter required since it is not a
104-
# literal.
10588
second_most_common_value_count = usages.n_value_usages(
10689
parameter.id, most_common_values[1]
10790
)
10891

109-
literal_values = [
110-
stringified_value
111-
for stringified_value in most_common_values
112-
if _is_stringified_literal(stringified_value)
113-
]
114-
total_literal_value_count = sum(
115-
[usages.n_value_usages(parameter.id, value) for value in literal_values]
116-
)
117-
n_different_literal_values = len(literal_values)
118-
11992
# Add appropriate annotation
120-
if _should_be_required(
121-
most_common_value_count,
122-
second_most_common_value_count,
123-
total_literal_value_count,
124-
n_different_literal_values,
125-
):
93+
if _should_be_required(most_common_value_count, second_most_common_value_count):
12694
annotations.valueAnnotations.append(
12795
RequiredAnnotation(
12896
target=parameter.id, authors=[autogen_author], reviewers=[]
@@ -132,7 +100,7 @@ def _generate_required_or_optional_annotation(
132100
(
133101
default_value_type,
134102
default_value,
135-
) = _get_type_and_value_for_stringified_value(literal_values[0])
103+
) = _get_type_and_value_for_stringified_value(most_common_values[0])
136104
if default_value_type is not None: # Just for mypy, always true
137105
annotations.valueAnnotations.append(
138106
OptionalAnnotation(
@@ -146,27 +114,38 @@ def _generate_required_or_optional_annotation(
146114

147115

148116
def _should_be_required(
149-
most_common_value_count: int,
150-
second_most_common_value_count: int,
151-
total_literal_value_count: int,
152-
n_different_literal_values: int,
117+
most_common_value_count: int, second_most_common_value_count: int
153118
) -> bool:
154119
"""
155120
This function determines how to differentiate between an optional and a required parameter
156-
:param most_common_value_count: The number of times the most common value is used
157-
:param second_most_common_value_count: The number of times the second most common value is used
158-
:param total_literal_value_count: The total number of times the parameter is set to a literal value
159-
:param n_different_literal_values: The number of different literal values that are used
121+
:param most_common_value_count: How often the most common value is used
122+
:param second_most_common_value_count: How often the second most common value is used
160123
:return: True means the parameter should be required, False means it should be optional
161124
"""
162125

163-
# Most common value is the only literal value
164-
if n_different_literal_values == 1:
165-
return False
126+
# Shortcut to speed up the check
127+
if most_common_value_count == second_most_common_value_count:
128+
return True
166129

130+
# Precaution to ensure proper order of most_common_value_count and second_most_common_value_count
131+
if second_most_common_value_count > most_common_value_count:
132+
most_common_value_count, second_most_common_value_count = (
133+
second_most_common_value_count,
134+
most_common_value_count,
135+
)
136+
137+
total = most_common_value_count + second_most_common_value_count
138+
139+
# Our null hypothesis is that the user chooses between the most common and second most common value by a fair coin
140+
# toss. Unless this hypothesis is rejected, we make the parameter required. We reject the hypothesis if the p-value
141+
# is less than or equal to 5%. The p-value is the probability that we observe results that are at least as extreme
142+
# as the values we observed, assuming the null hypothesis is true.
167143
return (
168-
most_common_value_count - second_most_common_value_count
169-
< total_literal_value_count / n_different_literal_values
144+
2
145+
* sum(
146+
binom.pmf(i, total, 0.5) for i in range(most_common_value_count, total + 1)
147+
)
148+
> 0.05
170149
)
171150

172151

0 commit comments

Comments
 (0)