
Commit 7a6784b

fix normalization condition test error
1 parent eb0eab1 commit 7a6784b

File tree

2 files changed: +94 additions, -88 deletions


reproschema/redcap2reproschema.py

Lines changed: 74 additions & 82 deletions
@@ -88,18 +88,11 @@ def clean_dict_nans(obj):
 
 # TODO: normalized condition should depend on the field type, e.g., for SQL
 def normalize_condition(condition_str, field_type=None):
-    """
-    Enhanced normalization of condition strings with specific handling for calc fields.
-
-    Args:
-        condition_str: The condition string to normalize
-        field_type: The type of field (e.g., 'calc', 'sql')
-
-    Returns:
-        str: Normalized condition string, or None if invalid
-    """
+    """Normalize condition strings with specific handling for calc fields."""
     if condition_str is None or pd.isna(condition_str):
         return None
+
+    # Handle boolean values
     if isinstance(condition_str, bool):
         return condition_str
     if isinstance(condition_str, str):
@@ -108,43 +101,60 @@ def normalize_condition(condition_str, field_type=None):
         if condition_str.lower() == "false":
             return False
 
+    # Convert to string if needed
     if not isinstance(condition_str, str):
         try:
             condition_str = str(condition_str)
         except:
             return None
 
     try:
+
+        # Clean HTML
         condition_str = BeautifulSoup(condition_str, "html.parser").get_text()
+        condition_str = condition_str.strip()
+
+        if not condition_str:
+            return None
+
+        # Common operator normalizations for all types
+        operator_replacements = [
+            (r"\s*\+\s*", " + "),  # Normalize spacing around +
+            (r"\s*-\s*", " - "),  # Normalize spacing around -
+            (r"\s*\*\s*", " * "),  # Normalize spacing around *
+            (r"\s*\/\s*", " / "),  # Normalize spacing around /
+            (r"\s*\(\s*", "("),  # Remove spaces after opening parenthesis
+            (r"\s*\)\s*", ")"),  # Remove spaces before closing parenthesis
+            (r"\s*,\s*", ","),  # Normalize spaces around commas
+            (r"\s+", " "),  # Normalize multiple spaces
+        ]
 
-        # SQL/calc specific handling
+        # Apply operator normalizations first
+        for pattern, repl in operator_replacements:
+            condition_str = re.sub(pattern, repl, condition_str)
+
+        # Then apply type-specific replacements
         if field_type in ["sql", "calc"]:
-            # For calc fields, we want to preserve function calls like Math.max
-            # but normalize the spacing around operators and arguments
-            replacements = [
-                (r"\s*\(\s*", "("),  # Remove spaces after opening parenthesis
-                (r"\s*\)\s*", ")"),  # Remove spaces before closing parenthesis
-                (r"\s*,\s*", ", "),  # Normalize spaces around commas
-                (r"\s+", " "),  # Normalize multiple spaces to single space
-                (r'"', "'"),  # Standardize quotes
-            ]
+            # For calc fields, just remove brackets from field references
+            condition_str = re.sub(r"\[([^\]]+)\]", r"\1", condition_str)
         else:
-            # Standard REDCap logic replacements for non-calc fields
+            # For branching logic
             replacements = [
                 (r"\(([0-9]*)\)", r"___\1"),
-                (r"([^>|<])=", r"\1 =="),
-                (r"\[([^\]]*)\]", r" \1 "),
+                (r"([^>|<])=", r"\1=="),
+                (r"\[([^\]]*)\]", r"\1"),  # Remove brackets and extra spaces
                (r"\bor\b", "||"),
                (r"\band\b", "&&"),
-                (r"\s+", " "),
-                (r'"', "'"),
+                (r'"', "'")
             ]
+            for pattern, repl in replacements:
+                condition_str = re.sub(pattern, repl, condition_str)
 
-        for pattern, repl in replacements:
-            condition_str = re.sub(pattern, repl, condition_str)
+        result = condition_str.strip()
+        return result
 
-        return condition_str.strip() or None
-    except:
+    except Exception as e:
+        print(f"Error normalizing condition: {str(e)}")
         return None
 
 
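For orientation (not part of the diff itself), the reworked normalize_condition is expected to behave as the new assertions in test_process_csv.py below require, roughly:

    # Illustrative only; mirrors the assertions added to test_process_csv.py in this commit.
    from reproschema.redcap2reproschema import normalize_condition

    normalize_condition("[field1] + [field2]", field_type="calc")  # -> "field1 + field2"
    normalize_condition("[total]*100", field_type="calc")          # -> "total * 100"
    normalize_condition("[age] = 1")                               # -> "age == 1" (branching logic)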
@@ -184,12 +194,7 @@ def process_field_properties(data):
     ... "Branching Logic (Show field only if...)": "[gender] = '1'"
     ... }
     >>> process_field_properties(data)
-    {
-        'variableName': 'age',
-        'isAbout': 'items/age',
-        'isVis': 'gender == 1',
-        'valueRequired': True
-    }
+    {'variableName': 'age', 'isAbout': 'items/age', 'valueRequired': True, 'isVis': "gender == '1'"}
     """
     if not isinstance(data, dict):
         return {"variableName": "unknown", "isAbout": "items/unknown"}
@@ -811,9 +816,7 @@ def process_csv(csv_file, abs_folder_path, protocol_name):
 
     try:
         df = pd.read_csv(csv_file, encoding="utf-8-sig")
-        df.columns = df.columns.map(
-            lambda x: x.strip().strip('"').lstrip("\ufeff")
-        )
+        df.columns = df.columns.map(lambda x: x.strip().strip('"').lstrip("\ufeff"))
 
         required_columns = ["Form Name", "Variable / Field Name", "Field Type"]
         missing_columns = [
@@ -868,60 +871,49 @@ def process_csv(csv_file, abs_folder_path, protocol_name):
                 continue
 
             datas[form_name].append(row_dict)
-
-            # Always add to order list to preserve sequence
             field_path = f"items/{field_name}"
 
             field_type = row_dict.get("Field Type", "").strip().lower()
             field_annotation = row_dict.get("Field Annotation", "")
 
-            # Add to compute list if needed
-            if field_type in COMPUTE_LIST and row_dict.get(
-                "Choices, Calculations, OR Slider Labels"
-            ):
-                condition = normalize_condition(
-                    row_dict["Choices, Calculations, OR Slider Labels"],
-                    field_type=field_type,
-                )
-                if condition:
-                    compute[form_name].append(
-                        {"variableName": field_name, "jsExpression": condition}
-                    )
-            elif (
-                field_annotation
-                and "@CALCTEXT" in str(field_annotation).upper()
-            ):
+            # Handle compute fields
+            is_compute = False
+
+            # Case 1: Field is calc type
+            if field_type in COMPUTE_LIST:
+                calc_value = row_dict.get("Choices, Calculations, OR Slider Labels", "")
+                if calc_value and str(calc_value).strip():
+                    compute_expression = normalize_condition(calc_value, field_type=field_type)
+                    if compute_expression:
+                        is_compute = True
+                        compute[form_name].append({
+                            "variableName": field_name,
+                            "jsExpression": compute_expression
+                        })
+                    else:
+                        print(f"Warning: Could not normalize calc expression for {field_name}: {calc_value}")
+
+            # Case 2: Field has @CALCTEXT
+            elif field_annotation and "@CALCTEXT" in str(field_annotation).upper():
                 match = re.search(r"@CALCTEXT\((.*)\)", field_annotation)
                 if match:
-                    js_expression = normalize_condition(match.group(1))
-                    if js_expression:
-                        compute[form_name].append(
-                            {
-                                "variableName": field_name,
-                                "jsExpression": js_expression,
-                            }
-                        )
-            else:
-                order[form_name].append(f"items/{field_name}")
-
-        # Validate results
-        for form_name in datas:
-            if not datas[form_name]:
-                print(f"Warning: Form '{form_name}' has no valid fields")
-            if not order[form_name] and not compute[form_name]:
-                print(
-                    f"Warning: Form '{form_name}' has no order or compute fields"
-                )
-
-        # Create protocol directory
-        protocol_dir = Path(abs_folder_path) / protocol_name
-        protocol_dir.mkdir(parents=True, exist_ok=True)
+                    compute_expression = normalize_condition(match.group(1))
+                    if compute_expression:
+                        is_compute = True
+                        compute[form_name].append({
+                            "variableName": field_name,
+                            "jsExpression": compute_expression
+                        })
+
+            # Add to order list only if not a compute field
+            if not is_compute:
+                order[form_name].append(field_path)
 
         return datas, order, compute
 
-    except pd.errors.EmptyDataError:
-        raise ValueError("The CSV file is empty")
-
+    except Exception as e:
+        print(f"Error processing CSV: {str(e)}")
+        raise
 
 # todo adding output path
 def redcap2reproschema(
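The net effect of the compute/order hunk above is a single routing decision per field. A minimal sketch, assuming COMPUTE_LIST contains "calc" and that normalize_condition succeeds; route_field is a hypothetical helper, not part of the module:

    import re

    COMPUTE_LIST = ["calc"]  # assumption for this sketch

    def route_field(field_type, calc_value="", field_annotation=""):
        # Mirrors the is_compute flag: calc-type fields with a non-empty expression
        # and @CALCTEXT-annotated fields go to compute; everything else goes to order.
        if field_type in COMPUTE_LIST and str(calc_value).strip():
            return "compute"
        if field_annotation and re.search(r"@CALCTEXT\((.*)\)", str(field_annotation), re.IGNORECASE):
            return "compute"
        return "order"

    print(route_field("calc", "[field1] + [field3]"))              # compute
    print(route_field("text", field_annotation="@CALCTEXT(3*3)"))  # compute
    print(route_field("text"))                                     # order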

reproschema/tests/test_process_csv.py

Lines changed: 20 additions & 6 deletions
@@ -4,13 +4,13 @@
 import pandas as pd
 import pytest
 
-from ..redcap2reproschema import process_csv
+from ..redcap2reproschema import process_csv, normalize_condition
 
 
 def test_process_csv():
-    csv_data = """Form Name,Variable / Field Name,Field Type,Field Annotation,Choices Calculations OR Slider Labels
+    csv_data = """Form Name,Variable / Field Name,Field Type,Field Annotation,"Choices, Calculations, OR Slider Labels"
 form1,field1,text,,
-form1,field2,calc,,2+2
+form1,field2,calc,,[field1] + [field3]
 form1,field3,text,@CALCTEXT(3*3),
 form2,field4,text,,
 ,field5,text,,"""
@@ -25,12 +25,12 @@ def test_process_csv():
     assert len(datas["form1"]) == 3
     assert len(datas["form2"]) == 1
 
-    assert order["form1"] == ["items/field1"]  # field3 goes to compute
+    assert order["form1"] == ["items/field1"]  # both field2 and field3 go to compute
     assert order["form2"] == ["items/field4"]
 
     assert len(compute["form1"]) == 2
-    assert compute["form1"][0]["variableName"] == "field2"
-    assert compute["form1"][1]["variableName"] == "field3"
+    assert any(item["variableName"] == "field2" for item in compute["form1"])
+    assert any(item["variableName"] == "field3" for item in compute["form1"])
 
 
 def test_process_csv_missing_columns():
@@ -41,3 +41,17 @@ def test_process_csv_missing_columns():
 
     with pytest.raises(ValueError):
         process_csv(csv_path, tmpdir, "test_protocol")
+
+
+def test_normalize_condition():
+    # Test calc expressions
+    assert normalize_condition("[field1] + [field2]", field_type="calc") == "field1 + field2"
+    assert normalize_condition("[total]*100", field_type="calc") == "total * 100"
+    assert normalize_condition("2+2", field_type="calc") == "2 + 2"
+
+    # Test @CALCTEXT expressions
+    assert normalize_condition("3*3") == "3 * 3"
+
+    # Test branching logic
+    assert normalize_condition("[age] = 1") == "age == 1"
+    assert normalize_condition("[field1] = 1 or [field2] = 2") == "field1 == 1 || field2 == 2"
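The new and updated assertions can be exercised locally with pytest, assuming the package is installed in the test environment:

    pytest reproschema/tests/test_process_csv.py -k "process_csv or normalize_condition"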
