
Commit eb0eab1
merge from remote
2 parents: cffaea3 + 3dc96da

5 files changed, +161 -93 lines changed

reproschema/redcap2reproschema.py

Lines changed: 91 additions & 47 deletions
@@ -2,10 +2,10 @@
 import re
 from pathlib import Path
 
+import numpy as np
 import pandas as pd
 import yaml
 from bs4 import BeautifulSoup
-import numpy as np
 
 from .context_url import CONTEXTFILE_URL
 from .jsonldutils import get_context_version
@@ -85,15 +85,16 @@ def clean_dict_nans(obj):
         return obj
     return {k: v for k, v in obj.items() if pd.notna(v)}
 
+
 # TODO: normalized condition should depend on the field type, e.g., for SQL
 def normalize_condition(condition_str, field_type=None):
     """
     Enhanced normalization of condition strings with specific handling for calc fields.
-
+
     Args:
         condition_str: The condition string to normalize
         field_type: The type of field (e.g., 'calc', 'sql')
-
+
     Returns:
         str: Normalized condition string, or None if invalid
     """
@@ -106,7 +107,7 @@ def normalize_condition(condition_str, field_type=None):
         return True
     if condition_str.lower() == "false":
         return False
-
+
     if not isinstance(condition_str, str):
         try:
             condition_str = str(condition_str)
@@ -115,17 +116,17 @@ def normalize_condition(condition_str, field_type=None):
 
     try:
         condition_str = BeautifulSoup(condition_str, "html.parser").get_text()
-
+
         # SQL/calc specific handling
         if field_type in ["sql", "calc"]:
             # For calc fields, we want to preserve function calls like Math.max
             # but normalize the spacing around operators and arguments
             replacements = [
-                (r'\s*\(\s*', '('), # Remove spaces after opening parenthesis
-                (r'\s*\)\s*', ')'), # Remove spaces before closing parenthesis
-                (r'\s*,\s*', ', '), # Normalize spaces around commas
-                (r'\s+', ' '), # Normalize multiple spaces to single space
-                (r'"', "'") # Standardize quotes
+                (r"\s*\(\s*", "("),  # Remove spaces after opening parenthesis
+                (r"\s*\)\s*", ")"),  # Remove spaces before closing parenthesis
+                (r"\s*,\s*", ", "),  # Normalize spaces around commas
+                (r"\s+", " "),  # Normalize multiple spaces to single space
+                (r'"', "'"),  # Standardize quotes
             ]
         else:
             # Standard REDCap logic replacements for non-calc fields
@@ -136,16 +137,17 @@ def normalize_condition(condition_str, field_type=None):
                 (r"\bor\b", "||"),
                 (r"\band\b", "&&"),
                 (r"\s+", " "),
-                (r'"', "'")
+                (r'"', "'"),
             ]
-
+
         for pattern, repl in replacements:
             condition_str = re.sub(pattern, repl, condition_str)
-
+
         return condition_str.strip() or None
     except:
         return None
 
+
 def process_field_properties(data):
     """
     Process field properties from REDCap data dictionary to create a property object.
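A minimal standalone sketch of the non-calc replacement pass shown in this hunk. It applies only the four patterns visible above to a sample REDCap branching-logic string; the full replacement list in normalize_condition contains earlier patterns that this diff does not show.

    import re

    # Only the patterns visible in this hunk; the real list has more entries.
    replacements = [
        (r"\bor\b", "||"),
        (r"\band\b", "&&"),
        (r"\s+", " "),
        (r'"', "'"),
    ]

    condition = '[age] >= 18 and [consent_given] = "1" or [guardian] = "1"'
    for pattern, repl in replacements:
        condition = re.sub(pattern, repl, condition)

    print(condition)
    # [age] >= 18 && [consent_given] = '1' || [guardian] = '1'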
@@ -191,12 +193,9 @@ def process_field_properties(data):
     """
     if not isinstance(data, dict):
         return {"variableName": "unknown", "isAbout": "items/unknown"}
-
+
     var_name = str(data.get("Variable / Field Name", "unknown")).strip()
-    prop_obj = {
-        "variableName": var_name,
-        "isAbout": f"items/{var_name}"
-    }
+    prop_obj = {"variableName": var_name, "isAbout": f"items/{var_name}"}
 
     # Handle required field consistently
     if data.get("Required Field?", "").strip().lower() == "y":
@@ -212,7 +211,11 @@ def process_field_properties(data):
     # Handle field annotations that affect visibility
     annotation = data.get("Field Annotation", "").strip().upper()
     if annotation:
-        if "@HIDDEN" in annotation or "@READONLY" in annotation or "@CALCTEXT" in annotation:
+        if (
+            "@HIDDEN" in annotation
+            or "@READONLY" in annotation
+            or "@CALCTEXT" in annotation
+        ):
             prop_obj["isVis"] = False
 
     field_type = data.get("Field Type", "").strip().lower()
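As a rough illustration of the property object assembled in the hunks above, the sketch below calls process_field_properties on a hypothetical data-dictionary row (field name, type, and annotation are invented). Based only on the logic visible in this diff, the result should contain at least the keys noted in the comment; other keys may be added by code outside these hunks.

    from reproschema.redcap2reproschema import process_field_properties

    # Hypothetical REDCap data-dictionary row, for illustration only.
    row = {
        "Variable / Field Name": "bmi",
        "Field Type": "calc",
        "Field Annotation": "@CALCTEXT([weight]/([height]*[height]))",
    }

    props = process_field_properties(row)
    # Expected to include at least:
    # {"variableName": "bmi", "isAbout": "items/bmi", "isVis": False}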
@@ -227,6 +230,7 @@ def process_field_properties(data):
 
     return prop_obj
 
+
 def parse_field_type_and_value(field):
     """
     Parse field type and determine appropriate value type.
@@ -342,14 +346,18 @@ def process_choices(choices_str, field_name):
                 # Split on first comma only
                 parts = choice.split(",", 1)
                 if len(parts) < 2:
-                    print(f"Warning: Invalid choice format '{choice}' in {field_name}")
+                    print(
+                        f"Warning: Invalid choice format '{choice}' in {field_name}"
+                    )
                     continue
 
                 value_part = parts[0].strip()
                 label_part = parts[1].strip()
 
                 if not label_part:
-                    print(f"Warning: Empty label in choice '{choice}' in {field_name}")
+                    print(
+                        f"Warning: Empty label in choice '{choice}' in {field_name}"
+                    )
                     continue
 
                 # Determine value type and convert value
@@ -378,13 +386,17 @@ def process_choices(choices_str, field_name):
                 # Create choice object
                 parsed_label = parse_html(label_part)
                 choice_obj = {
-                    "name": parsed_label if parsed_label else {"en": label_part},
+                    "name": (
+                        parsed_label if parsed_label else {"en": label_part}
+                    ),
                     "value": value,
                 }
                 choices.append(choice_obj)
 
             except (ValueError, TypeError) as e:
-                print(f"Warning: Error processing choice '{choice}' in {field_name}: {str(e)}")
+                print(
+                    f"Warning: Error processing choice '{choice}' in {field_name}: {str(e)}"
+                )
                 continue
 
     if not choices:
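The two hunks above reformat the choice parsing in process_choices: each pipe-separated REDCap choice is split on its first comma into a value and a label, and the label becomes the multilingual "name" of the choice object. A hedged usage sketch with an invented field name; the value-type conversion is not visible in this diff, so the integer values below are an assumption.

    from reproschema.redcap2reproschema import process_choices

    choices = process_choices(
        "1, Yes | 0, No | 99, Prefer not to answer", "consent"
    )
    # Roughly expected:
    # [
    #     {"name": {"en": "Yes"}, "value": 1},
    #     {"name": {"en": "No"}, "value": 0},
    #     {"name": {"en": "Prefer not to answer"}, "value": 99},
    # ]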
@@ -641,21 +653,30 @@ def create_form_schema(
     """
     try:
         # Validate inputs
-        if pd.isna(form_name).any() if isinstance(form_name, pd.Series) else pd.isna(form_name):
+        if (
+            pd.isna(form_name).any()
+            if isinstance(form_name, pd.Series)
+            else pd.isna(form_name)
+        ):
             raise ValueError("Form name is required")
 
         # Set default activity display name if not provided
-        if pd.isna(activity_display_name).any() if isinstance(activity_display_name, pd.Series) else pd.isna(activity_display_name):
+        if (
+            pd.isna(activity_display_name).any()
+            if isinstance(activity_display_name, pd.Series)
+            else pd.isna(activity_display_name)
+        ):
             activity_display_name = str(form_name).replace("_", " ").title()
 
         # Clean and validate order list
         clean_order = []
         if order is not None:
             if isinstance(order, (list, pd.Series, np.ndarray)):
                 clean_order = [
-                    str(item).strip()
-                    for item in order
-                    if not (isinstance(item, pd.Series) and item.isna().any()) and not pd.isna(item)
+                    str(item).strip()
+                    for item in order
+                    if not (isinstance(item, pd.Series) and item.isna().any())
+                    and not pd.isna(item)
                 ]
                 clean_order = list(dict.fromkeys(clean_order))
 
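The reformatted guards above repeat the same Series-aware missing-value check. A minimal sketch of that pattern with a hypothetical helper name: pd.isna on a Series returns a boolean Series, so .any() is needed, while a scalar input yields a single boolean.

    import pandas as pd

    def is_missing(value):
        # Hypothetical helper mirroring the inline checks in this hunk.
        if isinstance(value, pd.Series):
            return pd.isna(value).any()
        return pd.isna(value)

    print(is_missing(pd.Series([None, None])))  # True
    print(is_missing("intake_form"))            # False
    print(is_missing(None))                     # True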

@@ -664,7 +685,8 @@ def create_form_schema(
         if bl_list is not None:
             if isinstance(bl_list, (list, pd.Series, np.ndarray)):
                 clean_bl_list = [
-                    prop for prop in bl_list
+                    prop
+                    for prop in bl_list
                     if prop is not None and isinstance(prop, dict)
                 ]
 
@@ -686,7 +708,9 @@ def create_form_schema(
         if preamble is not None:
             if isinstance(preamble, pd.Series):
                 if not preamble.isna().all():
-                    parsed_preamble = parse_html(preamble.iloc[0] if len(preamble) > 0 else None)
+                    parsed_preamble = parse_html(
+                        preamble.iloc[0] if len(preamble) > 0 else None
+                    )
                     if parsed_preamble:
                         json_ld["preamble"] = parsed_preamble
             elif not pd.isna(preamble):
@@ -714,7 +738,9 @@ def create_form_schema(
         )
 
     except Exception as e:
-        raise Exception(f"Error creating form schema for {form_name}: {str(e)}")
+        raise Exception(
+            f"Error creating form schema for {form_name}: {str(e)}"
+        )
 
 
 def process_activities(activity_name, protocol_visibility_obj, protocol_order):
@@ -785,13 +811,19 @@ def process_csv(csv_file, abs_folder_path, protocol_name):
 
     try:
         df = pd.read_csv(csv_file, encoding="utf-8-sig")
-        df.columns = df.columns.map(lambda x: x.strip().strip('"').lstrip("\ufeff"))
+        df.columns = df.columns.map(
+            lambda x: x.strip().strip('"').lstrip("\ufeff")
+        )
 
         required_columns = ["Form Name", "Variable / Field Name", "Field Type"]
-        missing_columns = [col for col in required_columns if col not in df.columns]
+        missing_columns = [
+            col for col in required_columns if col not in df.columns
+        ]
         if missing_columns:
-            raise ValueError(f"Missing required columns: {', '.join(missing_columns)}")
-
+            raise ValueError(
+                f"Missing required columns: {', '.join(missing_columns)}"
+            )
+
         # Initialize structures for each unique form
         unique_forms = df["Form Name"].dropna().unique()
         if len(unique_forms) == 0:
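The column-cleaning lambda above (taken verbatim from the new line) strips surrounding whitespace, stray double quotes, and a leading UTF-8 BOM from header names; the sample headers below are invented for illustration.

    import pandas as pd

    df = pd.DataFrame(
        columns=["\ufeffVariable / Field Name", '"Form Name"', " Field Type "]
    )
    df.columns = df.columns.map(lambda x: x.strip().strip('"').lstrip("\ufeff"))
    print(list(df.columns))
    # ['Variable / Field Name', 'Form Name', 'Field Type']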
@@ -844,22 +876,31 @@ def process_csv(csv_file, abs_folder_path, protocol_name):
             field_annotation = row_dict.get("Field Annotation", "")
 
             # Add to compute list if needed
-            if field_type in COMPUTE_LIST and row_dict.get("Choices, Calculations, OR Slider Labels"):
-                condition = normalize_condition(row_dict["Choices, Calculations, OR Slider Labels"], field_type=field_type)
+            if field_type in COMPUTE_LIST and row_dict.get(
+                "Choices, Calculations, OR Slider Labels"
+            ):
+                condition = normalize_condition(
+                    row_dict["Choices, Calculations, OR Slider Labels"],
+                    field_type=field_type,
+                )
                 if condition:
-                    compute[form_name].append({
-                        "variableName": field_name,
-                        "jsExpression": condition
-                    })
-            elif field_annotation and "@CALCTEXT" in str(field_annotation).upper():
+                    compute[form_name].append(
+                        {"variableName": field_name, "jsExpression": condition}
+                    )
+            elif (
+                field_annotation
+                and "@CALCTEXT" in str(field_annotation).upper()
+            ):
                 match = re.search(r"@CALCTEXT\((.*)\)", field_annotation)
                 if match:
                     js_expression = normalize_condition(match.group(1))
                     if js_expression:
-                        compute[form_name].append({
-                            "variableName": field_name,
-                            "jsExpression": js_expression
-                        })
+                        compute[form_name].append(
+                            {
+                                "variableName": field_name,
+                                "jsExpression": js_expression,
+                            }
+                        )
             else:
                 order[form_name].append(f"items/{field_name}")
 
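In the elif branch above, the JavaScript-style expression is pulled out of an @CALCTEXT annotation with a regular expression before being normalized. A standalone sketch with an invented annotation:

    import re

    field_annotation = "@READONLY @CALCTEXT([weight]/([height]*[height]))"
    match = re.search(r"@CALCTEXT\((.*)\)", field_annotation)
    if match:
        print(match.group(1))
        # [weight]/([height]*[height])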

@@ -868,7 +909,9 @@ def process_csv(csv_file, abs_folder_path, protocol_name):
             if not datas[form_name]:
                 print(f"Warning: Form '{form_name}' has no valid fields")
             if not order[form_name] and not compute[form_name]:
-                print(f"Warning: Form '{form_name}' has no order or compute fields")
+                print(
+                    f"Warning: Form '{form_name}' has no order or compute fields"
+                )
 
         # Create protocol directory
         protocol_dir = Path(abs_folder_path) / protocol_name
@@ -879,6 +922,7 @@ def process_csv(csv_file, abs_folder_path, protocol_name):
     except pd.errors.EmptyDataError:
         raise ValueError("The CSV file is empty")
 
+
 # todo adding output path
 def redcap2reproschema(
     csv_file, yaml_file, output_path, schema_context_url=None
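The final hunk shows the signature of the converter's entry point. A hedged usage sketch: the paths are placeholders, and the role of each argument is inferred (the CSV is a REDCap data dictionary export, the YAML file holds protocol-level configuration, and output_path is where the ReproSchema tree is written).

    from reproschema.redcap2reproschema import redcap2reproschema

    redcap2reproschema(
        csv_file="redcap_data_dictionary.csv",
        yaml_file="protocol_config.yaml",
        output_path="output",
    )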
