 import re
 from pathlib import Path
 
+import numpy as np
 import pandas as pd
 import yaml
 from bs4 import BeautifulSoup
-import numpy as np
 
 from .context_url import CONTEXTFILE_URL
 from .jsonldutils import get_context_version
@@ -85,15 +85,16 @@ def clean_dict_nans(obj):
         return obj
     return {k: v for k, v in obj.items() if pd.notna(v)}
 
+
 # TODO: normalized condition should depend on the field type, e.g., for SQL
 def normalize_condition(condition_str, field_type=None):
     """
     Enhanced normalization of condition strings with specific handling for calc fields.
-
+
     Args:
         condition_str: The condition string to normalize
         field_type: The type of field (e.g., 'calc', 'sql')
-
+
     Returns:
         str: Normalized condition string, or None if invalid
     """
@@ -106,7 +107,7 @@ def normalize_condition(condition_str, field_type=None):
         return True
     if condition_str.lower() == "false":
         return False
-
+
     if not isinstance(condition_str, str):
         try:
             condition_str = str(condition_str)
@@ -115,17 +116,17 @@ def normalize_condition(condition_str, field_type=None):
 
     try:
         condition_str = BeautifulSoup(condition_str, "html.parser").get_text()
-
+
         # SQL/calc specific handling
         if field_type in ["sql", "calc"]:
             # For calc fields, we want to preserve function calls like Math.max
             # but normalize the spacing around operators and arguments
             replacements = [
-                (r'\s*\(\s*', '('),  # Remove spaces after opening parenthesis
-                (r'\s*\)\s*', ')'),  # Remove spaces before closing parenthesis
-                (r'\s*,\s*', ', '),  # Normalize spaces around commas
-                (r'\s+', ' '),  # Normalize multiple spaces to single space
-                (r'"', "'")  # Standardize quotes
+                (r"\s*\(\s*", "("),  # Remove spaces after opening parenthesis
+                (r"\s*\)\s*", ")"),  # Remove spaces before closing parenthesis
+                (r"\s*,\s*", ", "),  # Normalize spaces around commas
+                (r"\s+", " "),  # Normalize multiple spaces to single space
+                (r'"', "'"),  # Standardize quotes
             ]
         else:
             # Standard REDCap logic replacements for non-calc fields
@@ -136,16 +137,17 @@ def normalize_condition(condition_str, field_type=None):
                 (r"\bor\b", "||"),
                 (r"\band\b", "&&"),
                 (r"\s+", " "),
-                (r'"', "'")
+                (r'"', "'"),
             ]
-
+
         for pattern, repl in replacements:
             condition_str = re.sub(pattern, repl, condition_str)
-
+
         return condition_str.strip() or None
     except:
         return None
 
+
 def process_field_properties(data):
     """
     Process field properties from REDCap data dictionary to create a property object.
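
To make the replacement tables above concrete, here is a minimal, self-contained sketch of how the non-calc table is applied to a REDCap branching-logic string. The sample condition and the printed result are illustrative only, and the entries shown are just the tail of the table visible in this hunk; the leading entries sit outside the diff context.

import re

# Tail of the standard (non-calc) replacement table shown in the hunk above.
replacements = [
    (r"\bor\b", "||"),
    (r"\band\b", "&&"),
    (r"\s+", " "),
    (r'"', "'"),
]

condition = '[age] > 17 and [consent] = "1"'  # hypothetical branching logic
for pattern, repl in replacements:
    condition = re.sub(pattern, repl, condition)
print(condition)  # [age] > 17 && [consent] = '1'
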
@@ -191,12 +193,9 @@ def process_field_properties(data):
     """
     if not isinstance(data, dict):
         return {"variableName": "unknown", "isAbout": "items/unknown"}
-
+
     var_name = str(data.get("Variable / Field Name", "unknown")).strip()
-    prop_obj = {
-        "variableName": var_name,
-        "isAbout": f"items/{var_name}"
-    }
+    prop_obj = {"variableName": var_name, "isAbout": f"items/{var_name}"}
 
     # Handle required field consistently
     if data.get("Required Field?", "").strip().lower() == "y":
@@ -212,7 +211,11 @@ def process_field_properties(data):
     # Handle field annotations that affect visibility
     annotation = data.get("Field Annotation", "").strip().upper()
     if annotation:
-        if "@HIDDEN" in annotation or "@READONLY" in annotation or "@CALCTEXT" in annotation:
+        if (
+            "@HIDDEN" in annotation
+            or "@READONLY" in annotation
+            or "@CALCTEXT" in annotation
+        ):
             prop_obj["isVis"] = False
 
     field_type = data.get("Field Type", "").strip().lower()
@@ -227,6 +230,7 @@ def process_field_properties(data):
 
     return prop_obj
 
+
 def parse_field_type_and_value(field):
     """
     Parse field type and determine appropriate value type.
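
For orientation, a rough sketch of the property object this refactor builds for a hidden field, using only the behaviour visible in the hunks above; the row dict is hypothetical, and required-field and branching-logic handling happen in lines outside the diff.

row = {
    "Variable / Field Name": "consent_signed",  # hypothetical field
    "Field Annotation": "@HIDDEN",
}

var_name = str(row.get("Variable / Field Name", "unknown")).strip()
prop_obj = {"variableName": var_name, "isAbout": f"items/{var_name}"}

annotation = row.get("Field Annotation", "").strip().upper()
if (
    "@HIDDEN" in annotation
    or "@READONLY" in annotation
    or "@CALCTEXT" in annotation
):
    prop_obj["isVis"] = False

print(prop_obj)
# {'variableName': 'consent_signed', 'isAbout': 'items/consent_signed', 'isVis': False}
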
@@ -342,14 +346,18 @@ def process_choices(choices_str, field_name):
             # Split on first comma only
             parts = choice.split(",", 1)
             if len(parts) < 2:
-                print(f"Warning: Invalid choice format '{choice}' in {field_name}")
+                print(
+                    f"Warning: Invalid choice format '{choice}' in {field_name}"
+                )
                 continue
 
             value_part = parts[0].strip()
             label_part = parts[1].strip()
 
             if not label_part:
-                print(f"Warning: Empty label in choice '{choice}' in {field_name}")
+                print(
+                    f"Warning: Empty label in choice '{choice}' in {field_name}"
+                )
                 continue
 
             # Determine value type and convert value
@@ -378,13 +386,17 @@ def process_choices(choices_str, field_name):
             # Create choice object
             parsed_label = parse_html(label_part)
             choice_obj = {
-                "name": parsed_label if parsed_label else {"en": label_part},
+                "name": (
+                    parsed_label if parsed_label else {"en": label_part}
+                ),
                 "value": value,
             }
             choices.append(choice_obj)
 
         except (ValueError, TypeError) as e:
-            print(f"Warning: Error processing choice '{choice}' in {field_name}: {str(e)}")
+            print(
+                f"Warning: Error processing choice '{choice}' in {field_name}: {str(e)}"
+            )
             continue
 
     if not choices:
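
As context for the reflowed warnings, a minimal sketch of the split-on-first-comma parsing, assuming a typical "value, label | value, label" REDCap choices string. Value-type conversion and parse_html are outside the lines shown, so values stay as strings here.

choices_str = "1, Yes | 0, No | 99, Prefer not to answer"  # hypothetical

choices = []
for choice in choices_str.split("|"):
    choice = choice.strip()
    parts = choice.split(",", 1)  # split on the first comma only
    if len(parts) < 2:
        print(f"Warning: Invalid choice format '{choice}'")
        continue
    value_part, label_part = parts[0].strip(), parts[1].strip()
    if not label_part:
        print(f"Warning: Empty label in choice '{choice}'")
        continue
    choices.append({"name": {"en": label_part}, "value": value_part})

print(choices)
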
@@ -641,21 +653,30 @@ def create_form_schema(
     """
     try:
         # Validate inputs
-        if pd.isna(form_name).any() if isinstance(form_name, pd.Series) else pd.isna(form_name):
+        if (
+            pd.isna(form_name).any()
+            if isinstance(form_name, pd.Series)
+            else pd.isna(form_name)
+        ):
             raise ValueError("Form name is required")
 
         # Set default activity display name if not provided
-        if pd.isna(activity_display_name).any() if isinstance(activity_display_name, pd.Series) else pd.isna(activity_display_name):
+        if (
+            pd.isna(activity_display_name).any()
+            if isinstance(activity_display_name, pd.Series)
+            else pd.isna(activity_display_name)
+        ):
             activity_display_name = str(form_name).replace("_", " ").title()
 
         # Clean and validate order list
         clean_order = []
         if order is not None:
             if isinstance(order, (list, pd.Series, np.ndarray)):
                 clean_order = [
-                    str(item).strip()
-                    for item in order
-                    if not (isinstance(item, pd.Series) and item.isna().any()) and not pd.isna(item)
+                    str(item).strip()
+                    for item in order
+                    if not (isinstance(item, pd.Series) and item.isna().any())
+                    and not pd.isna(item)
                 ]
                 clean_order = list(dict.fromkeys(clean_order))
 
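
The conditional expression Black is re-wrapping in this hunk exists because pd.isna returns an element-wise Series, not a bool, when handed a Series. A short sketch of why the isinstance branch (and .any()) is needed; the helper name is illustrative only:

import pandas as pd

def is_missing(value):
    # pd.isna(Series) cannot be used directly in an if-statement,
    # hence the .any() branch for Series inputs.
    if isinstance(value, pd.Series):
        return pd.isna(value).any()
    return pd.isna(value)

print(is_missing(pd.Series(["intake", None])))  # True
print(is_missing("intake"))                     # False
print(is_missing(None))                         # True
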
@@ -664,7 +685,8 @@ def create_form_schema(
         if bl_list is not None:
             if isinstance(bl_list, (list, pd.Series, np.ndarray)):
                 clean_bl_list = [
-                    prop for prop in bl_list
+                    prop
+                    for prop in bl_list
                     if prop is not None and isinstance(prop, dict)
                 ]
 
@@ -686,7 +708,9 @@ def create_form_schema(
         if preamble is not None:
             if isinstance(preamble, pd.Series):
                 if not preamble.isna().all():
-                    parsed_preamble = parse_html(preamble.iloc[0] if len(preamble) > 0 else None)
+                    parsed_preamble = parse_html(
+                        preamble.iloc[0] if len(preamble) > 0 else None
+                    )
                     if parsed_preamble:
                         json_ld["preamble"] = parsed_preamble
             elif not pd.isna(preamble):
@@ -714,7 +738,9 @@ def create_form_schema(
         )
 
     except Exception as e:
-        raise Exception(f"Error creating form schema for {form_name}: {str(e)}")
+        raise Exception(
+            f"Error creating form schema for {form_name}: {str(e)}"
+        )
 
 
 def process_activities(activity_name, protocol_visibility_obj, protocol_order):
@@ -785,13 +811,19 @@ def process_csv(csv_file, abs_folder_path, protocol_name):
 
     try:
         df = pd.read_csv(csv_file, encoding="utf-8-sig")
-        df.columns = df.columns.map(lambda x: x.strip().strip('"').lstrip("\ufeff"))
+        df.columns = df.columns.map(
+            lambda x: x.strip().strip('"').lstrip("\ufeff")
+        )
 
         required_columns = ["Form Name", "Variable / Field Name", "Field Type"]
-        missing_columns = [col for col in required_columns if col not in df.columns]
+        missing_columns = [
+            col for col in required_columns if col not in df.columns
+        ]
         if missing_columns:
-            raise ValueError(f"Missing required columns: {', '.join(missing_columns)}")
-
+            raise ValueError(
+                f"Missing required columns: {', '.join(missing_columns)}"
+            )
+
         # Initialize structures for each unique form
         unique_forms = df["Form Name"].dropna().unique()
         if len(unique_forms) == 0:
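
For reference, the wrapped lambda above normalizes exported column headers by stripping whitespace, stray quotes, and a UTF-8 BOM. A quick sketch of its effect on hypothetical headers:

raw_columns = ["\ufeffVariable / Field Name", '"Form Name"', " Field Type "]
cleaned = [x.strip().strip('"').lstrip("\ufeff") for x in raw_columns]
print(cleaned)  # ['Variable / Field Name', 'Form Name', 'Field Type']
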
@@ -844,22 +876,31 @@ def process_csv(csv_file, abs_folder_path, protocol_name):
             field_annotation = row_dict.get("Field Annotation", "")
 
             # Add to compute list if needed
-            if field_type in COMPUTE_LIST and row_dict.get("Choices, Calculations, OR Slider Labels"):
-                condition = normalize_condition(row_dict["Choices, Calculations, OR Slider Labels"], field_type=field_type)
+            if field_type in COMPUTE_LIST and row_dict.get(
+                "Choices, Calculations, OR Slider Labels"
+            ):
+                condition = normalize_condition(
+                    row_dict["Choices, Calculations, OR Slider Labels"],
+                    field_type=field_type,
+                )
                 if condition:
-                    compute[form_name].append({
-                        "variableName": field_name,
-                        "jsExpression": condition
-                    })
-            elif field_annotation and "@CALCTEXT" in str(field_annotation).upper():
+                    compute[form_name].append(
+                        {"variableName": field_name, "jsExpression": condition}
+                    )
+            elif (
+                field_annotation
+                and "@CALCTEXT" in str(field_annotation).upper()
+            ):
                 match = re.search(r"@CALCTEXT\((.*)\)", field_annotation)
                 if match:
                     js_expression = normalize_condition(match.group(1))
                     if js_expression:
-                        compute[form_name].append({
-                            "variableName": field_name,
-                            "jsExpression": js_expression
-                        })
+                        compute[form_name].append(
+                            {
+                                "variableName": field_name,
+                                "jsExpression": js_expression,
+                            }
+                        )
             else:
                 order[form_name].append(f"items/{field_name}")
 
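
A minimal sketch of the @CALCTEXT branch reformatted above, with a hypothetical annotation; normalize_condition is omitted here, so the captured expression is shown as-is.

import re

field_annotation = "@CALCTEXT(if([weight] > 0, [weight] * 2.2, ''))"  # hypothetical

match = re.search(r"@CALCTEXT\((.*)\)", field_annotation)
if match:
    # The greedy group captures everything between the first "(" after
    # @CALCTEXT and the last ")", preserving nested parentheses.
    js_expression = match.group(1)
    print(js_expression)  # if([weight] > 0, [weight] * 2.2, '')
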
@@ -868,7 +909,9 @@ def process_csv(csv_file, abs_folder_path, protocol_name):
             if not datas[form_name]:
                 print(f"Warning: Form '{form_name}' has no valid fields")
             if not order[form_name] and not compute[form_name]:
-                print(f"Warning: Form '{form_name}' has no order or compute fields")
+                print(
+                    f"Warning: Form '{form_name}' has no order or compute fields"
+                )
 
         # Create protocol directory
         protocol_dir = Path(abs_folder_path) / protocol_name
@@ -879,6 +922,7 @@ def process_csv(csv_file, abs_folder_path, protocol_name):
     except pd.errors.EmptyDataError:
         raise ValueError("The CSV file is empty")
 
+
 # todo adding output path
 def redcap2reproschema(
     csv_file, yaml_file, output_path, schema_context_url=None