Skip to content

Commit 9d3e929

Browse files
authored
Merge pull request #33 from yibeichan/master
Improved functions based on a larger REDCap CSV (Bridge2AI)
2 parents 7e10185 + 891d580 commit 9d3e929

File tree

1 file changed

+85
-24
lines changed

1 file changed

+85
-24
lines changed

reproschema/redcap2reproschema.py

Lines changed: 85 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,12 @@
66
import yaml
77
from bs4 import BeautifulSoup
88

9+
# Running count of fields seen so far for each "Matrix Group Name".
# process_row increments the entry for a group and uses the new count to
# build the item id "<group name>_<count>".
# NOTE(review): module-level state; nothing in view resets it between
# conversions -- confirm whether repeated calls in one process are expected.
matrix_group_count = {}
10+
11+
12+
def clean_header(header):
    """Return a copy of a CSV row with the byte-order mark removed from its keys.

    A file saved as "UTF-8 with BOM" but read as plain UTF-8 leaves a
    leading ``"\ufeff"`` on the first column name, which makes lookups such
    as ``row["Form Name"]`` fail.

    Args:
        header: Mapping of column name to cell value, e.g. a row produced by
            ``csv.DictReader``.

    Returns:
        A new dict with the same values; every string key has any leading
        ``"\ufeff"`` characters removed. Non-string keys are kept unchanged.
    """
    # csv.DictReader stores overflow cells under its ``restkey`` (``None`` by
    # default), so keys are not guaranteed to be strings.
    return {
        (key.lstrip("\ufeff") if isinstance(key, str) else key): value
        for key, value in header.items()
    }
14+
915

1016
def normalize_condition(condition_str):
1117
re_parentheses = re.compile(r"\(([0-9]*)\)")
@@ -34,33 +40,60 @@ def process_visibility(data):
3440
return visibility_obj
3541

3642

37-
def parse_field_type_and_value(data, input_type_map):
38-
field_type = data.get("Field Type", "")
39-
43+
def parse_field_type_and_value(field, input_type_map):
    """Derive the input type and XSD value type for one data-dictionary row.

    Args:
        field: Mapping for a single row; the "Field Type" and
            "Text Validation Type OR Show Slider Number" entries are read.
        input_type_map: Mapping from field type to input type. A field type
            that is not in the map is returned unchanged as the input type.

    Returns:
        A ``(input_type, value_type)`` tuple. ``value_type`` is:
          * the mapped XSD type when a validation type is present and matches
            an entry of the table below (exactly, or by prefix for the
            entries that end in ``"_"``),
          * ``"xsd:string"`` for any other validation type,
          * ``"xsd:integer"`` for radio/dropdown fields without a validation
            type,
          * ``"xsd:string"`` otherwise.
    """
    field_type = field.get("Field Type", "")
    input_type = input_type_map.get(field_type, field_type)

    # Keys ending in "_" name a family of validation types (for example
    # "date_ymd" and "date_mdy") and are matched as prefixes; the other keys
    # must match exactly.
    value_type_map = {
        "text": "xsd:string",
        "date_": "xsd:date",
        "datetime_": "xsd:dateTime",
        "time_": "xsd:time",
        "email": "xsd:string",
        "phone": "xsd:string",
    }  # todo: input_type="signature"

    # csv.DictReader fills the missing cells of a short row with None, so the
    # value may be None even though the key exists; guard before stripping.
    validation_type = (
        field.get("Text Validation Type OR Show Slider Number") or ""
    ).strip()

    # Default value type is string.
    value_type = "xsd:string"

    if validation_type:
        if validation_type in value_type_map:
            value_type = value_type_map[validation_type]
        else:
            for name, xsd_type in value_type_map.items():
                if name.endswith("_") and validation_type.startswith(name):
                    value_type = xsd_type
                    break
    elif field_type in ("radio", "dropdown"):
        # No validation type, but choice fields carry integer-coded answers.
        value_type = "xsd:integer"

    return input_type, value_type
5573

5674

57-
def process_choices(choices_str):
75+
def process_choices(field_type, choices_str):
    """Parse a choices cell into a list of choice objects.

    The cell is expected to look like ``"1, Yes | 0, No"``: choices are
    separated by ``"|"`` and each choice is ``"value, label"`` with an
    optional third ``", image"`` part.

    Args:
        field_type: Field type of the row. Only "radio" and "dropdown" are
            handled.
        choices_str: Raw text of the choices cell.

    Returns:
        ``None`` when ``field_type`` is not radio/dropdown. Otherwise a list
        of dicts with "name" and "value" keys (plus "schema:image" for
        three-part choices). "value" is an int when the text parses as one,
        otherwise the trimmed string. Choices with fewer than two parts are
        skipped and a warning is printed.
    """
    if field_type not in ("radio", "dropdown"):  # Handle only radio and dropdown types
        return None

    choices = []
    for choice in choices_str.split("|"):
        # Choices are normally written with spaces around "|"; trim every
        # part so that padding does not leak into labels or string values.
        parts = [part.strip() for part in choice.split(", ")]
        if len(parts) < 2:
            print(
                f"Warning: Skipping invalid choice format '{choice}' in a {field_type} field"
            )
            continue

        # Try to convert the first part to an integer; if that fails, keep
        # it as a string.
        try:
            value = int(parts[0])
        except ValueError:
            value = parts[0]

        choice_obj = {"name": parts[1], "value": value}
        if len(parts) == 3:
            # The third part is treated as an image name.
            choice_obj["schema:image"] = f"{parts[2]}.png"
        choices.append(choice_obj)
    return choices
@@ -90,10 +123,12 @@ def parse_html(input_string, default_language="en"):
90123
text = element.get_text(strip=True)
91124
if text:
92125
result[lang] = text
93-
if not result:
126+
if not result: # If no text was extracted
94127
result[default_language] = soup.get_text(strip=True)
95128
else:
96-
result[default_language] = input_string
129+
result[default_language] = soup.get_text(
130+
strip=True
131+
) # Use the entire text as default language text
97132

98133
return result
99134

@@ -109,9 +144,22 @@ def process_row(
109144
response_list,
110145
additional_notes_list,
111146
):
147+
global matrix_group_count
148+
matrix_group_name = field.get("Matrix Group Name", "")
149+
if matrix_group_name:
150+
matrix_group_count[matrix_group_name] = (
151+
matrix_group_count.get(matrix_group_name, 0) + 1
152+
)
153+
item_id = f"{matrix_group_name}_{matrix_group_count[matrix_group_name]}"
154+
else:
155+
item_id = field.get("Variable / Field Name", "")
156+
112157
rowData = {
113158
"@context": schema_context_url,
114159
"@type": "reproschema:Field",
160+
"@id": item_id,
161+
"prefLabel": item_id,
162+
"description": f"{item_id} of {form_name}",
115163
}
116164

117165
field_type = field.get("Field Type", "")
@@ -124,8 +172,20 @@ def process_row(
124172
if value_type:
125173
rowData["responseOptions"] = {"valueType": value_type}
126174

175+
if field_type == "yesno":
176+
rowData["responseOptions"] = {
177+
"valueType": "xsd:boolean",
178+
"choices": [{"name": "Yes", "value": 1}, {"name": "No", "value": 0}],
179+
}
180+
127181
for key, value in field.items():
128-
if schema_map.get(key) == "allow" and value:
182+
if (
183+
schema_map.get(key) in ["question", "schema:description", "preamble"]
184+
and value
185+
):
186+
rowData.update({schema_map[key]: parse_html(value)})
187+
188+
elif schema_map.get(key) == "allow" and value:
129189
rowData.setdefault("ui", {}).update({schema_map[key]: value.split(", ")})
130190

131191
elif key in ui_list and value:
@@ -139,8 +199,9 @@ def process_row(
139199
rowData.setdefault("responseOptions", {}).update({schema_map[key]: value})
140200

141201
elif schema_map.get(key) == "choices" and value:
202+
# Pass both field_type and value to process_choices
142203
rowData.setdefault("responseOptions", {}).update(
143-
{"choices": process_choices(value)}
204+
{"choices": process_choices(field_type, value)}
144205
)
145206

146207
elif schema_map.get(key) == "scoringLogic" and value:
@@ -159,9 +220,6 @@ def process_row(
159220
{"variableName": field["Variable / Field Name"], "isVis": condition}
160221
)
161222

162-
elif key in ["question", "schema:description", "preamble"] and value:
163-
rowData.update({schema_map[key]: parse_html(value)})
164-
165223
elif key == "Identifier?" and value:
166224
identifier_val = value.lower() == "y"
167225
rowData.update(
@@ -190,6 +248,9 @@ def create_form_schema(
190248
matrix_list,
191249
scores_list,
192250
):
251+
# Use a set to track unique items and preserve order
252+
unique_order = list(dict.fromkeys(order.get(form_name, [])))
253+
193254
# Construct the JSON-LD structure
194255
json_ld = {
195256
"@context": schema_context_url,
@@ -200,7 +261,7 @@ def create_form_schema(
200261
"schemaVersion": "1.0.0-rc4",
201262
"version": "0.0.1",
202263
"ui": {
203-
"order": order.get(form_name, []),
264+
"order": unique_order,
204265
"addProperties": bl_list,
205266
"shuffle": False,
206267
},
@@ -310,6 +371,7 @@ def process_csv(
310371
with open(csv_file, mode="r", encoding="utf-8") as csvfile:
311372
reader = csv.DictReader(csvfile)
312373
for row in reader:
374+
row = clean_header(row)
313375
form_name = row["Form Name"]
314376
if form_name not in datas:
315377
datas[form_name] = []
@@ -484,7 +546,6 @@ def main():
484546
parser.add_argument("yaml_file", help="Path to the Reproschema protocol YAML file.")
485547
args = parser.parse_args()
486548

487-
# Call the main conversion function
488549
redcap2reproschema(args.csv_file, args.yaml_file)
489550

490551

0 commit comments

Comments
 (0)