55from spacy .tokens import DocBin
66
77
def get_check_data_type_function(data_type):
    """Build a type validator for the given attribute data type.

    Args:
        data_type: One of "INTEGER", "FLOAT", "BOOLEAN", "CATEGORY" or "TEXT".

    Returns:
        A tuple ``(py_data_types, check)``. ``py_data_types`` is the list of
        Python types accepted for ``data_type``; ``check`` is a callable that
        takes a value and returns True if the value is an instance of one of
        those types.

    Raises:
        ValueError: If ``data_type`` is not one of the known names.
    """
    if data_type == "INTEGER":
        py_data_types = [int]
    elif data_type == "FLOAT":
        py_data_types = [int, float]
    elif data_type == "BOOLEAN":
        py_data_types = [bool]
    elif data_type == "CATEGORY":
        py_data_types = [str]
    elif data_type == "TEXT":
        py_data_types = [str]
    else:
        raise ValueError(f"Unknown data type: {data_type}")

    accepted_types = tuple(py_data_types)
    # bool is a subclass of int, so isinstance(True, int) is True. Without
    # this flag a boolean result would silently pass as INTEGER or FLOAT.
    accepts_bool = bool in accepted_types

    def check_data_type(value):
        """Return True if ``value`` has one of the accepted Python types."""
        if isinstance(value, bool) and not accepts_bool:
            return False
        return isinstance(value, accepted_types)

    return py_data_types, check_data_type
24+
25+
826def load_data_dict (record ):
927 if record ["bytes" ][:2 ] == "\\ x" :
1028 record ["bytes" ] = record ["bytes" ][2 :]
@@ -33,7 +51,7 @@ def parse_data_to_record_dict(record_chunk):
3351
3452
3553if __name__ == "__main__" :
36- _ , iso2_code , payload_url = sys .argv
54+ _ , iso2_code , payload_url , data_type = sys .argv
3755
3856 print ("Preparing data for attribute calculation." )
3957
@@ -48,9 +66,18 @@ def parse_data_to_record_dict(record_chunk):
4866
# NOTE(review): these statements are part of the body of the
# `if __name__ == "__main__":` guard; indentation was lost in this view,
# so confirm the nesting level when applying.
record_dict_list = parse_data_to_record_dict(docbin_data)

# Resolve the validator once, before the loop; an unknown data_type raises
# ValueError here instead of after the first record has been processed.
py_data_types, check_data_type = get_check_data_type_function(data_type)

print("Running attribute calculation.")
calculated_attribute_by_record_id = {}
for record_dict in record_dict_list:
    # Call `ac` (defined outside this view) exactly once per record and
    # store the very value that was validated, instead of computing it twice.
    attr_value = ac(record_dict["data"])
    if not check_data_type(attr_value):
        raise ValueError(
            f"Attribute value `{attr_value}` is of type {type(attr_value)}, "
            f"but data_type {data_type} requires "
            f"{str(py_data_types) if len(py_data_types) > 1 else str(py_data_types[0])}."
        )
    calculated_attribute_by_record_id[record_dict["id"]] = attr_value

print("Finished execution.")
0 commit comments