Skip to content

Commit 995e84f

Browse files
authored
Enable Attribute calculation for different data types (#6)
* Check the data type of calculated attributes * Put the attribute value in between grave accents (backticks) in the error message * PR comment: FLOAT attributes can have Python data type int or float
1 parent 3470503 commit 995e84f

File tree

3 files changed

+33
-4
lines changed

3 files changed

+33
-4
lines changed

Dockerfile

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1-
FROM python:3.9-slim
1+
FROM python:3.10-slim
22

33
RUN apt update && apt install -y curl
44

5-
COPY . .
5+
COPY requirements.txt .
66

77
RUN pip3 install -r requirements.txt
88

9+
COPY . .
10+
911
ENTRYPOINT ["/run.sh"]

run.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@
33
/usr/bin/curl -s "$1" > docbin_full.json;
44
/usr/bin/curl -s "$2" > attribute_calculators.py;
55

6-
/usr/local/bin/python run_ac.py "$3" "$4";
6+
/usr/local/bin/python -u run_ac.py "$3" "$4" "$5";

run_ac.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,24 @@
55
from spacy.tokens import DocBin
66

77

8+
def get_check_data_type_function(data_type):
    """Return the accepted Python types and a validator for ``data_type``.

    Args:
        data_type: Name of the attribute data type. Known names are
            "INTEGER", "FLOAT", "BOOLEAN", "CATEGORY" and "TEXT".

    Returns:
        A tuple ``(py_data_types, check_data_type)`` where ``py_data_types``
        is the list of Python types accepted for ``data_type`` and
        ``check_data_type`` is a one-argument callable returning ``True``
        when the given value is an instance of one of those types.

    Raises:
        ValueError: If ``data_type`` is not one of the known names.
    """
    accepted_types_by_data_type = {
        "INTEGER": [int],
        "FLOAT": [int, float],
        "BOOLEAN": [bool],
        "CATEGORY": [str],
        "TEXT": [str],
    }
    try:
        py_data_types = accepted_types_by_data_type[data_type]
    except (KeyError, TypeError):
        # TypeError covers unhashable input so every unknown value is
        # reported with the same exception type.
        raise ValueError(f"Unknown data type: {data_type}") from None

    accepted = tuple(py_data_types)
    bool_is_accepted = bool in py_data_types

    def check_data_type(value):
        # bool is a subclass of int, so a plain isinstance check would let
        # True/False pass as INTEGER or FLOAT. Reject booleans unless the
        # data type explicitly accepts them.
        if isinstance(value, bool) and not bool_is_accepted:
            return False
        return isinstance(value, accepted)

    return py_data_types, check_data_type
24+
25+
826
def load_data_dict(record):
927
if record["bytes"][:2] == "\\x":
1028
record["bytes"] = record["bytes"][2:]
@@ -33,7 +51,7 @@ def parse_data_to_record_dict(record_chunk):
3351

3452

3553
if __name__ == "__main__":
36-
_, iso2_code, payload_url = sys.argv
54+
_, iso2_code, payload_url, data_type = sys.argv
3755

3856
print("Preparing data for attribute calculation.")
3957

@@ -48,9 +66,18 @@ def parse_data_to_record_dict(record_chunk):
4866

4967
record_dict_list = parse_data_to_record_dict(docbin_data)
5068

69+
py_data_types, check_data_type = get_check_data_type_function(data_type)
70+
5171
print("Running attribute calculation.")
5272
calculated_attribute_by_record_id = {}
5373
for record_dict in record_dict_list:
74+
attr_value = ac(record_dict["data"])
75+
if not check_data_type(attr_value):
76+
raise ValueError(
77+
f"Attribute value `{attr_value}` is of type {type(attr_value)}, "
78+
f"but data_type {data_type} requires "
79+
f"{str(py_data_types) if len(py_data_types) > 1 else str(py_data_types[0])}."
80+
)
5481
calculated_attribute_by_record_id[record_dict["id"]] = ac(record_dict["data"])
5582

5683
print("Finished execution.")

0 commit comments

Comments
 (0)