perf: unprototype run_ac.py

andhreljaKern · andhreljaKern · commit e12aea111345 · 2025-01-30T15:02:44.000+01:00
diff --git a/run_ac.py b/run_ac.py
@@ -1,13 +1,14 @@
+from typing import Any, Dict, List, Generator, Callable, Tuple, Type
+import asyncio
 import json
 import requests
 import spacy
 import sys
-import asyncio
 from mustache import prepare_and_render_mustache
 from spacy.tokens import DocBin
 
 
-def get_check_data_type_function(data_type):
+def get_check_data_type_function(data_type: str) -> Tuple[List[Type], Callable]:
     if data_type == "INTEGER":
         return [int], __check_data_type_integer
     elif data_type == "FLOAT":
@@ -26,13 +27,13 @@ def get_check_data_type_function(data_type):
         raise ValueError(f"Unknown data type: {data_type}")
 
 
-def __check_data_type_integer(attr_value):
+def __check_data_type_integer(attr_value: Any) -> bool:
     if attr_value is not None and not isinstance(attr_value, int):
         return False
     return True
 
 
-def __check_data_type_float(attr_value):
+def __check_data_type_float(attr_value: Any) -> bool:
     if (
         attr_value is not None
         and not isinstance(attr_value, float)
@@ -42,27 +43,27 @@ def __check_data_type_float(attr_value):
     return True
 
 
-def __check_data_type_boolean(attr_value):
+def __check_data_type_boolean(attr_value: Any) -> bool:
     if not isinstance(attr_value, bool):
         return False
     return True
 
 
-def __check_data_type_category(attr_value):
+def __check_data_type_category(attr_value: Any) -> bool:
     if not isinstance(attr_value, str):
         return False
     if attr_value == "":
         raise ValueError("Category cannot be empty string")
     return True
 
 
-def __check_data_type_text(attr_value):
+def __check_data_type_text(attr_value: Any) -> bool:
     if not isinstance(attr_value, str):
         return False
     return True
 
 
-def __check_data_type_embedding_list(attr_value):
+def __check_data_type_embedding_list(attr_value: Any) -> bool:
     if not isinstance(attr_value, list):
         return False
     for e in attr_value:
@@ -75,7 +76,9 @@ def __print_progress(progress: float) -> None:
     print(f"progress: {progress}", flush=True)
 
 
-def load_data_dict(record):
+def load_data_dict(record: Dict[str, Any]) -> Dict[str, Any]:
+    global vocab
+
     if record["bytes"][:2] == "\\x":
         record["bytes"] = record["bytes"][2:]
     else:
@@ -95,13 +98,68 @@ def load_data_dict(record):
     return data_dict
 
 
-def parse_data_to_record_dict(record_chunk):
+def parse_data_to_record_dict(
+    record_chunk: List[Dict[str, Any]]
+) -> List[Dict[str, Any]]:
     result = []
     for r in record_chunk:
         result.append({"id": r["record_id"], "data": load_data_dict(r)})
     return result
 
 
+def save_ac_value(record_id: str, attr_value: Any) -> None:
+    """Type-check `attr_value`, store it under `record_id`, and report progress."""
+    global processed_records  # the only module global that is rebound here
+    if not check_data_type(attr_value):
+        expected = py_data_types if len(py_data_types) > 1 else py_data_types[0]
+        raise ValueError(
+            f"Attribute value `{attr_value}` is of type {type(attr_value)}, "
+            f"but data_type {data_type} requires "
+            f"{str(expected)}."
+        )
+
+    calculated_attribute_by_record_id[record_id] = attr_value
+    processed_records += 1
+    if processed_records % progress_size == 0:
+        __print_progress(round(processed_records / amount, 2))
+
+
+def process_attribute_calculation(record_dict_list: List[Dict[str, Any]]) -> None:
+    """Run `attribute_calculators.ac` on each record's data and save the result."""
+    for entry in record_dict_list:
+        save_ac_value(attr_value=attribute_calculators.ac(entry["data"]), record_id=entry["id"])
+
+
+async def process_llm_record_batch(record_dict_batch: List[Dict[str, Any]]) -> None:
+    """Sequentially render the prompt for, await, and save each record of one batch."""
+    for entry in record_dict_batch:
+        rendered_prompt = prepare_and_render_mustache(
+            DEFAULT_USER_PROMPT_A2VYBG, entry
+        )
+        # NOTE(review): attribute is shared by concurrently running batches; confirm `ac` reads it before it first suspends.
+        attribute_calculators.USER_PROMPT_A2VYBG = rendered_prompt
+        llm_answer: str = await attribute_calculators.ac(entry["data"])
+        save_ac_value(entry["id"], llm_answer)
+
+
+async def process_async_llm_calls(record_dict_list: List[Dict[str, Any]]) -> None:
+    """Split the records into contiguous batches and await all batches concurrently.
+
+    The batch size is len(record_dict_list) // NUM_WORKERS_A2VYBG, with a floor of 1,
+    so a remainder can produce more batches than NUM_WORKERS_A2VYBG.
+    """
+    # Sized from the argument itself instead of the module-level `amount`, so the
+    # batching stays correct for any list passed in.
+    total = len(record_dict_list)
+    batch_size = max(total // int(attribute_calculators.NUM_WORKERS_A2VYBG), 1)
+    # Slicing clamps at the end of the list, so the last batch may be shorter.
+    batches = (
+        record_dict_list[start : start + batch_size]
+        for start in range(0, total, batch_size)
+    )
+    await asyncio.gather(*(process_llm_record_batch(batch) for batch in batches))
+
+
 if __name__ == "__main__":
     _, iso2_code, payload_url, data_type = sys.argv
 
@@ -111,8 +169,9 @@ def parse_data_to_record_dict(record_chunk):
     # the script `labeling_functions` does not exist. It will be inserted at runtime
     import attribute_calculators
 
-    if data_type == "LLM_RESPONSE":
-        DEFAULT_USER_PROMPT_A2VYBG = attribute_calculators.USER_PROMPT_A2VYBG
+    DEFAULT_USER_PROMPT_A2VYBG = getattr(
+        attribute_calculators, "USER_PROMPT_A2VYBG", None
+    )
 
     vocab = spacy.blank(iso2_code).vocab
 
@@ -125,64 +184,19 @@ def parse_data_to_record_dict(record_chunk):
 
     print("Running attribute calculation.")
     calculated_attribute_by_record_id = {}
-    idx = 0
     amount = len(record_dict_list)
-    progress_size = min(100, amount // 10)
+    progress_size = min(
+        100,
+        max(amount // int(getattr(attribute_calculators, "NUM_WORKERS_A2VYBG", 1)), 1),
+    )
     processed_records = 0
-    __print_progress(processed_records / amount)
-
-    async def process_llm_record_batch(record_dict_batch: list):
-        """Process a batch of record_dicts, writes results into shared var calculated_attribute_by_record_id."""
-
-        for record_dict in record_dict_batch:
-            attribute_calculators.USER_PROMPT_A2VYBG = prepare_and_render_mustache(
-                DEFAULT_USER_PROMPT_A2VYBG, record_dict
-            )
-
-            attr_value: str = await attribute_calculators.ac(record_dict["data"])
-
-            if not check_data_type(attr_value):
-                raise ValueError(
-                    f"Attribute value `{attr_value}` is of type {type(attr_value)}, "
-                    f"but data_type {data_type} requires "
-                    f"{str(py_data_types) if len(py_data_types) > 1 else str(py_data_types[0])}."
-                )
-            calculated_attribute_by_record_id[record_dict["id"]] = attr_value
-            global processed_records
-            processed_records = processed_records + 1
-            if processed_records % progress_size == 0:
-                __print_progress(round(processed_records / amount, 2))
-
-    async def process_async_llm_calls(record_dict_list):
-        batch_size = max(
-            len(record_dict_list) // int(attribute_calculators.NUM_WORKERS_A2VYBG), 1
-        )
-        record_dict_batches = [
-            record_dict_list[i : i + batch_size]
-            for i in range(0, len(record_dict_list), batch_size)
-        ]
-        tasks = [process_llm_record_batch(batch) for batch in record_dict_batches]
-        await asyncio.gather(*tasks)
+    __print_progress(0.0)
 
     if data_type == "LLM_RESPONSE":
         asyncio.run(process_async_llm_calls(record_dict_list))
-        requests.put(payload_url, json=calculated_attribute_by_record_id)
-        __print_progress(1.0)
-        print("Finished execution.")
     else:
-        for record_dict in record_dict_list:
-            idx += 1
-            if idx % progress_size == 0:
-                progress = round(idx / amount, 2)
-                __print_progress(progress)
-            attr_value = attribute_calculators.ac(record_dict["data"])
-            if not check_data_type(attr_value):
-                raise ValueError(
-                    f"Attribute value `{attr_value}` is of type {type(attr_value)}, "
-                    f"but data_type {data_type} requires "
-                    f"{str(py_data_types) if len(py_data_types) > 1 else str(py_data_types[0])}."
-                )
-            calculated_attribute_by_record_id[record_dict["id"]] = attr_value
-        __print_progress(1.0)
-        print("Finished execution.")
-        requests.put(payload_url, json=calculated_attribute_by_record_id)
+        process_attribute_calculation(record_dict_list)
+
+    __print_progress(1.0)
+    print("Finished execution.")
+    requests.put(payload_url, json=calculated_attribute_by_record_id)