From 82a25b870c700480b94c0da970bae093800f5373 Mon Sep 17 00:00:00 2001 From: JWittmeyer Date: Tue, 7 Jan 2025 10:19:05 +0100 Subject: [PATCH 1/9] Rework of lt query --- business_objects/labeling_task.py | 59 +++++++++++++++---------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/business_objects/labeling_task.py b/business_objects/labeling_task.py index 69638332..66ddbeaa 100644 --- a/business_objects/labeling_task.py +++ b/business_objects/labeling_task.py @@ -70,48 +70,47 @@ def get_task_and_label_by_ids_and_type( def get_labeling_tasks_by_project_id_full(project_id: str) -> Row: project_id = prevent_sql_injection(project_id, isinstance(project_id, str)) query = f""" - WITH attribute_select AS ( - SELECT id, jsonb_build_object('id',id,'name', NAME,'relative_position', relative_position, 'data_type', data_Type) a_data + WITH attribute_select AS ( + SELECT id, a.NAME, relative_position FROM attribute a WHERE project_id = '{project_id}' ), - label_select AS ( - SELECT labeling_Task_id, jsonb_build_object('edges',array_agg(jsonb_build_object('node',jsonb_build_object('id',id,'name', NAME,'color', color, 'hotkey', hotkey)))) l_data + label_select AS ( + SELECT labeling_Task_id, array_agg(jsonb_build_object('id',id,'name', NAME,'color', color, 'hotkey', hotkey)) l_data FROM labeling_task_label ltl WHERE project_id = '{project_id}' GROUP BY 1 - ), + ), is_select AS ( - SELECT labeling_task_id, jsonb_build_object('edges',array_agg(jsonb_build_object('node',jsonb_build_object('id',id,'type', type,'return_type', return_type, 'description', description,'name',NAME)))) i_data + SELECT labeling_task_id, array_agg(jsonb_build_object('id',id,'type', type,'return_type', return_type, 'description', description,'name',NAME)) i_data FROM information_source _is WHERE project_id = '{project_id}' GROUP BY 1 ) - SELECT - '{project_id}' id, - jsonb_build_object('edges',array_agg(jsonb_build_object('node', lt_data))) labeling_tasks - FROM ( - SELECT - jsonb_build_object( - 'id',lt.id, - 'name', NAME, - 'task_target', task_target, - 'task_type', task_type, - 'attribute',a.a_data, - 'labels',COALESCE(l.l_data,jsonb_build_object('edges',ARRAY[]::jsonb[])), - 'information_sources',COALESCE(i.i_data,jsonb_build_object('edges',ARRAY[]::jsonb[])) - ) lt_data - FROM labeling_task lt - LEFT JOIN attribute_select a - ON lt.attribute_id = a.id - LEFT JOIN label_select l - ON l.labeling_Task_id = lt.id - LEFT JOIN is_select i - ON i.labeling_task_id = lt.id - WHERE project_id = '{project_id}' - ) x """ - return general.execute_first(query) + SELECT array_agg( + jsonb_build_object( + 'id',lt.id, + 'name', lt.NAME, + 'task_target', task_target, + 'task_type', task_type, + 'target_id',CASE WHEN lt.task_target = '{enums.LabelingTaskTarget.ON_ATTRIBUTE.value}' THEN a.id::TEXT ELSE '' END, + 'target_name',CASE WHEN lt.task_target = '{enums.LabelingTaskTarget.ON_ATTRIBUTE.value}' THEN a.name ELSE '' END, + 'labels',COALESCE(l.l_data,ARRAY[]::jsonb[]), + 'information_sources',COALESCE(i.i_data,ARRAY[]::jsonb[]) + ) ORDER BY a.relative_position, a.name) lt_data + FROM labeling_task lt + LEFT JOIN attribute_select a + ON lt.attribute_id = a.id + LEFT JOIN label_select l + ON l.labeling_Task_id = lt.id + LEFT JOIN is_select i + ON i.labeling_task_id = lt.id + WHERE project_id = '{project_id}' """ + values = general.execute_first(query) + if values and values[0]: + return values[0] + return [] def get_task_name_id_dict(project_id: str) -> Dict[str, str]: From 09d995f156742f5e6907a0514862d5966f5fc30c Mon Sep 17 00:00:00 2001 From: LennartSchmidtKern Date: Tue, 7 Jan 2025 11:41:43 +0100 Subject: [PATCH 2/9] fix full record --- business_objects/labeling_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/business_objects/labeling_task.py b/business_objects/labeling_task.py index 66ddbeaa..478d95a1 100644 --- a/business_objects/labeling_task.py +++ b/business_objects/labeling_task.py @@ -95,7 +95,7 @@ def get_labeling_tasks_by_project_id_full(project_id: str) -> Row: 'task_target', task_target, 'task_type', task_type, 'target_id',CASE WHEN lt.task_target = '{enums.LabelingTaskTarget.ON_ATTRIBUTE.value}' THEN a.id::TEXT ELSE '' END, - 'target_name',CASE WHEN lt.task_target = '{enums.LabelingTaskTarget.ON_ATTRIBUTE.value}' THEN a.name ELSE '' END, + 'target_name',CASE WHEN lt.task_target = '{enums.LabelingTaskTarget.ON_ATTRIBUTE.value}' THEN a.name ELSE 'Full Record' END, 'labels',COALESCE(l.l_data,ARRAY[]::jsonb[]), 'information_sources',COALESCE(i.i_data,ARRAY[]::jsonb[]) ) ORDER BY a.relative_position, a.name) lt_data From b7df3b00c514729f4777fbbc51b3039647538f87 Mon Sep 17 00:00:00 2001 From: LennartSchmidtKern Date: Tue, 7 Jan 2025 16:27:25 +0100 Subject: [PATCH 3/9] remove edge helpter --- util.py | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/util.py b/util.py index 7de301ee..f78a9344 100644 --- a/util.py +++ b/util.py @@ -107,38 +107,6 @@ def sql_alchemy_to_dict( return result -def pack_edges_node(result, name: str, max_lvl: Optional[int] = None): - - def convert_value(value, max_lvl: int): - new_lvl = max_lvl - 1 if max_lvl is not None else None - if isinstance(value, list): - return { - "edges": [ - { - "node": ( - convert_value(item, new_lvl) - if max_lvl is None or max_lvl > 0 - else item - ) - } - for item in value - ] - } - elif isinstance(value, dict): - return { - key: ( - convert_value(val, new_lvl) - if max_lvl is None or max_lvl > 0 - else val - ) - for key, val in value.items() - } - else: - return value - - return {"data": {name: convert_value(result, max_lvl)}} - - def __sql_alchemy_to_dict( sql_alchemy_object: Any, column_whitelist: Optional[Iterable[str]] = None, From fe8fc3f834b233cd2ca27e986f4ce61afb1ae9b1 Mon Sep 17 00:00:00 2001 From: LennartSchmidtKern Date: Wed, 8 Jan 2025 15:52:34 +0100 Subject: [PATCH 4/9] org fix --- business_objects/organization.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/business_objects/organization.py b/business_objects/organization.py index 7e2e462d..c14e2184 100644 --- a/business_objects/organization.py +++ b/business_objects/organization.py @@ -58,8 +58,9 @@ def get_organization_overview_stats( values = general.execute_first( __get_organization_overview_stats_query(organization_id) ) - if values: + if values and values[0]: return values[0] + return [] def get_user_count(organization_id: str) -> int: From fe6e1d5e783bfb556256bbcae2e549f9b7f718b5 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Thu, 9 Jan 2025 13:35:43 +0100 Subject: [PATCH 5/9] perf: update overview-stats query perf: eliminate payload transform in refinery-ui --- business_objects/organization.py | 72 +++++++++++++++++--------------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/business_objects/organization.py b/business_objects/organization.py index c14e2184..e20abe7e 100644 --- a/business_objects/organization.py +++ b/business_objects/organization.py @@ -70,41 +70,47 @@ def get_user_count(organization_id: str) -> int: def __get_organization_overview_stats_query(organization_id: str): return f""" WITH labeled_records AS ( - SELECT project_id, source_type, COUNT(*) source_records - FROM ( - SELECT rla.project_id, rla.record_id, rla.source_type - FROM record r - INNER JOIN record_label_association rla - ON r.project_id = rla.project_id AND r.id = rla.record_id AND r.category = '{enums.RecordCategory.SCALE.value}' - INNER JOIN project p - ON rla.project_id = p.id - WHERE p.organization_id = '{organization_id}' - AND rla.source_type IN ('{enums.LabelSource.MANUAL.value}', '{enums.LabelSource.WEAK_SUPERVISION.value}') - GROUP BY rla.project_id, rla.record_id, rla.source_type - ) r_reduction - GROUP BY project_id, source_type) - - SELECT array_agg(row_to_json(x)) - FROM ( - SELECT - base.project_id "projectId", - base.base_count "numDataScaleUploaded", - COALESCE(lr_m.source_records,0) "numDataScaleManual", - COALESCE(lr_w.source_records,0) "numDataScaleProgrammatical" + SELECT project_id, source_type, COUNT(*) source_records FROM ( - SELECT r.project_id, COUNT(*) base_count - FROM project p - LEFT JOIN record r - ON r.project_id = p.id + SELECT rla.project_id, rla.record_id, rla.source_type + FROM record r + INNER JOIN record_label_association rla + ON r.project_id = rla.project_id AND r.id = rla.record_id AND r.category = '{enums.RecordCategory.SCALE.value}' + INNER JOIN project p + ON rla.project_id = p.id WHERE p.organization_id = '{organization_id}' - AND p."status" != '{enums.ProjectStatus.IN_DELETION.value}' - AND r.category = '{enums.RecordCategory.SCALE.value}' - GROUP BY r.project_id - ) base - LEFT JOIN labeled_records lr_m - ON base.project_id = lr_m.project_id AND lr_m.source_type = '{enums.LabelSource.MANUAL.value}' - LEFT JOIN labeled_records lr_w - ON base.project_id = lr_w.project_id AND lr_w.source_type = '{enums.LabelSource.WEAK_SUPERVISION.value}' )x + AND rla.source_type IN ('{enums.LabelSource.MANUAL.value}', '{enums.LabelSource.WEAK_SUPERVISION.value}') + GROUP BY rla.project_id, rla.record_id, rla.source_type + ) r_reduction + GROUP BY project_id, source_type + ) SELECT jsonb_object_agg(x."projectId", row_to_json(x)) + FROM ( + SELECT + *, + TRIM_SCALE(ROUND(("numDataScaleManual" * 100. / "numDataScaleUploaded")::numeric, 2)) || ' %' "manuallyLabeled", + TRIM_SCALE(ROUND(("numDataScaleProgrammatical" * 100. / "numDataScaleUploaded")::numeric, 2)) || ' %' "weaklySupervised" + FROM ( + SELECT + base.project_id "projectId", + base.base_count "numDataScaleUploaded", + COALESCE(lr_m.source_records,0) "numDataScaleManual", + COALESCE(lr_w.source_records,0) "numDataScaleProgrammatical" + FROM ( + SELECT r.project_id, COUNT(*) base_count + FROM project p + LEFT JOIN record r + ON r.project_id = p.id + WHERE p.organization_id = '{organization_id}' + AND p."status" != '{enums.ProjectStatus.IN_DELETION.value}' + AND r.category = '{enums.RecordCategory.SCALE.value}' + GROUP BY r.project_id + ) base + LEFT JOIN labeled_records lr_m + ON base.project_id = lr_m.project_id AND lr_m.source_type = '{enums.LabelSource.MANUAL.value}' + LEFT JOIN labeled_records lr_w + ON base.project_id = lr_w.project_id AND lr_w.source_type = '{enums.LabelSource.WEAK_SUPERVISION.value}' + ) y + ) x """ From da45d1687220ee6b352a486a027ade44f35baeeb Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Thu, 9 Jan 2025 15:50:15 +0100 Subject: [PATCH 6/9] perf: add extended embeddings query fix: add float cast on decimal.Decimal data types --- business_objects/embedding.py | 40 +++++++++++++++++++++++++++++++++++ util.py | 3 +++ 2 files changed, 43 insertions(+) diff --git a/business_objects/embedding.py b/business_objects/embedding.py index b9f1c265..13b49b4a 100644 --- a/business_objects/embedding.py +++ b/business_objects/embedding.py @@ -104,6 +104,46 @@ def get_all_embeddings_by_project_id(project_id: str) -> List[Embedding]: return session.query(Embedding).filter(Embedding.project_id == project_id).all() +def get_all_embeddings_by_project_id_extended(project_id: str) -> List[Dict[str, Any]]: + query = __get_all_embeddings_by_project_id_extended_query(project_id) + return general.execute_all(query) + + +def __get_all_embeddings_by_project_id_extended_query(project_id: str) -> str: + return f""" + WITH embeddings AS ( + SELECT + e.*, + (SELECT COUNT(r.*) FROM record r WHERE r.project_id = '{project_id}') number_records, + (SELECT COUNT(et.*) FROM embedding_tensor et WHERE et.embedding_id = e.id) tensor_count + FROM embedding e + WHERE e.project_id = '{project_id}' + ) + SELECT + embeddings.*, + CASE + WHEN embeddings."type" = '{enums.EmbeddingType.ON_ATTRIBUTE.value}' THEN ( + SELECT COALESCE(json_array_length(et."data"), 0) + FROM embedding_tensor et + WHERE et.embedding_id = embeddings.id + LIMIT 1) + WHEN embeddings."type" = '{enums.EmbeddingType.ON_TOKEN.value}' THEN ( + SELECT COALESCE(json_array_length(et."data"->0), 0) + FROM embedding_tensor et + WHERE et.embedding_id = embeddings.id + LIMIT 1) + END dimension, + CASE + WHEN embeddings."state" = '{enums.EmbeddingState.FINISHED.value}' THEN + 1. + WHEN embeddings."state" IN ('{enums.EmbeddingState.INITIALIZING.value}', '{enums.EmbeddingState.WAITING.value}') THEN + 0. + ELSE LEAST(0.1 + (embeddings.tensor_count / (embeddings.number_records * 0.9)), 0.99) + END progress + FROM embeddings + """ + + def get_finished_embeddings(project_id: str) -> List[Embedding]: return ( session.query(Embedding) diff --git a/util.py b/util.py index f78a9344..c14a7501 100644 --- a/util.py +++ b/util.py @@ -4,6 +4,7 @@ from collections.abc import Iterable as collections_abc_Iterable from re import sub, match, compile import sqlalchemy +import decimal from uuid import UUID from datetime import datetime @@ -185,6 +186,8 @@ def to_frontend_obj_raw(value: Union[List, Dict]): def to_json_serializable(x: Any): if isinstance(x, datetime): return x.isoformat() + elif isinstance(x, decimal.Decimal): + return float(x) elif isinstance(x, UUID): return str(x) else: From 4755d17fd4338c4c044c8c3956df17a59ff56562 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Thu, 9 Jan 2025 16:05:44 +0100 Subject: [PATCH 7/9] fix: prevent_sql_injection for project_id --- business_objects/embedding.py | 1 + 1 file changed, 1 insertion(+) diff --git a/business_objects/embedding.py b/business_objects/embedding.py index 13b49b4a..643b2fb4 100644 --- a/business_objects/embedding.py +++ b/business_objects/embedding.py @@ -105,6 +105,7 @@ def get_all_embeddings_by_project_id(project_id: str) -> List[Embedding]: def get_all_embeddings_by_project_id_extended(project_id: str) -> List[Dict[str, Any]]: + project_id = prevent_sql_injection(project_id, isinstance(project_id, str)) query = __get_all_embeddings_by_project_id_extended_query(project_id) return general.execute_all(query) From 1f47375025cb8c0e23dbf73c652d0a9eb358dbaf Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Thu, 9 Jan 2025 16:55:09 +0100 Subject: [PATCH 8/9] perf: update extended embeddings query perf: add columns whitelist caching logic --- business_objects/embedding.py | 62 ++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/business_objects/embedding.py b/business_objects/embedding.py index 643b2fb4..1f2d78ea 100644 --- a/business_objects/embedding.py +++ b/business_objects/embedding.py @@ -11,6 +11,23 @@ from ..util import prevent_sql_injection +ALL_EMBEDDINGS_WHITELIST = { + "id", + "name", + "custom", + "type", + "state", + "progress", + "dimension", + "count", + "platform", + "model", + "filter_attributes", + "attribute_id", +} +EMBEDDINGS_WHITELIST_COLUMNS_STRING = None + + def get(project_id: str, embedding_id: str) -> Embedding: return ( session.query(Embedding) @@ -110,38 +127,53 @@ def get_all_embeddings_by_project_id_extended(project_id: str) -> List[Dict[str, return general.execute_all(query) +def __get_embedding_whitelist_columns_string(): + global EMBEDDINGS_WHITELIST_COLUMNS_STRING + if EMBEDDINGS_WHITELIST_COLUMNS_STRING is None: + EMBEDDINGS_WHITELIST_COLUMNS_STRING = general.construct_select_columns( + Embedding.__tablename__, + prefix="e", + include_columns=ALL_EMBEDDINGS_WHITELIST, + ) + return EMBEDDINGS_WHITELIST_COLUMNS_STRING + + def __get_all_embeddings_by_project_id_extended_query(project_id: str) -> str: return f""" - WITH embeddings AS ( + WITH num_recs AS ( + SELECT COUNT(r.*) number_records + FROM record r + WHERE r.project_id = '{project_id}' + ), e AS ( SELECT - e.*, - (SELECT COUNT(r.*) FROM record r WHERE r.project_id = '{project_id}') number_records, + {__get_embedding_whitelist_columns_string()}, + nr.number_records, (SELECT COUNT(et.*) FROM embedding_tensor et WHERE et.embedding_id = e.id) tensor_count - FROM embedding e + FROM embedding e, num_recs nr WHERE e.project_id = '{project_id}' ) SELECT - embeddings.*, + {__get_embedding_whitelist_columns_string()}, CASE - WHEN embeddings."type" = '{enums.EmbeddingType.ON_ATTRIBUTE.value}' THEN ( - SELECT COALESCE(json_array_length(et."data"), 0) + WHEN e."type" = '{enums.EmbeddingType.ON_ATTRIBUTE.value}' THEN ( + SELECT json_array_length(et."data") FROM embedding_tensor et - WHERE et.embedding_id = embeddings.id + WHERE et.embedding_id = e.id LIMIT 1) - WHEN embeddings."type" = '{enums.EmbeddingType.ON_TOKEN.value}' THEN ( - SELECT COALESCE(json_array_length(et."data"->0), 0) + WHEN e."type" = '{enums.EmbeddingType.ON_TOKEN.value}' THEN ( + SELECT json_array_length(et."data"->0) FROM embedding_tensor et - WHERE et.embedding_id = embeddings.id + WHERE et.embedding_id = e.id LIMIT 1) END dimension, CASE - WHEN embeddings."state" = '{enums.EmbeddingState.FINISHED.value}' THEN + WHEN e."state" = '{enums.EmbeddingState.FINISHED.value}' THEN 1. - WHEN embeddings."state" IN ('{enums.EmbeddingState.INITIALIZING.value}', '{enums.EmbeddingState.WAITING.value}') THEN + WHEN e."state" IN ('{enums.EmbeddingState.INITIALIZING.value}', '{enums.EmbeddingState.WAITING.value}') THEN 0. - ELSE LEAST(0.1 + (embeddings.tensor_count / (embeddings.number_records * 0.9)), 0.99) + ELSE LEAST(0.1 + (e.tensor_count / (e.number_records * 0.9)), 0.99) END progress - FROM embeddings + FROM e """ From 9af3f491e5fe351faf343e47e4e42c43b109a948 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Mon, 13 Jan 2025 10:11:20 +0100 Subject: [PATCH 9/9] fix: add typing to new fn --- business_objects/embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/business_objects/embedding.py b/business_objects/embedding.py index 1f2d78ea..de02738b 100644 --- a/business_objects/embedding.py +++ b/business_objects/embedding.py @@ -127,7 +127,7 @@ def get_all_embeddings_by_project_id_extended(project_id: str) -> List[Dict[str, return general.execute_all(query) -def __get_embedding_whitelist_columns_string(): +def __get_embedding_whitelist_columns_string() -> str: global EMBEDDINGS_WHITELIST_COLUMNS_STRING if EMBEDDINGS_WHITELIST_COLUMNS_STRING is None: EMBEDDINGS_WHITELIST_COLUMNS_STRING = general.construct_select_columns(