diff --git a/business_objects/embedding.py b/business_objects/embedding.py index b9f1c265..de02738b 100644 --- a/business_objects/embedding.py +++ b/business_objects/embedding.py @@ -11,6 +11,23 @@ from ..util import prevent_sql_injection +ALL_EMBEDDINGS_WHITELIST = { + "id", + "name", + "custom", + "type", + "state", + "progress", + "dimension", + "count", + "platform", + "model", + "filter_attributes", + "attribute_id", +} +EMBEDDINGS_WHITELIST_COLUMNS_STRING = None + + def get(project_id: str, embedding_id: str) -> Embedding: return ( session.query(Embedding) @@ -104,6 +121,62 @@ def get_all_embeddings_by_project_id(project_id: str) -> List[Embedding]: return session.query(Embedding).filter(Embedding.project_id == project_id).all() +def get_all_embeddings_by_project_id_extended(project_id: str) -> List[Dict[str, Any]]: + project_id = prevent_sql_injection(project_id, isinstance(project_id, str)) + query = __get_all_embeddings_by_project_id_extended_query(project_id) + return general.execute_all(query) + + +def __get_embedding_whitelist_columns_string() -> str: + global EMBEDDINGS_WHITELIST_COLUMNS_STRING + if EMBEDDINGS_WHITELIST_COLUMNS_STRING is None: + EMBEDDINGS_WHITELIST_COLUMNS_STRING = general.construct_select_columns( + Embedding.__tablename__, + prefix="e", + include_columns=ALL_EMBEDDINGS_WHITELIST, + ) + return EMBEDDINGS_WHITELIST_COLUMNS_STRING + + +def __get_all_embeddings_by_project_id_extended_query(project_id: str) -> str: + return f""" + WITH num_recs AS ( + SELECT COUNT(r.*) number_records + FROM record r + WHERE r.project_id = '{project_id}' + ), e AS ( + SELECT + {__get_embedding_whitelist_columns_string()}, + nr.number_records, + (SELECT COUNT(et.*) FROM embedding_tensor et WHERE et.embedding_id = e.id) tensor_count + FROM embedding e, num_recs nr + WHERE e.project_id = '{project_id}' + ) + SELECT + {__get_embedding_whitelist_columns_string()}, + CASE + WHEN e."type" = '{enums.EmbeddingType.ON_ATTRIBUTE.value}' THEN ( + SELECT json_array_length(et."data") + FROM embedding_tensor et + WHERE et.embedding_id = e.id + LIMIT 1) + WHEN e."type" = '{enums.EmbeddingType.ON_TOKEN.value}' THEN ( + SELECT json_array_length(et."data"->0) + FROM embedding_tensor et + WHERE et.embedding_id = e.id + LIMIT 1) + END dimension, + CASE + WHEN e."state" = '{enums.EmbeddingState.FINISHED.value}' THEN + 1. + WHEN e."state" IN ('{enums.EmbeddingState.INITIALIZING.value}', '{enums.EmbeddingState.WAITING.value}') THEN + 0. + ELSE LEAST(0.1 + (e.tensor_count / (e.number_records * 0.9)), 0.99) + END progress + FROM e + """ + + def get_finished_embeddings(project_id: str) -> List[Embedding]: return ( session.query(Embedding) diff --git a/business_objects/labeling_task.py b/business_objects/labeling_task.py index 69638332..478d95a1 100644 --- a/business_objects/labeling_task.py +++ b/business_objects/labeling_task.py @@ -70,48 +70,47 @@ def get_task_and_label_by_ids_and_type( def get_labeling_tasks_by_project_id_full(project_id: str) -> Row: project_id = prevent_sql_injection(project_id, isinstance(project_id, str)) query = f""" - WITH attribute_select AS ( - SELECT id, jsonb_build_object('id',id,'name', NAME,'relative_position', relative_position, 'data_type', data_Type) a_data + WITH attribute_select AS ( + SELECT id, a.NAME, relative_position FROM attribute a WHERE project_id = '{project_id}' ), - label_select AS ( - SELECT labeling_Task_id, jsonb_build_object('edges',array_agg(jsonb_build_object('node',jsonb_build_object('id',id,'name', NAME,'color', color, 'hotkey', hotkey)))) l_data + label_select AS ( + SELECT labeling_Task_id, array_agg(jsonb_build_object('id',id,'name', NAME,'color', color, 'hotkey', hotkey)) l_data FROM labeling_task_label ltl WHERE project_id = '{project_id}' GROUP BY 1 - ), + ), is_select AS ( - SELECT labeling_task_id, jsonb_build_object('edges',array_agg(jsonb_build_object('node',jsonb_build_object('id',id,'type', type,'return_type', return_type, 'description', description,'name',NAME)))) i_data + SELECT labeling_task_id, array_agg(jsonb_build_object('id',id,'type', type,'return_type', return_type, 'description', description,'name',NAME)) i_data FROM information_source _is WHERE project_id = '{project_id}' GROUP BY 1 ) - SELECT - '{project_id}' id, - jsonb_build_object('edges',array_agg(jsonb_build_object('node', lt_data))) labeling_tasks - FROM ( - SELECT - jsonb_build_object( - 'id',lt.id, - 'name', NAME, - 'task_target', task_target, - 'task_type', task_type, - 'attribute',a.a_data, - 'labels',COALESCE(l.l_data,jsonb_build_object('edges',ARRAY[]::jsonb[])), - 'information_sources',COALESCE(i.i_data,jsonb_build_object('edges',ARRAY[]::jsonb[])) - ) lt_data - FROM labeling_task lt - LEFT JOIN attribute_select a - ON lt.attribute_id = a.id - LEFT JOIN label_select l - ON l.labeling_Task_id = lt.id - LEFT JOIN is_select i - ON i.labeling_task_id = lt.id - WHERE project_id = '{project_id}' - ) x """ - return general.execute_first(query) + SELECT array_agg( + jsonb_build_object( + 'id',lt.id, + 'name', lt.NAME, + 'task_target', task_target, + 'task_type', task_type, + 'target_id',CASE WHEN lt.task_target = '{enums.LabelingTaskTarget.ON_ATTRIBUTE.value}' THEN a.id::TEXT ELSE '' END, + 'target_name',CASE WHEN lt.task_target = '{enums.LabelingTaskTarget.ON_ATTRIBUTE.value}' THEN a.name ELSE 'Full Record' END, + 'labels',COALESCE(l.l_data,ARRAY[]::jsonb[]), + 'information_sources',COALESCE(i.i_data,ARRAY[]::jsonb[]) + ) ORDER BY a.relative_position, a.name) lt_data + FROM labeling_task lt + LEFT JOIN attribute_select a + ON lt.attribute_id = a.id + LEFT JOIN label_select l + ON l.labeling_Task_id = lt.id + LEFT JOIN is_select i + ON i.labeling_task_id = lt.id + WHERE project_id = '{project_id}' """ + values = general.execute_first(query) + if values and values[0]: + return values[0] + return [] def get_task_name_id_dict(project_id: str) -> Dict[str, str]: diff --git a/business_objects/organization.py b/business_objects/organization.py index 7e2e462d..e20abe7e 100644 --- a/business_objects/organization.py +++ b/business_objects/organization.py @@ -58,8 +58,9 @@ def get_organization_overview_stats( values = general.execute_first( __get_organization_overview_stats_query(organization_id) ) - if values: + if values and values[0]: return values[0] + return [] def get_user_count(organization_id: str) -> int: @@ -69,41 +70,47 @@ def get_user_count(organization_id: str) -> int: def __get_organization_overview_stats_query(organization_id: str): return f""" WITH labeled_records AS ( - SELECT project_id, source_type, COUNT(*) source_records - FROM ( - SELECT rla.project_id, rla.record_id, rla.source_type - FROM record r - INNER JOIN record_label_association rla - ON r.project_id = rla.project_id AND r.id = rla.record_id AND r.category = '{enums.RecordCategory.SCALE.value}' - INNER JOIN project p - ON rla.project_id = p.id - WHERE p.organization_id = '{organization_id}' - AND rla.source_type IN ('{enums.LabelSource.MANUAL.value}', '{enums.LabelSource.WEAK_SUPERVISION.value}') - GROUP BY rla.project_id, rla.record_id, rla.source_type - ) r_reduction - GROUP BY project_id, source_type) - - SELECT array_agg(row_to_json(x)) - FROM ( - SELECT - base.project_id "projectId", - base.base_count "numDataScaleUploaded", - COALESCE(lr_m.source_records,0) "numDataScaleManual", - COALESCE(lr_w.source_records,0) "numDataScaleProgrammatical" + SELECT project_id, source_type, COUNT(*) source_records FROM ( - SELECT r.project_id, COUNT(*) base_count - FROM project p - LEFT JOIN record r - ON r.project_id = p.id + SELECT rla.project_id, rla.record_id, rla.source_type + FROM record r + INNER JOIN record_label_association rla + ON r.project_id = rla.project_id AND r.id = rla.record_id AND r.category = '{enums.RecordCategory.SCALE.value}' + INNER JOIN project p + ON rla.project_id = p.id WHERE p.organization_id = '{organization_id}' - AND p."status" != '{enums.ProjectStatus.IN_DELETION.value}' - AND r.category = '{enums.RecordCategory.SCALE.value}' - GROUP BY r.project_id - ) base - LEFT JOIN labeled_records lr_m - ON base.project_id = lr_m.project_id AND lr_m.source_type = '{enums.LabelSource.MANUAL.value}' - LEFT JOIN labeled_records lr_w - ON base.project_id = lr_w.project_id AND lr_w.source_type = '{enums.LabelSource.WEAK_SUPERVISION.value}' )x + AND rla.source_type IN ('{enums.LabelSource.MANUAL.value}', '{enums.LabelSource.WEAK_SUPERVISION.value}') + GROUP BY rla.project_id, rla.record_id, rla.source_type + ) r_reduction + GROUP BY project_id, source_type + ) SELECT jsonb_object_agg(x."projectId", row_to_json(x)) + FROM ( + SELECT + *, + TRIM_SCALE(ROUND(("numDataScaleManual" * 100. / "numDataScaleUploaded")::numeric, 2)) || ' %' "manuallyLabeled", + TRIM_SCALE(ROUND(("numDataScaleProgrammatical" * 100. / "numDataScaleUploaded")::numeric, 2)) || ' %' "weaklySupervised" + FROM ( + SELECT + base.project_id "projectId", + base.base_count "numDataScaleUploaded", + COALESCE(lr_m.source_records,0) "numDataScaleManual", + COALESCE(lr_w.source_records,0) "numDataScaleProgrammatical" + FROM ( + SELECT r.project_id, COUNT(*) base_count + FROM project p + LEFT JOIN record r + ON r.project_id = p.id + WHERE p.organization_id = '{organization_id}' + AND p."status" != '{enums.ProjectStatus.IN_DELETION.value}' + AND r.category = '{enums.RecordCategory.SCALE.value}' + GROUP BY r.project_id + ) base + LEFT JOIN labeled_records lr_m + ON base.project_id = lr_m.project_id AND lr_m.source_type = '{enums.LabelSource.MANUAL.value}' + LEFT JOIN labeled_records lr_w + ON base.project_id = lr_w.project_id AND lr_w.source_type = '{enums.LabelSource.WEAK_SUPERVISION.value}' + ) y + ) x """ diff --git a/util.py b/util.py index 7de301ee..c14a7501 100644 --- a/util.py +++ b/util.py @@ -4,6 +4,7 @@ from collections.abc import Iterable as collections_abc_Iterable from re import sub, match, compile import sqlalchemy +import decimal from uuid import UUID from datetime import datetime @@ -107,38 +108,6 @@ def sql_alchemy_to_dict( return result -def pack_edges_node(result, name: str, max_lvl: Optional[int] = None): - - def convert_value(value, max_lvl: int): - new_lvl = max_lvl - 1 if max_lvl is not None else None - if isinstance(value, list): - return { - "edges": [ - { - "node": ( - convert_value(item, new_lvl) - if max_lvl is None or max_lvl > 0 - else item - ) - } - for item in value - ] - } - elif isinstance(value, dict): - return { - key: ( - convert_value(val, new_lvl) - if max_lvl is None or max_lvl > 0 - else val - ) - for key, val in value.items() - } - else: - return value - - return {"data": {name: convert_value(result, max_lvl)}} - - def __sql_alchemy_to_dict( sql_alchemy_object: Any, column_whitelist: Optional[Iterable[str]] = None, @@ -217,6 +186,8 @@ def to_frontend_obj_raw(value: Union[List, Dict]): def to_json_serializable(x: Any): if isinstance(x, datetime): return x.isoformat() + elif isinstance(x, decimal.Decimal): + return float(x) elif isinstance(x, UUID): return str(x) else: