Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 73 additions & 0 deletions business_objects/embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,23 @@
from ..util import prevent_sql_injection


ALL_EMBEDDINGS_WHITELIST = {
"id",
"name",
"custom",
"type",
"state",
"progress",
"dimension",
"count",
"platform",
"model",
"filter_attributes",
"attribute_id",
}
EMBEDDINGS_WHITELIST_COLUMNS_STRING = None


def get(project_id: str, embedding_id: str) -> Embedding:
return (
session.query(Embedding)
Expand Down Expand Up @@ -104,6 +121,62 @@ def get_all_embeddings_by_project_id(project_id: str) -> List[Embedding]:
return session.query(Embedding).filter(Embedding.project_id == project_id).all()


def get_all_embeddings_by_project_id_extended(project_id: str) -> List[Dict[str, Any]]:
project_id = prevent_sql_injection(project_id, isinstance(project_id, str))
query = __get_all_embeddings_by_project_id_extended_query(project_id)
return general.execute_all(query)


def __get_embedding_whitelist_columns_string() -> str:
global EMBEDDINGS_WHITELIST_COLUMNS_STRING
if EMBEDDINGS_WHITELIST_COLUMNS_STRING is None:
EMBEDDINGS_WHITELIST_COLUMNS_STRING = general.construct_select_columns(
Embedding.__tablename__,
prefix="e",
include_columns=ALL_EMBEDDINGS_WHITELIST,
)
return EMBEDDINGS_WHITELIST_COLUMNS_STRING


def __get_all_embeddings_by_project_id_extended_query(project_id: str) -> str:
return f"""
WITH num_recs AS (
SELECT COUNT(r.*) number_records
FROM record r
WHERE r.project_id = '{project_id}'
), e AS (
SELECT
{__get_embedding_whitelist_columns_string()},
nr.number_records,
(SELECT COUNT(et.*) FROM embedding_tensor et WHERE et.embedding_id = e.id) tensor_count
FROM embedding e, num_recs nr
WHERE e.project_id = '{project_id}'
)
SELECT
{__get_embedding_whitelist_columns_string()},
CASE
WHEN e."type" = '{enums.EmbeddingType.ON_ATTRIBUTE.value}' THEN (
SELECT json_array_length(et."data")
FROM embedding_tensor et
WHERE et.embedding_id = e.id
LIMIT 1)
WHEN e."type" = '{enums.EmbeddingType.ON_TOKEN.value}' THEN (
SELECT json_array_length(et."data"->0)
FROM embedding_tensor et
WHERE et.embedding_id = e.id
LIMIT 1)
END dimension,
CASE
WHEN e."state" = '{enums.EmbeddingState.FINISHED.value}' THEN
1.
WHEN e."state" IN ('{enums.EmbeddingState.INITIALIZING.value}', '{enums.EmbeddingState.WAITING.value}') THEN
0.
ELSE LEAST(0.1 + (e.tensor_count / (e.number_records * 0.9)), 0.99)
END progress
FROM e
"""


def get_finished_embeddings(project_id: str) -> List[Embedding]:
return (
session.query(Embedding)
Expand Down
59 changes: 29 additions & 30 deletions business_objects/labeling_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,48 +70,47 @@ def get_task_and_label_by_ids_and_type(
def get_labeling_tasks_by_project_id_full(project_id: str) -> Row:
project_id = prevent_sql_injection(project_id, isinstance(project_id, str))
query = f"""
WITH attribute_select AS (
SELECT id, jsonb_build_object('id',id,'name', NAME,'relative_position', relative_position, 'data_type', data_Type) a_data
WITH attribute_select AS (
SELECT id, a.NAME, relative_position
FROM attribute a
WHERE project_id = '{project_id}'
),
label_select AS (
SELECT labeling_Task_id, jsonb_build_object('edges',array_agg(jsonb_build_object('node',jsonb_build_object('id',id,'name', NAME,'color', color, 'hotkey', hotkey)))) l_data
label_select AS (
SELECT labeling_Task_id, array_agg(jsonb_build_object('id',id,'name', NAME,'color', color, 'hotkey', hotkey)) l_data
FROM labeling_task_label ltl
WHERE project_id = '{project_id}'
GROUP BY 1
),
),
is_select AS (
SELECT labeling_task_id, jsonb_build_object('edges',array_agg(jsonb_build_object('node',jsonb_build_object('id',id,'type', type,'return_type', return_type, 'description', description,'name',NAME)))) i_data
SELECT labeling_task_id, array_agg(jsonb_build_object('id',id,'type', type,'return_type', return_type, 'description', description,'name',NAME)) i_data
FROM information_source _is
WHERE project_id = '{project_id}'
GROUP BY 1
)

SELECT
'{project_id}' id,
jsonb_build_object('edges',array_agg(jsonb_build_object('node', lt_data))) labeling_tasks
FROM (
SELECT
jsonb_build_object(
'id',lt.id,
'name', NAME,
'task_target', task_target,
'task_type', task_type,
'attribute',a.a_data,
'labels',COALESCE(l.l_data,jsonb_build_object('edges',ARRAY[]::jsonb[])),
'information_sources',COALESCE(i.i_data,jsonb_build_object('edges',ARRAY[]::jsonb[]))
) lt_data
FROM labeling_task lt
LEFT JOIN attribute_select a
ON lt.attribute_id = a.id
LEFT JOIN label_select l
ON l.labeling_Task_id = lt.id
LEFT JOIN is_select i
ON i.labeling_task_id = lt.id
WHERE project_id = '{project_id}'
) x """
return general.execute_first(query)
SELECT array_agg(
jsonb_build_object(
'id',lt.id,
'name', lt.NAME,
'task_target', task_target,
'task_type', task_type,
'target_id',CASE WHEN lt.task_target = '{enums.LabelingTaskTarget.ON_ATTRIBUTE.value}' THEN a.id::TEXT ELSE '' END,
'target_name',CASE WHEN lt.task_target = '{enums.LabelingTaskTarget.ON_ATTRIBUTE.value}' THEN a.name ELSE 'Full Record' END,
'labels',COALESCE(l.l_data,ARRAY[]::jsonb[]),
'information_sources',COALESCE(i.i_data,ARRAY[]::jsonb[])
) ORDER BY a.relative_position, a.name) lt_data
FROM labeling_task lt
LEFT JOIN attribute_select a
ON lt.attribute_id = a.id
LEFT JOIN label_select l
ON l.labeling_Task_id = lt.id
LEFT JOIN is_select i
ON i.labeling_task_id = lt.id
WHERE project_id = '{project_id}' """
values = general.execute_first(query)
if values and values[0]:
return values[0]
return []


def get_task_name_id_dict(project_id: str) -> Dict[str, str]:
Expand Down
75 changes: 41 additions & 34 deletions business_objects/organization.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,9 @@ def get_organization_overview_stats(
values = general.execute_first(
__get_organization_overview_stats_query(organization_id)
)
if values:
if values and values[0]:
return values[0]
return []


def get_user_count(organization_id: str) -> int:
Expand All @@ -69,41 +70,47 @@ def get_user_count(organization_id: str) -> int:
def __get_organization_overview_stats_query(organization_id: str):
return f"""
WITH labeled_records AS (
SELECT project_id, source_type, COUNT(*) source_records
FROM (
SELECT rla.project_id, rla.record_id, rla.source_type
FROM record r
INNER JOIN record_label_association rla
ON r.project_id = rla.project_id AND r.id = rla.record_id AND r.category = '{enums.RecordCategory.SCALE.value}'
INNER JOIN project p
ON rla.project_id = p.id
WHERE p.organization_id = '{organization_id}'
AND rla.source_type IN ('{enums.LabelSource.MANUAL.value}', '{enums.LabelSource.WEAK_SUPERVISION.value}')
GROUP BY rla.project_id, rla.record_id, rla.source_type
) r_reduction
GROUP BY project_id, source_type)

SELECT array_agg(row_to_json(x))
FROM (
SELECT
base.project_id "projectId",
base.base_count "numDataScaleUploaded",
COALESCE(lr_m.source_records,0) "numDataScaleManual",
COALESCE(lr_w.source_records,0) "numDataScaleProgrammatical"
SELECT project_id, source_type, COUNT(*) source_records
FROM (
SELECT r.project_id, COUNT(*) base_count
FROM project p
LEFT JOIN record r
ON r.project_id = p.id
SELECT rla.project_id, rla.record_id, rla.source_type
FROM record r
INNER JOIN record_label_association rla
ON r.project_id = rla.project_id AND r.id = rla.record_id AND r.category = '{enums.RecordCategory.SCALE.value}'
INNER JOIN project p
ON rla.project_id = p.id
WHERE p.organization_id = '{organization_id}'
AND p."status" != '{enums.ProjectStatus.IN_DELETION.value}'
AND r.category = '{enums.RecordCategory.SCALE.value}'
GROUP BY r.project_id
) base
LEFT JOIN labeled_records lr_m
ON base.project_id = lr_m.project_id AND lr_m.source_type = '{enums.LabelSource.MANUAL.value}'
LEFT JOIN labeled_records lr_w
ON base.project_id = lr_w.project_id AND lr_w.source_type = '{enums.LabelSource.WEAK_SUPERVISION.value}' )x
AND rla.source_type IN ('{enums.LabelSource.MANUAL.value}', '{enums.LabelSource.WEAK_SUPERVISION.value}')
GROUP BY rla.project_id, rla.record_id, rla.source_type
) r_reduction
GROUP BY project_id, source_type
) SELECT jsonb_object_agg(x."projectId", row_to_json(x))
FROM (
SELECT
*,
TRIM_SCALE(ROUND(("numDataScaleManual" * 100. / "numDataScaleUploaded")::numeric, 2)) || ' %' "manuallyLabeled",
TRIM_SCALE(ROUND(("numDataScaleProgrammatical" * 100. / "numDataScaleUploaded")::numeric, 2)) || ' %' "weaklySupervised"
FROM (
SELECT
base.project_id "projectId",
base.base_count "numDataScaleUploaded",
COALESCE(lr_m.source_records,0) "numDataScaleManual",
COALESCE(lr_w.source_records,0) "numDataScaleProgrammatical"
FROM (
SELECT r.project_id, COUNT(*) base_count
FROM project p
LEFT JOIN record r
ON r.project_id = p.id
WHERE p.organization_id = '{organization_id}'
AND p."status" != '{enums.ProjectStatus.IN_DELETION.value}'
AND r.category = '{enums.RecordCategory.SCALE.value}'
GROUP BY r.project_id
) base
LEFT JOIN labeled_records lr_m
ON base.project_id = lr_m.project_id AND lr_m.source_type = '{enums.LabelSource.MANUAL.value}'
LEFT JOIN labeled_records lr_w
ON base.project_id = lr_w.project_id AND lr_w.source_type = '{enums.LabelSource.WEAK_SUPERVISION.value}'
) y
) x
"""


Expand Down
35 changes: 3 additions & 32 deletions util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from collections.abc import Iterable as collections_abc_Iterable
from re import sub, match, compile
import sqlalchemy
import decimal
from uuid import UUID
from datetime import datetime

Expand Down Expand Up @@ -107,38 +108,6 @@ def sql_alchemy_to_dict(
return result


def pack_edges_node(result, name: str, max_lvl: Optional[int] = None):

def convert_value(value, max_lvl: int):
new_lvl = max_lvl - 1 if max_lvl is not None else None
if isinstance(value, list):
return {
"edges": [
{
"node": (
convert_value(item, new_lvl)
if max_lvl is None or max_lvl > 0
else item
)
}
for item in value
]
}
elif isinstance(value, dict):
return {
key: (
convert_value(val, new_lvl)
if max_lvl is None or max_lvl > 0
else val
)
for key, val in value.items()
}
else:
return value

return {"data": {name: convert_value(result, max_lvl)}}


def __sql_alchemy_to_dict(
sql_alchemy_object: Any,
column_whitelist: Optional[Iterable[str]] = None,
Expand Down Expand Up @@ -217,6 +186,8 @@ def to_frontend_obj_raw(value: Union[List, Dict]):
def to_json_serializable(x: Any):
if isinstance(x, datetime):
return x.isoformat()
elif isinstance(x, decimal.Decimal):
return float(x)
elif isinstance(x, UUID):
return str(x)
else:
Expand Down