Skip to content

Commit e3a40cf

Browse files
Merge pull request #42 from DataKitchen/release/4.1.1
Release/4.1.1
2 parents 9550f69 + 70fd2f6 commit e3a40cf

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

64 files changed

+2811
-955
lines changed

deploy/testgen-base.dockerfile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@ RUN apk update && apk upgrade && apk add --no-cache \
2323
openblas=0.3.28-r0 \
2424
openblas-dev=0.3.28-r0 \
2525
unixodbc=2.3.12-r0 \
26-
unixodbc-dev=2.3.12-r0
26+
unixodbc-dev=2.3.12-r0 \
27+
# Pinned versions for security
28+
xz=5.6.2-r1
2729

2830
RUN apk add --no-cache \
2931
--repository https://dl-cdn.alpinelinux.org/alpine/v3.21/community \

deploy/testgen.dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
ARG TESTGEN_BASE_LABEL=v5
1+
ARG TESTGEN_BASE_LABEL=v6
22

33
FROM datakitchen/dataops-testgen-base:${TESTGEN_BASE_LABEL} AS release-image
44

pyproject.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta"
88

99
[project]
1010
name = "dataops-testgen"
11-
version = "4.0.12"
11+
version = "4.1.2"
1212
description = "DataKitchen's Data Quality DataOps TestGen"
1313
authors = [
1414
{ "name" = "DataKitchen, Inc.", "email" = "[email protected]" },
@@ -64,7 +64,6 @@ dependencies = [
6464
"snowflake-connector-python==3.13.1",
6565
"matplotlib==3.9.2",
6666
"scipy==1.14.1",
67-
"tornado==6.4.2",
6867
"jinja2==3.1.6",
6968
]
7069

testgen/common/models/scores.py

Lines changed: 150 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import enum
22
import uuid
3-
from collections import defaultdict
43
from collections.abc import Iterable
54
from datetime import UTC, datetime
5+
from itertools import groupby
66
from typing import Literal, Self, TypedDict
77

88
import pandas as pd
@@ -69,15 +69,17 @@ class ScoreDefinition(Base):
6969
cde_score: bool = Column(Boolean, default=False, nullable=False)
7070
category: ScoreCategory | None = Column(Enum(ScoreCategory), nullable=True)
7171

72-
results: Iterable["ScoreDefinitionResult"] = relationship(
73-
"ScoreDefinitionResult",
72+
criteria: "ScoreDefinitionCriteria" = relationship(
73+
"ScoreDefinitionCriteria",
7474
cascade="all, delete-orphan",
75-
order_by="ScoreDefinitionResult.category",
7675
lazy="joined",
76+
uselist=False,
77+
single_parent=True,
7778
)
78-
filters: Iterable["ScoreDefinitionFilter"] = relationship(
79-
"ScoreDefinitionFilter",
79+
results: Iterable["ScoreDefinitionResult"] = relationship(
80+
"ScoreDefinitionResult",
8081
cascade="all, delete-orphan",
82+
order_by="ScoreDefinitionResult.category",
8183
lazy="joined",
8284
)
8385
breakdown: Iterable["ScoreDefinitionBreakdownItem"] = relationship(
@@ -102,9 +104,12 @@ def from_table_group(cls, table_group: dict) -> Self:
102104
definition.total_score = True
103105
definition.cde_score = True
104106
definition.category = ScoreCategory.dq_dimension
105-
definition.filters = [
106-
ScoreDefinitionFilter(field="table_groups_name", value=table_group["table_groups_name"]),
107-
]
107+
definition.criteria = ScoreDefinitionCriteria(
108+
operand="AND",
109+
filters=[
110+
ScoreDefinitionFilter(field="table_groups_name", value=table_group["table_groups_name"]),
111+
],
112+
)
108113
return definition
109114

110115
@classmethod
@@ -159,7 +164,7 @@ def as_score_card(self) -> "ScoreCard":
159164
score_cards/get_category_scores_by_column.sql
160165
score_cards/get_category_scores_by_dimension.sql
161166
"""
162-
if len(self.filters) <= 0:
167+
if not self.criteria.has_filters():
163168
return {
164169
"id": self.id,
165170
"project_code": self.project_code,
@@ -378,15 +383,15 @@ def recalculate_scores_history(self) -> None:
378383
self.history = list(current_history.values())
379384

380385
def _get_raw_query_filters(self, cde_only: bool = False, prefix: str | None = None) -> list[str]:
381-
values_by_field = defaultdict(list)
382-
for filter_ in self.filters:
383-
values_by_field[filter_.field].append(f"'{filter_.value}'")
384-
values_by_field["project_code"].append(f"'{self.project_code}'")
386+
extra_filters = [
387+
f"{prefix or ''}project_code = '{self.project_code}'"
388+
]
385389
if cde_only:
386-
values_by_field["critical_data_element"].append("true")
390+
extra_filters.append(f"{prefix or ''}critical_data_element = true")
387391

388392
return [
389-
f"{prefix or ''}{field} IN ({', '.join(values)})" for field, values in values_by_field.items()
393+
*extra_filters,
394+
self.criteria.get_as_sql(prefix=prefix),
390395
]
391396

392397
def to_dict(self) -> dict:
@@ -397,17 +402,144 @@ def to_dict(self) -> dict:
397402
"total_score": self.total_score,
398403
"cde_score": self.cde_score,
399404
"category": self.category.value if self.category else None,
400-
"filters": [{"field": f.field, "value": f.value} for f in self.filters],
405+
"filters": list(self.criteria),
406+
"filter_by_columns": (not self.criteria.group_by_field)
407+
if self.criteria.group_by_field is not None else None,
401408
}
402409

403410

411+
class ScoreDefinitionCriteria(Base):
412+
"""
413+
Hold the filter conditions applied for a given scorecard.
414+
415+
Properties are as follow:
416+
417+
:param operand: boolean operand to join the final filters
418+
419+
Either `AND` or `OR`. The operand is used to join the filters
420+
after they have been individually processed, grouped and
421+
formatted into valid SQL expressions.
422+
423+
:param group_by_field: boolean to group filters by field name
424+
425+
Boolean indicating that filters to same field must be combined
426+
to produce the intermediary filters that will later be joined
427+
with :property:`operand`.
428+
429+
When false, filters are individually converted to valid SQL and
430+
then joined with :property:`operand`.
431+
432+
When true, filters are sorted and grouped by field name, all
433+
filters for a given field name are combined with an `OR` boolean
434+
condition into a single filter. Then, the resulting filters
435+
are joined with :property:`operand`.
436+
437+
:param filters: a list of :class:`ScoreDefinitionFilter` objects
438+
"""
439+
440+
__tablename__ = "score_definition_criteria"
441+
442+
id: str = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
443+
definition_id: str = Column(UUID(as_uuid=True), ForeignKey("score_definitions.id", ondelete="CASCADE"))
444+
operand: Literal["AND", "OR"] = Column(String, nullable=False, default="AND")
445+
group_by_field: bool = Column(Boolean, nullable=False, default=True)
446+
filters: list["ScoreDefinitionFilter"] = relationship(
447+
"ScoreDefinitionFilter",
448+
cascade="all, delete-orphan",
449+
lazy="joined",
450+
)
451+
452+
def __str__(self):
453+
return self.get_as_sql()
454+
455+
def get_as_sql(
456+
self,
457+
prefix: str | None = None,
458+
) -> str | None:
459+
if len(self.filters) > 0:
460+
if self.group_by_field:
461+
filters_sql = []
462+
grouped_filters = groupby(sorted(self.filters, key=lambda f: f.field), key=lambda f: f.field)
463+
for _, field_filters in grouped_filters:
464+
field_filters_sql = [f.get_as_sql(prefix=prefix, operand="AND") for f in field_filters]
465+
filters_sql.append(
466+
f"({" OR ".join(field_filters_sql)})" if len(field_filters_sql) > 1 else field_filters_sql[0]
467+
)
468+
else:
469+
filters_sql = [ f.get_as_sql(prefix=prefix, operand="AND") for f in self.filters ]
470+
return f"({f' {self.operand} '.join(filters_sql)})" if len(filters_sql) > 1 else filters_sql[0]
471+
return None
472+
473+
def __iter__(self):
474+
for filter_ in self.filters:
475+
yield {
476+
"field": filter_.field,
477+
"value": filter_.value,
478+
"others": [
479+
{"field": linked_filter.field, "value": linked_filter.value}
480+
for linked_filter in filter_.next_filter
481+
] if filter_.next_filter else [],
482+
}
483+
484+
def has_filters(self) -> bool:
485+
return len(self.filters) > 0
486+
487+
@classmethod
488+
def from_filters(cls, filters: list[dict], group_by_field: bool = True) -> "ScoreDefinitionCriteria":
489+
chained_filters: list[ScoreDefinitionFilter] = []
490+
for filter_ in filters:
491+
root_filter = current_filter = ScoreDefinitionFilter(
492+
field=filter_["field"],
493+
value=filter_["value"],
494+
next_filter=None,
495+
)
496+
for linked_filter in (filter_.get("others") or []):
497+
current_filter.next_filter = ScoreDefinitionFilter(
498+
field=linked_filter["field"],
499+
value=linked_filter["value"],
500+
next_filter=None,
501+
)
502+
current_filter = current_filter.next_filter
503+
chained_filters.append(root_filter)
504+
return cls(operand="AND" if group_by_field else "OR", filters=chained_filters, group_by_field=group_by_field)
505+
506+
404507
class ScoreDefinitionFilter(Base):
405508
__tablename__ = "score_definition_filters"
406509

407510
id: str = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
408-
definition_id: str = Column(UUID(as_uuid=True), ForeignKey("score_definitions.id", ondelete="CASCADE"))
511+
criteria_id = Column(
512+
UUID(as_uuid=True),
513+
ForeignKey("score_definition_criteria.id", ondelete="CASCADE"),
514+
nullable=True,
515+
default=None,
516+
)
409517
field: str = Column(String, nullable=False)
410518
value: str = Column(String, nullable=False)
519+
next_filter_id = Column(
520+
UUID(as_uuid=True),
521+
ForeignKey("score_definition_filters.id", ondelete="CASCADE"),
522+
nullable=True,
523+
default=None,
524+
)
525+
next_filter: "ScoreDefinitionFilter" = relationship(
526+
"ScoreDefinitionFilter",
527+
cascade="all, delete-orphan",
528+
lazy="joined",
529+
uselist=False,
530+
single_parent=True,
531+
)
532+
533+
def __iter__(self):
534+
current_filter = self
535+
yield current_filter
536+
while current_filter.next_filter:
537+
yield current_filter.next_filter
538+
current_filter = current_filter.next_filter
539+
540+
def get_as_sql(self, prefix: str | None = None, operand: Literal["AND", "OR"] = "AND") -> str:
541+
sql_filters = [f"{prefix or ''}{f.field} = '{f.value}'" for f in self]
542+
return f"({f' {operand} '.join(sql_filters)})"
411543

412544

413545
class ScoreDefinitionResult(Base):

testgen/template/data_chars/data_chars_update.sql

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ FROM last_run l
104104
AND d.table_name = n.table_name
105105
)
106106
WHERE data_table_chars.table_id = d.table_id
107+
AND d.drop_date IS NULL
107108
AND n.table_name IS NULL;
108109

109110
-- ==============================================================================
@@ -221,4 +222,5 @@ FROM last_run l
221222
)
222223
WHERE data_column_chars.table_id = d.table_id
223224
AND data_column_chars.column_name = d.column_name
225+
AND d.drop_date IS NULL
224226
AND n.column_name IS NULL;

testgen/template/dbsetup/030_initialize_new_schema_structure.sql

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -622,7 +622,6 @@ CREATE TABLE auth_users (
622622
email VARCHAR(120),
623623
name VARCHAR(120),
624624
password VARCHAR(120),
625-
preauthorized BOOLEAN default false,
626625
role VARCHAR(20)
627626
);
628627

@@ -657,13 +656,23 @@ CREATE TABLE IF NOT EXISTS score_definitions (
657656
category VARCHAR(30) DEFAULT NULL
658657
);
659658

659+
CREATE TABLE IF NOT EXISTS score_definition_criteria (
660+
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
661+
definition_id UUID NOT NULL REFERENCES score_definitions(id) ON DELETE CASCADE,
662+
operand VARCHAR NOT NULL DEFAULT 'AND',
663+
group_by_field BOOLEAN NOT NULL DEFAULT true
664+
);
665+
660666
CREATE TABLE IF NOT EXISTS score_definition_filters (
661-
id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
662-
definition_id UUID CONSTRAINT score_definitions_filters_score_definitions_definition_id_fk
663-
REFERENCES score_definitions (id)
664-
ON DELETE CASCADE,
665-
field TEXT DEFAULT NULL,
666-
value TEXT DEFAULT NULL
667+
id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
668+
criteria_id UUID DEFAULT NULL CONSTRAINT score_definitions_filters_score_definition_criteria_fk
669+
REFERENCES score_definition_criteria (id)
670+
ON DELETE CASCADE,
671+
next_filter_id UUID DEFAULT NULL CONSTRAINT score_definitions_filters_score_definitions_filters_fk
672+
REFERENCES score_definition_filters (id)
673+
ON DELETE CASCADE,
674+
field TEXT DEFAULT NULL,
675+
value TEXT DEFAULT NULL
667676
);
668677

669678
CREATE TABLE IF NOT EXISTS score_definition_results (

testgen/template/dbsetup/075_grant_role_rights.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ GRANT SELECT, INSERT, DELETE, UPDATE ON
3333
{SCHEMA_NAME}.data_column_chars,
3434
{SCHEMA_NAME}.auth_users,
3535
{SCHEMA_NAME}.score_definitions,
36+
{SCHEMA_NAME}.score_definition_criteria,
3637
{SCHEMA_NAME}.score_definition_filters,
3738
{SCHEMA_NAME}.score_definition_results,
3839
{SCHEMA_NAME}.score_definition_results_breakdown,

testgen/template/dbupgrade/0137_incremental_upgrade.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@ SET SEARCH_PATH TO {SCHEMA_NAME};
22

33
UPDATE job_schedules
44
SET kwargs = kwargs - 'project_code' || jsonb_build_object('project_key', kwargs->'project_code')
5-
WHERE key = 'run-tests';
5+
WHERE key = 'run-tests';
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
SET SEARCH_PATH TO {SCHEMA_NAME};
2+
3+
CREATE TABLE score_definition_criteria (
4+
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
5+
definition_id UUID NOT NULL REFERENCES score_definitions(id) ON DELETE CASCADE,
6+
operand VARCHAR NOT NULL DEFAULT 'AND',
7+
group_by_field BOOLEAN NOT NULL DEFAULT true
8+
);
9+
10+
ALTER TABLE score_definition_filters
11+
ADD COLUMN criteria_id UUID DEFAULT NULL,
12+
ADD COLUMN next_filter_id UUID DEFAULT NULL,
13+
ADD CONSTRAINT score_definitions_filters_score_definition_criteria_fk FOREIGN KEY (criteria_id) REFERENCES score_definition_criteria (id) ON DELETE CASCADE,
14+
ADD CONSTRAINT score_definitions_filters_score_definitions_filters_fk FOREIGN KEY (next_filter_id) REFERENCES score_definition_filters (id) ON DELETE CASCADE;
15+
16+
DO $$
17+
DECLARE
18+
current_definition_id UUID;
19+
new_criteria_id UUID;
20+
definition_filter RECORD;
21+
BEGIN
22+
FOR current_definition_id IN SELECT id FROM score_definitions LOOP
23+
new_criteria_id := gen_random_uuid();
24+
RAISE NOTICE 'Definition = %', current_definition_id;
25+
RAISE NOTICE 'Create Score Criteria (AND)';
26+
EXECUTE format(
27+
'INSERT INTO score_definition_criteria (id, definition_id, operand, group_by_field) VALUES (%L, %L, %L, %L)',
28+
new_criteria_id, current_definition_id, 'AND', true
29+
);
30+
31+
FOR definition_filter IN SELECT id, field, value FROM score_definition_filters WHERE definition_id = current_definition_id LOOP
32+
RAISE NOTICE 'Link filter to Score Criteria Field=% Value=%', definition_filter.field, definition_filter.value;
33+
EXECUTE format('UPDATE score_definition_filters SET criteria_id = %L WHERE id = %L', new_criteria_id, definition_filter.id);
34+
END LOOP;
35+
END LOOP;
36+
END $$;
37+
38+
ALTER TABLE score_definition_filters DROP COLUMN definition_id;
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
SET SEARCH_PATH TO {SCHEMA_NAME};
2+
3+
ALTER TABLE auth_users DROP COLUMN preauthorized;

0 commit comments

Comments
 (0)