Skip to content

Commit 8585e39

Browse files
kenyiu and ccerv1 authored
Fix lifecycle contributor state accounting and add extra QA for sqlmesh metrics pipeline (#5177)
* feat: add first time contributor metrics and corresponding test cases
* feat: update contributor_activity_classification with test included for monthly artifact
* feat: add test for first time contributor monthly metrics
* refactor(metrics): rename new_contributors -> first_time_contributors and update related logic/tests
  - Rename metric identifiers in SQL: lifecycle.sql now uses first_time_* (aliases, churn calc, and change_in fields)
  - Update tests to reference first_time_contributors_* and change_in_first_time_contributors_*
  - Remove obsolete bot_activity periodic test fixtures (monthly/quarterly/biannually)
  - metrics_factories: remove early skip for new_contributors so first-time metrics are audited
  - factory: adjust ignored incremental rule to first_time_contributors and tidy MetricsCycle error message formatting
* feat: add models and tests for last contributions to artifacts, collections, and projects
* feat: update contributor activity classification logic and add last contribution test data

---------

Co-authored-by: ken <[email protected]>
Co-authored-by: Carl Cervone <[email protected]>
1 parent 95d851e commit 8585e39

24 files changed

+1342
-1221
lines changed

warehouse/metrics_tools/factory/factory.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ def queue_query(name: str):
314314
parent_query = queries[table_name]["rendered_query"]
315315
raise MetricsCycle(
316316
textwrap.dedent(
317-
f"""Cycle from {name} to {table_name}:
317+
f"""Cycle from {name} to {table_name}:
318318
---
319319
{name}:
320320
{rendered_query.sql(dialect="duckdb", pretty=True)}
@@ -744,7 +744,7 @@ def generate_time_aggregation_model_for_rendered_query(
744744
or "funding" in query_config["table_name"]
745745
or "releases" in query_config["table_name"]
746746
or "worldchain_users_aggregation" in query_config["table_name"]
747-
or "new_contributors" in query_config["table_name"]
747+
or "first_time_contributors" in query_config["table_name"]
748748
):
749749
ignored_rules.append("incrementalmustdefinenogapsaudit")
750750

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
/*
2+
Latest contribution time for each (event_source, from_artifact_id,
to_artifact_id) group: MAX(time) over rows whose event_type is COMMIT_CODE,
ISSUE_OPENED, PULL_REQUEST_OPENED or PULL_REQUEST_MERGED.
TODO: This is a hack for now to fix performance issues with the contributor
3+
classifications. We should use some kind of factory for this in the future to
4+
get all dimensions.
NOTE(review): the source table is int_first_of_event_from_artifact__github, so
MAX(time) is taken over whatever rows that model exposes. If it holds only the
first occurrence of each event type, the result is not the true last
contribution. Confirm whether a last-of-event source was intended.
NOTE(review): has_at_least_n_rows(threshold := 0) looks like it can never
fail. Confirm the intended minimum row count.
5+
*/
6+
MODEL (
7+
name oso.int_last_contribution_to_artifact,
8+
kind FULL,
9+
partitioned_by (YEAR("time"), "event_source"),
10+
grain (time, event_source, from_artifact_id, to_artifact_id),
11+
audits (
12+
has_at_least_n_rows(threshold := 0)
13+
)
14+
);
15+
16+
SELECT
17+
MAX(time) AS time,
18+
event_source,
19+
from_artifact_id,
20+
to_artifact_id
21+
FROM oso.int_first_of_event_from_artifact__github
22+
WHERE
23+
event_type IN ('COMMIT_CODE', 'ISSUE_OPENED', 'PULL_REQUEST_OPENED', 'PULL_REQUEST_MERGED')
24+
GROUP BY
25+
event_source,
26+
from_artifact_id,
27+
to_artifact_id
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/*
2+
Latest contribution time for each (event_source, from_artifact_id,
to_collection_id) group. Rows of int_last_contribution_to_project are joined
to projects_by_collection_v1 on project id, and MAX(time) is taken per group.
TODO: This is a hack for now to fix performance issues with the contributor
3+
classifications. We should use some kind of factory for this in the future to
4+
get all dimensions.
NOTE(review): MAX(time) and the GROUP BY names are unqualified. Presumably
they resolve to the last_contribution_to_project columns and the
to_collection_id select alias. Confirm projects_by_collection_v1 exposes no
clashing column names.
NOTE(review): has_at_least_n_rows(threshold := 0) looks like it can never
fail. Confirm the intended minimum row count.
5+
*/
6+
MODEL (
7+
name oso.int_last_contribution_to_collection,
8+
kind FULL,
9+
partitioned_by (YEAR("time"), "event_source"),
10+
grain (time, event_source, from_artifact_id, to_collection_id),
11+
tags (
12+
'entity_category=collection'
13+
),
14+
audits (
15+
has_at_least_n_rows(threshold := 0)
16+
)
17+
);
18+
19+
SELECT
20+
MAX(time) AS time,
21+
last_contribution_to_project.event_source,
22+
last_contribution_to_project.from_artifact_id,
23+
projects_by_collection_v1.collection_id AS to_collection_id
24+
FROM oso.int_last_contribution_to_project AS last_contribution_to_project
25+
INNER JOIN oso.projects_by_collection_v1
26+
ON last_contribution_to_project.to_project_id = projects_by_collection_v1.project_id
27+
GROUP BY
28+
event_source,
29+
from_artifact_id,
30+
to_collection_id
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/*
2+
Latest contribution time for each (event_source, from_artifact_id,
to_project_id) group. Rows of int_last_contribution_to_artifact are joined to
artifacts_by_project_v1 on artifact id, and MAX(time) is taken per group.
TODO: This is a hack for now to fix performance issues with the contributor
3+
classifications. We should use some kind of factory for this in the future to
4+
get all dimensions.
NOTE(review): the GROUP BY names are unqualified. Presumably they resolve to
the last_contribution_to_artifact columns and the to_project_id select alias.
Confirm artifacts_by_project_v1 exposes no clashing column names.
NOTE(review): has_at_least_n_rows(threshold := 0) looks like it can never
fail. Confirm the intended minimum row count.
5+
*/
6+
MODEL (
7+
name oso.int_last_contribution_to_project,
8+
kind FULL,
9+
partitioned_by (YEAR("time"), "event_source"),
10+
grain (time, event_source, from_artifact_id, to_project_id),
11+
tags (
12+
'entity_category=project'
13+
),
14+
audits (
15+
has_at_least_n_rows(threshold := 0)
16+
)
17+
);
18+
19+
SELECT
20+
MAX(last_contribution_to_artifact.time) AS time,
21+
last_contribution_to_artifact.event_source,
22+
last_contribution_to_artifact.from_artifact_id,
23+
artifacts_by_project_v1.project_id AS to_project_id
24+
FROM oso.int_last_contribution_to_artifact AS last_contribution_to_artifact
25+
INNER JOIN oso.artifacts_by_project_v1
26+
ON last_contribution_to_artifact.to_artifact_id = artifacts_by_project_v1.artifact_id
27+
GROUP BY
28+
event_source,
29+
from_artifact_id,
30+
to_project_id

warehouse/oso_sqlmesh/models/metrics_factories.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,6 @@ def no_gaps_audit_factory(config: MetricQueryConfig) -> MetricMetadataModifiers:
5353
if "worldchain_users_aggregation" in config["table_name"]:
5454
return {}
5555

56-
if "new_contributors" in config["table_name"]:
57-
return {}
58-
5956
if "data_category=blockchain" in config["additional_tags"]:
6057
options["ignore_before"] = constants.superchain_audit_start
6158
options["ignore_after"] = constants.superchain_audit_end
@@ -256,8 +253,8 @@ def add_project_and_collection_entity_category_tags(
256253
),
257254
additional_tags=["data_category=code"],
258255
),
259-
"new_contributors": MetricQueryDef(
260-
ref="code/new_contributors.sql",
256+
"first_time_contributor": MetricQueryDef(
257+
ref="code/first_time_contributor.sql",
261258
time_aggregations=[
262259
"daily",
263260
"weekly",
@@ -266,11 +263,10 @@ def add_project_and_collection_entity_category_tags(
266263
# "biannually",
267264
"yearly",
268265
],
269-
entity_types=["artifact", "project", "collection"],
270266
over_all_time=True,
271267
metadata=MetricMetadata(
272-
display_name="New Contributors",
273-
description="Metrics related to new GitHub contributors identified by author_association = 'FIRST_TIME_CONTRIBUTOR'",
268+
display_name="First Time Contributors",
269+
description="Metrics related to first time GitHub contributors",
274270
),
275271
additional_tags=["data_category=code"],
276272
),

0 commit comments

Comments
 (0)