Skip to content

Commit e8ac997

Browse files
authored
feat(sqlmesh): create a repo features model for DDP (#5575)
* feat(sqlmesh): initial DDP repo features model * fix(sqlmesh): plumb additional features into ddp repo metrics
1 parent 8d11b7e commit e8ac997

File tree

3 files changed

+268
-27
lines changed

3 files changed

+268
-27
lines changed
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
MODEL (
  name oso.int_ddp_repo_features,
  description "Developer Data Program repositories with features for clustering analysis",
  kind FULL,
  dialect trino,
  grain (artifact_id),
  tags (
    'entity_category=artifact'
  ),
  audits (
    has_at_least_n_rows(threshold := 0)
  )
);

-- Builds a wide feature table for DDP repositories: every row of
-- oso.int_ddp_repo_metadata is kept (all enrichments are LEFT JOINs) and
-- missing counts/flags are defaulted so downstream clustering does not have
-- to handle NULLs in those columns.
-- NOTE(review): the declared grain (artifact_id) holds only if each upstream
-- model joined on artifact_id has at most one row per artifact_id -- confirm
-- for int_ddp_repo_metadata, int_ddp_repo_lineage and
-- int_first_last_commit_to_github_repository.

-- Base metadata from DDP repos
WITH base_metadata AS (
  SELECT
    artifact_id,
    artifact_namespace,
    artifact_name,
    artifact_url,
    is_fork,
    created_at,
    updated_at,
    star_count,
    fork_count,
    language,
    is_ethereum,
    is_evm_l1,
    is_evm_l2,
    is_evm_stack,
    is_solana,
    ecosystem_count,
    is_in_ossd,
    is_owner_in_ossd
  FROM oso.int_ddp_repo_metadata
),

-- Lineage information
lineage AS (
  SELECT
    artifact_id,
    is_current_url,
    has_aliases,
    alias_count
  FROM oso.int_ddp_repo_lineage
),

-- First/last commit timestamps
commit_times AS (
  SELECT
    artifact_id,
    first_commit_time AS first_commit,
    last_commit_time AS last_commit
  FROM oso.int_first_last_commit_to_github_repository
),

-- Count repos per maintainer (using artifact_namespace).
-- Reads from base_metadata so the count is taken over exactly the same set of
-- repositories that this model outputs.
repos_per_maintainer AS (
  SELECT
    artifact_namespace,
    COUNT(DISTINCT artifact_id) AS num_repos_owned_by_maintainer
  FROM base_metadata
  GROUP BY artifact_namespace
),

-- Pivot metrics from int_ddp_repo_metrics (over all time).
-- MAX(CASE ...) picks the amount for each metric_model; a repo with no row
-- for a given metric_model gets NULL here and is defaulted to 0 in `joined`.
pivoted_metrics AS (
  SELECT
    artifact_id,
    MAX(CASE WHEN metric_model = 'contributors' THEN metric_amount END) AS contributor_count,
    MAX(CASE WHEN metric_model = 'commits' THEN metric_amount END) AS commit_count,
    MAX(CASE WHEN metric_model = 'forks' THEN metric_amount END) AS fork_count,
    MAX(CASE WHEN metric_model = 'stars' THEN metric_amount END) AS star_count,
    MAX(CASE WHEN metric_model = 'opened_issues' THEN metric_amount END) AS opened_issue_count,
    MAX(CASE WHEN metric_model = 'closed_issues' THEN metric_amount END) AS closed_issue_count,
    MAX(CASE WHEN metric_model = 'opened_pull_requests' THEN metric_amount END) AS opened_pull_request_count,
    MAX(CASE WHEN metric_model = 'merged_pull_requests' THEN metric_amount END) AS merged_pull_request_count,
    MAX(CASE WHEN metric_model = 'releases' THEN metric_amount END) AS release_count,
    MAX(CASE WHEN metric_model = 'comments' THEN metric_amount END) AS comment_count
  FROM oso.int_ddp_repo_metrics
  GROUP BY artifact_id
),

-- Check for packages owned by this repository.
-- Every group produced here has at least one package row, so has_packages is
-- a constant TRUE; repos without packages are absent and default to FALSE in
-- `joined`.
repo_packages AS (
  SELECT
    package_owner_artifact_id AS artifact_id,
    COUNT(DISTINCT package_artifact_id) AS package_count,
    CAST(TRUE AS BOOLEAN) AS has_packages
  FROM oso.int_packages__current_maintainer_only
  GROUP BY package_owner_artifact_id
),

-- Distinct GitHub user names.
-- The grain of int_github_users is not visible from this model; deduplicating
-- the join key guarantees the LEFT JOIN below can never multiply repo rows.
github_user_names AS (
  SELECT DISTINCT
    artifact_name
  FROM oso.int_github_users
),

-- Check if repo maintainer is a personal account or organization
-- If the artifact_namespace (repo owner) is in the GitHub users table, it's a personal repo
personal_repo_check AS (
  SELECT
    bm.artifact_id,
    (gu.artifact_name IS NOT NULL) AS is_personal_repo
  FROM base_metadata AS bm
  LEFT JOIN github_user_names AS gu
    ON bm.artifact_namespace = gu.artifact_name
),

joined AS (
  -- Join all models
  SELECT
    bm.artifact_id,
    bm.artifact_url AS url,
    bm.artifact_namespace AS repo_maintainer,
    bm.artifact_name AS repo_name,
    COALESCE(prc.is_personal_repo, FALSE) AS is_personal_repo,
    COALESCE(rpm.num_repos_owned_by_maintainer, 0) AS num_repos_owned_by_maintainer,
    -- Repos with no lineage row are treated as current with no aliases
    COALESCE(lin.is_current_url, TRUE) AS is_current_repo,
    COALESCE(lin.alias_count, 0) AS alias_count,
    COALESCE(bm.language, 'Unknown') AS language,
    CASE
      WHEN bm.is_fork THEN 'Yes'
      WHEN bm.is_fork IS NULL THEN 'Unknown'
      ELSE 'No'
    END AS fork_status,
    -- Prefer repository metadata timestamps, fall back to commit timestamps.
    -- age_months is derived from these two columns in the final SELECT.
    COALESCE(bm.created_at, ct.first_commit) AS first_activity_time,
    COALESCE(bm.updated_at, ct.last_commit) AS last_activity_time,
    COALESCE(pm.contributor_count, 0) AS contributor_count,
    COALESCE(pm.commit_count, 0) AS commit_count,
    -- Prefer the metrics value, then the metadata value, then 0
    COALESCE(pm.fork_count, bm.fork_count, 0) AS fork_count,
    COALESCE(pm.star_count, bm.star_count, 0) AS star_count,
    COALESCE(pm.opened_issue_count, 0) AS opened_issue_count,
    COALESCE(pm.closed_issue_count, 0) AS closed_issue_count,
    COALESCE(pm.opened_pull_request_count, 0) AS opened_pull_request_count,
    COALESCE(pm.merged_pull_request_count, 0) AS merged_pull_request_count,
    COALESCE(pm.release_count, 0) AS release_count,
    COALESCE(pm.comment_count, 0) AS comment_count,
    COALESCE(rp.has_packages, FALSE) AS has_packages,
    COALESCE(rp.package_count, 0) AS package_count,
    -- Additional useful metadata
    bm.is_ethereum,
    bm.is_evm_l1,
    bm.is_evm_l2,
    bm.is_evm_stack,
    bm.is_solana,
    bm.ecosystem_count,
    bm.is_in_ossd,
    bm.is_owner_in_ossd
  FROM base_metadata AS bm
  LEFT JOIN lineage AS lin
    ON bm.artifact_id = lin.artifact_id
  LEFT JOIN commit_times AS ct
    ON bm.artifact_id = ct.artifact_id
  LEFT JOIN repos_per_maintainer AS rpm
    ON bm.artifact_namespace = rpm.artifact_namespace
  LEFT JOIN pivoted_metrics AS pm
    ON bm.artifact_id = pm.artifact_id
  LEFT JOIN repo_packages AS rp
    ON bm.artifact_id = rp.artifact_id
  LEFT JOIN personal_repo_check AS prc
    ON bm.artifact_id = prc.artifact_id
)

SELECT
  artifact_id,
  url,
  repo_maintainer,
  repo_name,
  is_personal_repo,
  num_repos_owned_by_maintainer,
  is_current_repo,
  alias_count,
  language,
  fork_status,
  first_activity_time,
  last_activity_time,
  -- Calculate age in months from first to last activity
  -- (NULL when either endpoint is unknown)
  CASE
    WHEN first_activity_time IS NOT NULL AND last_activity_time IS NOT NULL
      THEN DATE_DIFF('month', first_activity_time, last_activity_time)
    ELSE NULL
  END AS age_months,
  contributor_count,
  commit_count,
  fork_count,
  star_count,
  opened_issue_count,
  closed_issue_count,
  opened_pull_request_count,
  merged_pull_request_count,
  release_count,
  comment_count,
  has_packages,
  package_count,
  is_ethereum,
  is_evm_l1,
  is_evm_l2,
  is_evm_stack,
  is_solana,
  ecosystem_count,
  is_in_ossd,
  is_owner_in_ossd
FROM joined

warehouse/oso_sqlmesh/models/intermediate/domain-specific/ddp/int_ddp_repo_metadata.sql

Lines changed: 54 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,11 @@ WITH ossd AS (
2929
FROM oso.int_ddp_ossd_repos
3030
),
3131

32+
ossd_namespaces AS (
33+
SELECT DISTINCT artifact_namespace
34+
FROM ossd
35+
),
36+
3237
-- Filter to relevant crypto ecosystem projects and get artifact metadata
3338
ec_artifacts AS (
3439
SELECT DISTINCT
@@ -81,32 +86,55 @@ all_artifact_ids AS (
8186
SELECT artifact_id FROM ossd
8287
UNION
8388
SELECT artifact_id FROM ec_artifacts
89+
),
90+
91+
joined AS (
92+
SELECT
93+
a.artifact_id,
94+
-- Prefer OSSD metadata, fall back to crypto ecosystems metadata
95+
COALESCE(o.artifact_namespace, e.artifact_namespace) AS artifact_namespace,
96+
COALESCE(o.artifact_name, e.artifact_name) AS artifact_name,
97+
COALESCE(o.artifact_url, e.artifact_url) AS artifact_url,
98+
-- Repository metadata (only available from OSSD)
99+
o.is_fork,
100+
o.star_count,
101+
o.fork_count,
102+
o.language,
103+
o.created_at,
104+
o.updated_at,
105+
-- Ecosystem membership flags
106+
p.is_ethereum,
107+
p.is_evm_l1,
108+
p.is_evm_l2,
109+
p.is_evm_stack,
110+
p.is_solana,
111+
p.ecosystem_count
112+
FROM all_artifact_ids AS a
113+
LEFT JOIN ossd AS o
114+
ON a.artifact_id = o.artifact_id
115+
LEFT JOIN ec_artifacts AS e
116+
ON a.artifact_id = e.artifact_id
117+
LEFT JOIN ec_pivot AS p
118+
ON a.artifact_id = p.artifact_id
84119
)
85120

86121
SELECT
87-
a.artifact_id,
88-
-- Prefer OSSD metadata, fall back to crypto ecosystems metadata
89-
COALESCE(o.artifact_namespace, e.artifact_namespace) AS artifact_namespace,
90-
COALESCE(o.artifact_name, e.artifact_name) AS artifact_name,
91-
COALESCE(o.artifact_url, e.artifact_url) AS artifact_url,
92-
-- Repository metadata (only available from OSSD)
93-
o.is_fork,
94-
o.star_count,
95-
o.fork_count,
96-
o.language,
97-
o.created_at,
98-
o.updated_at,
99-
-- Ecosystem membership flags
100-
p.is_ethereum,
101-
p.is_evm_l1,
102-
p.is_evm_l2,
103-
p.is_evm_stack,
104-
p.is_solana,
105-
p.ecosystem_count
106-
FROM all_artifact_ids AS a
107-
LEFT JOIN ossd AS o
108-
ON a.artifact_id = o.artifact_id
109-
LEFT JOIN ec_artifacts AS e
110-
ON a.artifact_id = e.artifact_id
111-
LEFT JOIN ec_pivot AS p
112-
ON a.artifact_id = p.artifact_id
122+
artifact_id,
123+
artifact_namespace,
124+
artifact_name,
125+
artifact_url,
126+
is_fork,
127+
star_count,
128+
fork_count,
129+
language,
130+
created_at,
131+
updated_at,
132+
is_ethereum,
133+
is_evm_l1,
134+
is_evm_l2,
135+
is_evm_stack,
136+
is_solana,
137+
ecosystem_count,
138+
(artifact_id IN (SELECT artifact_id FROM ossd)) AS is_in_ossd,
139+
(artifact_namespace IN (SELECT artifact_namespace FROM ossd_namespaces)) AS is_owner_in_ossd
140+
FROM joined

warehouse/oso_sqlmesh/models/intermediate/domain-specific/ddp/int_ddp_repo_metrics.sql

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,16 @@ SELECT
2323
km.amount AS metric_amount
2424
FROM oso.int_ddp_repo_metadata AS a
2525
JOIN oso.key_metrics_by_artifact_v0 AS km USING (artifact_id)
26-
JOIN oso.metrics_v0 AS m USING (metric_id)
26+
JOIN oso.metrics_v0 AS m USING (metric_id)
27+
WHERE m.metric_model IN (
28+
'contributors',
29+
'commits',
30+
'forks',
31+
'stars',
32+
'opened_issues',
33+
'closed_issues',
34+
'opened_pull_requests',
35+
'merged_pull_requests',
36+
'releases',
37+
'comments'
38+
)

0 commit comments

Comments
 (0)