|
MODEL (
  name oso.int_ddp_repo_features,
  description "Developer Data Program repositories with features for clustering analysis",
  kind FULL,
  dialect trino,
  grain (artifact_id),
  tags (
    'entity_category=artifact'
  ),
  audits (
    has_at_least_n_rows(threshold := 0)
  )
);

-- Builds one feature row per DDP repository by enriching the repo metadata
-- with lineage, commit timestamps, owner-level repo counts, pivoted activity
-- metrics, package ownership, and a personal-vs-organization owner flag.
-- Every enrichment is LEFT JOINed, so repos missing from an upstream model
-- are kept and receive the defaults applied in `joined`.

-- Base metadata from DDP repos (drives the output row set)
WITH base_metadata AS (
  SELECT
    artifact_id,
    artifact_namespace,
    artifact_name,
    artifact_url,
    is_fork,
    created_at,
    updated_at,
    star_count,
    fork_count,
    language,
    is_ethereum,
    is_evm_l1,
    is_evm_l2,
    is_evm_stack,
    is_solana,
    ecosystem_count,
    is_in_ossd,
    is_owner_in_ossd
  FROM oso.int_ddp_repo_metadata
),

-- Lineage information
-- NOTE(review): assumes int_ddp_repo_lineage has at most one row per
-- artifact_id; otherwise this join would duplicate repos -- confirm.
lineage AS (
  SELECT
    artifact_id,
    is_current_url,
    alias_count
  FROM oso.int_ddp_repo_lineage
),

-- First/last commit timestamps, used only as a fallback when the repo
-- metadata has no created_at / updated_at
commit_times AS (
  SELECT
    artifact_id,
    first_commit_time AS first_commit,
    last_commit_time AS last_commit
  FROM oso.int_first_last_commit_to_github_repository
),

-- Count repos per maintainer (using artifact_namespace).
-- Reads from base_metadata so the count is taken over exactly the same
-- repo set that drives the output.
repos_per_maintainer AS (
  SELECT
    artifact_namespace,
    COUNT(DISTINCT artifact_id) AS num_repos_owned_by_maintainer
  FROM base_metadata
  GROUP BY artifact_namespace
),

-- Pivot metrics from int_ddp_repo_metrics into one column per metric_model.
-- MAX keeps the largest metric_amount seen for each (artifact_id, metric_model).
-- NOTE(review): presumably the upstream model holds a single all-time value
-- per repo and metric; if it is time-bucketed, MAX is the peak bucket rather
-- than a total -- confirm.
pivoted_metrics AS (
  SELECT
    artifact_id,
    MAX(CASE WHEN metric_model = 'contributors' THEN metric_amount END) AS contributor_count,
    MAX(CASE WHEN metric_model = 'commits' THEN metric_amount END) AS commit_count,
    MAX(CASE WHEN metric_model = 'forks' THEN metric_amount END) AS fork_count,
    MAX(CASE WHEN metric_model = 'stars' THEN metric_amount END) AS star_count,
    MAX(CASE WHEN metric_model = 'opened_issues' THEN metric_amount END) AS opened_issue_count,
    MAX(CASE WHEN metric_model = 'closed_issues' THEN metric_amount END) AS closed_issue_count,
    MAX(CASE WHEN metric_model = 'opened_pull_requests' THEN metric_amount END) AS opened_pull_request_count,
    MAX(CASE WHEN metric_model = 'merged_pull_requests' THEN metric_amount END) AS merged_pull_request_count,
    MAX(CASE WHEN metric_model = 'releases' THEN metric_amount END) AS release_count,
    MAX(CASE WHEN metric_model = 'comments' THEN metric_amount END) AS comment_count
  FROM oso.int_ddp_repo_metrics
  GROUP BY artifact_id
),

-- Number of distinct packages owned by each repository.
-- A repo appears here only if it owns at least one package, so the mere
-- presence of a row is what `has_packages` is derived from downstream.
repo_packages AS (
  SELECT
    package_owner_artifact_id AS artifact_id,
    COUNT(DISTINCT package_artifact_id) AS package_count
  FROM oso.int_packages__current_maintainer_only
  GROUP BY package_owner_artifact_id
),

-- Distinct GitHub user names, used to decide whether a repo owner is a
-- personal account: if the artifact_namespace (repo owner) is in the GitHub
-- users table, it's a personal repo.
-- DISTINCT guarantees at most one match per owner, so the join in `joined`
-- cannot multiply repo rows even if int_github_users repeats a name.
github_user_logins AS (
  SELECT DISTINCT
    artifact_name AS user_login
  FROM oso.int_github_users
),

-- Join all models onto the base repo set and apply defaults for missing data
joined AS (
  SELECT
    bm.artifact_id,
    bm.artifact_url AS url,
    bm.artifact_namespace AS repo_maintainer,
    bm.artifact_name AS repo_name,
    (gul.user_login IS NOT NULL) AS is_personal_repo,
    COALESCE(rpm.num_repos_owned_by_maintainer, 0) AS num_repos_owned_by_maintainer,
    -- Repos without a lineage row are treated as current
    COALESCE(lin.is_current_url, TRUE) AS is_current_repo,
    COALESCE(lin.alias_count, 0) AS alias_count,
    COALESCE(bm.language, 'Unknown') AS language,
    CASE
      WHEN bm.is_fork THEN 'Yes'
      WHEN bm.is_fork IS NULL THEN 'Unknown'
      ELSE 'No'
    END AS fork_status,
    -- Prefer repo metadata timestamps; fall back to commit timestamps
    COALESCE(bm.created_at, ct.first_commit) AS first_activity_time,
    COALESCE(bm.updated_at, ct.last_commit) AS last_activity_time,
    COALESCE(pm.contributor_count, 0) AS contributor_count,
    COALESCE(pm.commit_count, 0) AS commit_count,
    -- Forks/stars prefer the metrics model, then the metadata snapshot, then 0
    COALESCE(pm.fork_count, bm.fork_count, 0) AS fork_count,
    COALESCE(pm.star_count, bm.star_count, 0) AS star_count,
    COALESCE(pm.opened_issue_count, 0) AS opened_issue_count,
    COALESCE(pm.closed_issue_count, 0) AS closed_issue_count,
    COALESCE(pm.opened_pull_request_count, 0) AS opened_pull_request_count,
    COALESCE(pm.merged_pull_request_count, 0) AS merged_pull_request_count,
    COALESCE(pm.release_count, 0) AS release_count,
    COALESCE(pm.comment_count, 0) AS comment_count,
    (rp.artifact_id IS NOT NULL) AS has_packages,
    COALESCE(rp.package_count, 0) AS package_count,
    -- Additional useful metadata
    bm.is_ethereum,
    bm.is_evm_l1,
    bm.is_evm_l2,
    bm.is_evm_stack,
    bm.is_solana,
    bm.ecosystem_count,
    bm.is_in_ossd,
    bm.is_owner_in_ossd
  FROM base_metadata AS bm
  LEFT JOIN lineage AS lin
    ON bm.artifact_id = lin.artifact_id
  LEFT JOIN commit_times AS ct
    ON bm.artifact_id = ct.artifact_id
  LEFT JOIN repos_per_maintainer AS rpm
    ON bm.artifact_namespace = rpm.artifact_namespace
  LEFT JOIN pivoted_metrics AS pm
    ON bm.artifact_id = pm.artifact_id
  LEFT JOIN repo_packages AS rp
    ON bm.artifact_id = rp.artifact_id
  LEFT JOIN github_user_logins AS gul
    ON bm.artifact_namespace = gul.user_login
)

SELECT
  artifact_id,
  url,
  repo_maintainer,
  repo_name,
  is_personal_repo,
  num_repos_owned_by_maintainer,
  is_current_repo,
  alias_count,
  language,
  fork_status,
  first_activity_time,
  last_activity_time,
  -- Age in months from first to last activity.
  -- DATE_DIFF yields NULL when either timestamp is NULL, so no explicit
  -- NULL guard is needed.
  DATE_DIFF('month', first_activity_time, last_activity_time) AS age_months,
  contributor_count,
  commit_count,
  fork_count,
  star_count,
  opened_issue_count,
  closed_issue_count,
  opened_pull_request_count,
  merged_pull_request_count,
  release_count,
  comment_count,
  has_packages,
  package_count,
  is_ethereum,
  is_evm_l1,
  is_evm_l2,
  is_evm_stack,
  is_solana,
  ecosystem_count,
  is_in_ossd,
  is_owner_in_ossd
FROM joined
0 commit comments