Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 42 additions & 29 deletions augur/application/db/data_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,23 +644,34 @@ def extract_needed_message_data(comment: dict, platform_id: int, repo_id: int, t

return dict_data

def _extract_base_contributor_data(contributor_data, tool_source, tool_version, data_source):
"""
Helper function to extract common contributor data.
"""
return {
"cntrb_created_at": contributor_data.get('created_at'),
"cntrb_email": contributor_data.get('email'),
"cntrb_company": contributor_data.get('company'),
"cntrb_location": contributor_data.get('location'),
"cntrb_canonical": contributor_data.get('email'),
"tool_source": tool_source,
"tool_version": tool_version,
"data_source": data_source
}

def extract_needed_contributor_data(contributor, tool_source, tool_version, data_source):

if not contributor:
return None

cntrb_id = GithubUUID()
cntrb_id["user"] = contributor["id"]

base_data = _extract_base_contributor_data(contributor, tool_source, tool_version, data_source)

contributor = {
specific_data = {
"cntrb_id": cntrb_id.to_UUID(),
"cntrb_login": contributor['login'],
"cntrb_created_at": contributor['created_at'] if 'created_at' in contributor else None,
"cntrb_email": contributor['email'] if 'email' in contributor else None,
"cntrb_company": contributor['company'] if 'company' in contributor else None,
"cntrb_location": contributor['location'] if 'location' in contributor else None,
# "cntrb_type": , dont have a use for this as of now ... let it default to null
"cntrb_canonical": contributor['email'] if 'email' in contributor else None,
"gh_user_id": contributor['id'],
"gh_login": str(contributor['login']), ## cast as string by SPG on 11/28/2021 due to `nan` user
"gh_url": contributor['url'],
Expand All @@ -679,14 +690,13 @@ def extract_needed_contributor_data(contributor, tool_source, tool_version, data
"gh_received_events_url": contributor['received_events_url'],
"gh_type": contributor['type'],
"gh_site_admin": contributor['site_admin'],
"cntrb_last_used" : None if 'updated_at' not in contributor else contributor['updated_at'],
"cntrb_full_name" : None if 'name' not in contributor else contributor['name'],
"tool_source": tool_source,
"tool_version": tool_version,
"data_source": data_source
"cntrb_last_used" : contributor.get('updated_at'),
"cntrb_full_name" : contributor.get('name'),
"platform": "github",
"platform_username": contributor['login']
}

return contributor
return {**base_data, **specific_data}

def extract_needed_gitlab_contributor_data(contributor, tool_source, tool_version, data_source):

Expand All @@ -696,21 +706,19 @@ def extract_needed_gitlab_contributor_data(contributor, tool_source, tool_versio
cntrb_id = GitlabUUID()
cntrb_id["user"] = contributor["id"]

contributor = {
base_data = _extract_base_contributor_data(contributor, tool_source, tool_version, data_source)

specific_data = {
"cntrb_id": cntrb_id.to_UUID(),
"cntrb_login": contributor['username'],
"cntrb_created_at": contributor['created_at'] if 'created_at' in contributor else None,
"cntrb_email": contributor['email'] if 'email' in contributor else None,
"cntrb_company": contributor['company'] if 'company' in contributor else None,
"cntrb_location": contributor['location'] if 'location' in contributor else None,
# "cntrb_type": , dont have a use for this as of now ... let it default to null
"cntrb_canonical": contributor['email'] if 'email' in contributor else None,
"gh_user_id": contributor['id'],
"gh_login": str(contributor['username']), ## cast as string by SPG on 11/28/2021 due to `nan` user
"gh_url": contributor['web_url'],

# Nullify GitHub columns
"gh_user_id": None,
"gh_login": None,
"gh_url": None,
"gh_html_url": None,
"gh_node_id": None,
"gh_avatar_url": contributor['avatar_url'],
"gh_avatar_url": None,
"gh_gravatar_id": None,
"gh_followers_url": None,
"gh_following_url": None,
Expand All @@ -723,14 +731,19 @@ def extract_needed_gitlab_contributor_data(contributor, tool_source, tool_versio
"gh_received_events_url": None,
"gh_type": None,
"gh_site_admin": None,

"cntrb_last_used" : None,
"cntrb_full_name" : None,
"tool_source": tool_source,
"tool_version": tool_version,
"data_source": data_source

"platform": "gitlab",
"platform_username": contributor['username'],
"gl_username": contributor['username'],
"gl_id": contributor['id'],
"gl_web_url": contributor['web_url'],
"gl_avatar_url": contributor['avatar_url']
}

return contributor
return {**base_data, **specific_data}


def extract_needed_clone_history_data(clone_history_data:List[dict], repo_id:int):
Expand Down
8 changes: 8 additions & 0 deletions augur/application/db/models/augur_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,12 @@ class Contributor(Base):
String,
comment="Will be a double population with the same value as gh_login for github, but the local value for other systems. ",
)
platform = Column(
String,
server_default=text("'github'::character varying"),
nullable=False
)
platform_username = Column(String)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is platform username for?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

platform_username is a platform-agnostic column intended to serve as the unified source of truth for a contributor's handle/login, regardless of source.

A row with platform='github' stores the login in platform_username; a row with platform='gitlab' also stores the login in platform_username.

This decouples our data model from specific platforms. When we add Forgejo or Bitbucket later, we won't need to add forgejo_username columns; we will just use platform='forgejo' and platform_username.

cntrb_email = Column(
String,
comment="This needs to be here for matching contributor ids, which are augur, to the commit information. ",
Expand Down Expand Up @@ -280,6 +286,8 @@ def from_github(cls, contributor, tool_source, tool_version, data_source):

contributor_obj.cntrb_id = cntrb_id.to_UUID()
contributor_obj.cntrb_login = contributor['login']
contributor_obj.platform = 'github'
contributor_obj.platform_username = contributor['login']
contributor_obj.cntrb_created_at = contributor['created_at'] if 'created_at' in contributor else None
contributor_obj.cntrb_email = contributor['email'] if 'email' in contributor else None
contributor_obj.cntrb_company = contributor['company'] if 'company' in contributor else None
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""add platform data to contributors

Revision ID: 39
Revises: 38
Create Date: 2026-01-16 14:00:00.000000

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.sql import text


# revision identifiers, used by Alembic.
# `revision` is this migration's ID and `down_revision` is the migration it
# revises (matches "Revision ID: 39" / "Revises: 38" in the module docstring).
revision = '39'
down_revision = '38'
# No branch labels or cross-revision dependencies are declared.
branch_labels = None
depends_on = None


def upgrade():
    """
    Add the ``platform`` and ``platform_username`` columns to
    ``augur_data.contributors`` and backfill ``platform_username``.

    ``platform`` is added as NOT NULL with a server default of ``'github'``.
    ``platform_username`` is nullable and is then populated from ``gh_login``
    for every row whose platform is ``'github'``.
    """
    # Add columns to augur_data.contributors
    op.add_column(
        'contributors',
        sa.Column('platform', sa.String(), nullable=False, server_default='github'),
        schema='augur_data',
    )
    op.add_column(
        'contributors',
        sa.Column('platform_username', sa.String(), nullable=True),
        schema='augur_data',
    )

    # Backfill platform_username from gh_login for the default 'github' platform entries.
    # op.execute() is used instead of op.get_bind().execute(): get_bind() is None
    # when Alembic runs in offline (--sql) mode, while op.execute() emits the
    # statement in both online and offline mode.
    op.execute(
        text("UPDATE augur_data.contributors SET platform_username = gh_login WHERE platform = 'github'")
    )


def downgrade():
    """
    Undo this revision by dropping the two platform columns from
    ``augur_data.contributors``.
    """
    # Drop order is kept: platform_username first, then platform.
    for column_name in ('platform_username', 'platform'):
        op.drop_column('contributors', column_name, schema='augur_data')
2 changes: 2 additions & 0 deletions augur/tasks/github/facade_github/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ def query_github_contributors(logger, key_auth, github_url):
cntrb = {
"cntrb_id" : cntrb_id.to_UUID(),
"cntrb_login": contributor['login'],
"platform": "github",
"platform_username": contributor['login'],
"cntrb_created_at": contributor['created_at'],
"cntrb_email": email,
"cntrb_company": company,
Expand Down
Loading