diff --git a/.gitignore b/.gitignore index 33deef77..b6d71c26 100755 --- a/.gitignore +++ b/.gitignore @@ -101,10 +101,17 @@ WORKLOG.md payload.json ___* gfedb.zip +reports/ +source-config-v*.json +test.asl.json +_cache*/ +ssm-describe*.json gfe-db/pipeline/jobs/build/event.json gfe-db/pipeline/statemachines/test* reports/ output-payload.json +*bkp* +execution-state.json # Local Deployment -gfe-db/local/neo4j \ No newline at end of file +gfe-db/local/neo4j diff --git a/Makefile b/Makefile index f90970e9..cbf957a1 100644 --- a/Makefile +++ b/Makefile @@ -37,6 +37,7 @@ export DATABASE_VOLUME_SIZE ?= 64 # Resource identifiers export DATA_BUCKET_NAME ?= ${STAGE}-${APP_NAME}-${AWS_ACCOUNT}-${AWS_REGION} +export CONFIG_S3_PATH := config export ECR_BASE_URI := ${AWS_ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com export BUILD_REPOSITORY_NAME ?= ${STAGE}-${APP_NAME}-build-service export EC2_KEY_PAIR_NAME := $${STAGE}-$${APP_NAME}-$${AWS_REGION}-neo4j-key @@ -48,10 +49,8 @@ export GITHUB_REPOSITORY_OWNER ?= ANHIG export GITHUB_REPOSITORY_NAME ?= IMGTHLA export FEATURE_SERVICE_URL ?= https://feature.b12x.org -# S3 paths -export PIPELINE_STATE_PATH = config/IMGTHLA-repository-state.json -export PIPELINE_PARAMS_PATH = config/pipeline-input.json -export FUNCTIONS_PATH = ${PIPELINE_DIR}/functions +# Neo4j +export NEO4J_DATABASE_NAME ?= gfedb # Neo4j export NEO4J_DATABASE_NAME ?= gfedb @@ -65,7 +64,7 @@ export CREATE_NEO4J_USERS ?= "gfedb:7b26OqomunEQvpPG" # Required environment variables REQUIRED_VARS := STAGE APP_NAME AWS_ACCOUNT AWS_REGION AWS_PROFILE SUBSCRIBE_EMAILS \ GITHUB_REPOSITORY_OWNER GITHUB_REPOSITORY_NAME GITHUB_PERSONAL_ACCESS_TOKEN \ - ADMIN_EMAIL NEO4J_PASSWORD GDS_VERSION + ADMIN_EMAIL NEO4J_PASSWORD GDS_VERSION PYTHON BOOLEAN_VARS := CREATE_VPC USE_PRIVATE_SUBNET DEPLOY_NAT_GATEWAY DEPLOY_BASTION_SERVER DEPLOY_VPC_ENDPOINTS SKIP_CHECK_DEPENDENCIES @@ -117,6 +116,7 @@ ifeq ($(SPLASH_FONT),slant) @echo "\033[0;34m \033[0m" endif + env.print: @echo "\033[0;33mReview the contents of the .env file:\033[0m" @echo "+---------------------------------------------------------------------------------+" @@ -140,9 +140,22 @@ deploy: splash-screen logs.purge env.validate ##=> Deploy all services $(MAKE) infrastructure.deploy $(MAKE) database.deploy $(MAKE) pipeline.deploy + $(MAKE) monitoring.create-topic-subscriptions topics="GfeDbExecutionResultTopicArn UpdatePipelineErrorsTopicArn" +ifeq ($(HAS_STAGE),null) + @sh -c '$(MAKE) pipeline.state.build && $(MAKE) pipeline.state.load || echo "Pipeline state build failed"' +endif @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Finished deploying ${APP_NAME}" 2>&1 | tee -a ${CFN_LOG_PATH} $(MAKE) options-screen +update: env.validate.stage env.validate + @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Updating ${APP_NAME} to ${AWS_ACCOUNT}" 2>&1 | tee -a ${CFN_LOG_PATH} + $(MAKE) env.print + @echo "Update stack in the \`${STAGE}\` environment? 
[y/N] \c " && read ans && [ $${ans:-N} = y ] + $(MAKE) infrastructure.deploy + $(MAKE) database.deploy + $(MAKE) pipeline.deploy + @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Finished updating ${APP_NAME}" 2>&1 | tee -a ${CFN_LOG_PATH} + logs.purge: logs.dirs ifeq ($(PURGE_LOGS),true) @rm ${LOGS_DIR}/cfn/*.txt @@ -392,9 +405,7 @@ infrastructure.service.deploy: $(MAKE) -C ${APP_NAME}/infrastructure/ service.deploy infrastructure.access-services.deploy: - $(MAKE) -C ${APP_NAME}/infrastructure/access-services/nat-gateway/ deploy - $(MAKE) -C ${APP_NAME}/infrastructure/access-services/bastion-server/ deploy - $(MAKE) -C ${APP_NAME}/infrastructure/access-services/vpc-endpoints/ deploy + $(MAKE) -C ${APP_NAME}/infrastructure/ service.access-services.deploy infrastructure.access-services.nat-gateway.deploy: $(MAKE) -C ${APP_NAME}/infrastructure/access-services/nat-gateway/ deploy @@ -408,11 +419,41 @@ infrastructure.access-services.bastion-server.connect: infrastructure.access-services.vpc-endpoints.deploy: $(MAKE) -C ${APP_NAME}/infrastructure/access-services/vpc-endpoints/ deploy +monitoring.create-topic-subscriptions: #=> topics= + @for topic in $$topics; do \ + $(MAKE) monitoring.subscribe-emails topic_ssm_param=$$topic; \ + done + monitoring.create-subscriptions: $(MAKE) -C ${APP_NAME}/infrastructure service.monitoring.create-subscriptions -monitoring.subscribe-email: - $(MAKE) -C ${APP_NAME}/infrastructure service.monitoring.subscribe-email +monitoring.subscribe-emails: #=> topic_ssm_param= + @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Creating SNS topic subscriptions for $$topic_ssm_param" 2>&1 | tee -a $${CFN_LOG_PATH} + @topic_arn=$$(aws ssm get-parameters \ + --names "/$${APP_NAME}/$${STAGE}/$${AWS_REGION}/$$topic_ssm_param" \ + --with-decryption \ + --query "Parameters[0].Value" \ + --output text) && \ + for EMAIL in $$(echo $${SUBSCRIBE_EMAILS} | sed 's/,/ /g'); do \ + res=$$(aws sns subscribe \ + --topic-arn "$$topic_arn" \ + --protocol email \ + --notification-endpoint "$$EMAIL") && \ + echo $$res | jq -r || \ + echo "\033[0;31mFailed to subscribe $$EMAIL to SNS topic\033[0m"; \ + done + +monitoring.subscribe-email: #=> topic_name= email= + @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Creating SNS topic subscription" 2>&1 | tee -a $${CFN_LOG_PATH} + @topic_arn=$$(aws ssm get-parameters \ + --names "/$${APP_NAME}/$${STAGE}/$${AWS_REGION}/"$$topic_name"Arn" \ + --with-decryption \ + --query "Parameters[0].Value" \ + --output text) && \ + aws sns subscribe \ + --topic-arn "$$topic_arn" \ + --protocol email \ + --notification-endpoint "$$email" 2>&1 | tee -a $${CFN_LOG_PATH} || true; database.deploy: $(MAKE) -C ${APP_NAME}/database/ deploy @@ -420,6 +461,9 @@ database.deploy: database.service.deploy: $(MAKE) -C ${APP_NAME}/database/ service.deploy +database.config.deploy: + $(MAKE) -C ${APP_NAME}/database/ service.config.deploy + database.connect: ifeq ($(USE_PRIVATE_SUBNET),true) $(MAKE) infrastructure.access-services.bastion-server.connect @@ -427,6 +471,9 @@ else $(MAKE) -C ${APP_NAME}/database/ service.connect endif +database.start-session: + $(MAKE) -C ${APP_NAME}/database/ service.start-session + database.ui.connect: ifeq ($(USE_PRIVATE_SUBNET),true) $(MAKE) -C ${APP_NAME}/infrastructure/access-services/bastion-server/ service.ui.connect @@ -435,23 +482,72 @@ else ifeq ($(USE_PRIVATE_SUBNET),false) endif pipeline.deploy: - $(MAKE) -C ${APP_NAME}/pipeline/ deploy - -pipeline.service.deploy: $(MAKE) -C ${APP_NAME}/pipeline/ service.deploy +pipeline.service.update: + $(MAKE) 
-C ${APP_NAME}/pipeline/ service.functions.deploy + pipeline.jobs.deploy: $(MAKE) -C ${APP_NAME}/pipeline/ service.jobs.deploy -config.deploy: +pipeline.config.deploy: $(MAKE) -C ${APP_NAME}/pipeline/ service.config.deploy - $(MAKE) -C ${APP_NAME}/database/ service.config.deploy + +pipeline.state.build: + $(MAKE) -C ${APP_NAME}/pipeline/ service.state.build + +pipeline.state.load: + $(MAKE) -C ${APP_NAME}/pipeline/ service.state.load + +pipeline.state.deploy: + $(MAKE) -C ${APP_NAME}/pipeline/ service.state.build + $(MAKE) -C ${APP_NAME}/pipeline/ service.state.load + +pipeline.statemachine.update-pipeline.stop: + $(MAKE) -C ${APP_NAME}/pipeline/ service.statemachine.update-pipeline.stop + +pipeline.statemachine.load-concurrency-manager.stop: + $(MAKE) -C ${APP_NAME}/pipeline/ service.statemachine.load-concurrency-manager.stop + +pipeline.queue.gfe-db-load.purge: + $(MAKE) -C ${APP_NAME}/pipeline/ service.queue.gfe-db-load.purge + +pipeline.queue.gfe-db-processing.purge: + $(MAKE) -C ${APP_NAME}/pipeline/ service.queue.gfe-db-processing.purge + +pipeline.abort: + @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Aborting pipeline execution" 2>&1 | tee -a $${CFN_LOG_PATH} + $(MAKE) -C ${APP_NAME}/pipeline/ service.statemachine.update-pipeline.stop + $(MAKE) -C ${APP_NAME}/pipeline/ service.statemachine.load-concurrency-manager.stop + @purge=$${purge:-false}; \ + if [ "$$purge" = "true" ]; then \ + echo "Purging queues..."; \ + $(MAKE) -C ${APP_NAME}/pipeline/ service.queue.gfe-db-load.purge; \ + $(MAKE) -C ${APP_NAME}/pipeline/ service.queue.gfe-db-processing.purge; \ + else \ + echo "\033[0;33mNote: SQS queues were not purged. To purge queues, run with 'purge=true'.\033[0m"; \ + fi + @echo "\033[0;33m*** Pipeline execution aborted ***\033[0m" + +pipeline.alarm.update-pipeline-execution.status: + $(MAKE) -C ${APP_NAME}/pipeline/ service.alarm.update-pipeline-execution.status + +pipeline.alarm.update-pipeline-execution.wait: + $(MAKE) -C ${APP_NAME}/pipeline/ service.alarm.update-pipeline-execution.wait + +config.deploy: + $(MAKE) database.config.deploy + $(MAKE) pipeline.config.deploy database.load.run: # args: align, kir, limit, releases + @res=$$($(MAKE) database.status) && \ + echo $$res | jq -r '.State' | grep -q 'stopped' && \ + echo "\033[0;31mERROR: Database is stopped. Please start the database before loading data.\033[0m" && \ + exit 1 || true @echo "Confirm payload:" && \ [ "$$align" ] && align="$$align" || align=false && \ [ "$$kir" ] && kir="$$kir" || kir=false && \ - [ "$$limit" ] && limit="$$limit" || limit="" && \ + [ "$$limit" ] && limit="$$limit" || limit=-1 && \ [ "$$releases" ] && releases="$$releases" || releases="" && \ [ "$$use_existing_build" ] && use_existing_build="$$use_existing_build" || use_existing_build=false && \ [ "$$skip_load" ] && skip_load="$$skip_load" || skip_load=false && \ @@ -459,7 +555,7 @@ database.load.run: # args: align, kir, limit, releases echo "$$payload" | jq -r && \ echo "$$payload" | jq > payload.json @echo "Run pipeline with this payload? [y/N] \c " && read ans && [ $${ans:-N} = y ] - @function_name="${STAGE}"-"${APP_NAME}"-"$$(cat ${FUNCTIONS_PATH}/environment.json | jq -r '.Functions.InvokePipeline.FunctionConfiguration.FunctionName')" && \ + @function_name="${STAGE}"-"${APP_NAME}"-"check-source-update" && \ echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Invoking $$function_name..." 
2>&1 | tee -a ${CFN_LOG_PATH} && \ echo "Payload:" >> ${CFN_LOG_PATH} && \ cat payload.json >> ${CFN_LOG_PATH} && \ @@ -476,7 +572,7 @@ database.load.run: # args: align, kir, limit, releases rm payload.json response.json pipeline.invoke.validation-queries: - @function_name="${STAGE}"-"${APP_NAME}"-"$$(cat ${FUNCTIONS_PATH}/environment.json | jq -r '.Functions.ExecuteValidationQueries.FunctionConfiguration.FunctionName')" && \ + @function_name="${STAGE}"-"${APP_NAME}"-"$$(cat ${PIPELINE_DIR}/functions/environment.json | jq -r '.Functions.ExecuteValidationQueries.FunctionConfiguration.FunctionName')" && \ echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Invoking $$function_name..." 2>&1 | tee -a ${CFN_LOG_PATH} && \ aws lambda invoke \ --cli-binary-format raw-in-base64-out \ @@ -513,10 +609,6 @@ database.reboot: @response=$$(aws ec2 reboot-instances --instance-ids ${INSTANCE_ID}) && echo "$$response" $(MAKE) database.status -database.config.deploy: - @echo "Deploying \`neo4j.conf\` to $${APP_NAME} server..." - $(MAKE) -C ${APP_NAME}/database/ service.config.neo4j.deploy - database.sync-scripts: $(MAKE) -C ${APP_NAME}/database/ service.config.scripts.sync @@ -546,7 +638,7 @@ database.restore: #from_path=s3:// database.status: @aws ec2 describe-instances | \ - jq --arg iid "${INSTANCE_ID}" '.Reservations[].Instances[] | (.InstanceId == $$iid) | {InstanceId, InstanceType, "Status": .State.Name, StateTransitionReason, ImageId}' + jq --arg iid "${INSTANCE_ID}" '.Reservations[].Instances[] | select(.InstanceId == $$iid) | {InstanceId: .InstanceId, State: .State.Name}' database.get.endpoint: ifeq ($(USE_PRIVATE_SUBNET),true) @@ -585,7 +677,7 @@ delete: # data=true/false ##=> Delete services @[[ $$data != true ]] && echo "Data will not be deleted. To delete pass \`data=true\`" || true @echo "Delete all stacks from the \`${STAGE}\` environment? [y/N] \c " && read ans && [ $${ans:-N} = y ] && \ if [ "${data}" = "true" ]; then \ - aws s3 rm --recursive s3://${DATA_BUCKET_NAME}; \ + aws s3 rm --recursive --quiet s3://${DATA_BUCKET_NAME}; \ fi $(MAKE) pipeline.delete $(MAKE) database.delete @@ -616,9 +708,6 @@ database.delete: pipeline.delete: $(MAKE) -C ${APP_NAME}/pipeline/ service.delete -pipeline.service.delete: - $(MAKE) -C ${APP_NAME}/pipeline/ service.functions.delete - pipeline.jobs.delete: $(MAKE) -C ${APP_NAME}/pipeline/ service.jobs.delete @@ -755,9 +844,6 @@ define HELP_MESSAGE PIPELINE_STATE_PATH: "${PIPELINE_STATE_PATH}" Description: S3 path to the pipeline state file - PIPELINE_PARAMS_PATH: "${PIPELINE_PARAMS_PATH}" - Description: S3 path to the pipeline parameters file - FUNCTIONS_PATH: "${FUNCTIONS_PATH}" Description: Path to the Lambda functions directory @@ -799,7 +885,7 @@ define HELP_MESSAGE $ make infrastructure.access-services.bastion-server.delete ...::: Create CloudWatch subscriptions :::... - $ make monitoring.create-subscriptions + $ make monitoring.subscribe-emails ...::: Subscribe an email to CloudWatch notifications :::... $ make monitoring.subscribe-email @@ -874,10 +960,7 @@ define HELP_MESSAGE $ make pipeline.delete ...::: Update only the pipeline CloudFormation including Lambda functions :::... - $ make pipeline.service.deploy - - ...::: Delete only the pipeline CloudFormation including Lambda functions :::... - $ make pipeline.service.delete + $ make pipeline.service.update ...::: Deploy the pipeline jobs as Docker images to ECR:::... 
$ make pipeline.jobs.deploy diff --git a/README.md b/README.md index 98e8cfa6..b10c29dd 100755 --- a/README.md +++ b/README.md @@ -89,9 +89,8 @@ Graph database representing IPD-IMGT/HLA sequence data as GFE.    │   │   └── template.yaml    │   ├── change-batch.json    │   └── template.yaml - # Docker Build layer -    ├── local -    │   ├── Dockerfile # Dockerfile for building the Neo4j Docker image +    ├── local # Docker Build layer +    │   ├── Dockerfile # Dockerfile for building the gfe-db Docker image    │   ├── Makefile    │   └── build-local.sh # Builds the Neo4j Docker image from backup and installs plugins # Data Pipeline layer @@ -359,7 +358,7 @@ STAGE= make database.deploy STAGE= make pipeline.deploy # Deploy or update only the pipeline serverless stack including the Lambda functions and state machine -STAGE= make pipeline.service.deploy +STAGE= make pipeline.service.update # Deploy or update only the Docker image for the build job STAGE= make pipeline.jobs.deploy @@ -852,7 +851,7 @@ STAGE= make docs.build ## Authors **Primary Contact:** Martin Maiers ([@mmaiers-nmdp](https://github.com/mmaiers-nmdp))\ **Contact:** Pradeep Bashyal ([@pbashyal-nmdp](https://github.com/pbashyal-nmdp))\ -**Contact:** Gregory Lindsey ([@chrisammon3000](https://github.com/chrisammon3000)) +**Contact:** Gregory Christopher Lindsey ([@chrisammon3000](https://github.com/chrisammon3000)) ## References & Links diff --git a/gfe-db/database/Makefile b/gfe-db/database/Makefile index 003ea57a..6715289d 100644 --- a/gfe-db/database/Makefile +++ b/gfe-db/database/Makefile @@ -10,23 +10,24 @@ deploy: $(MAKE) service.deploy service.config.scripts.deploy: - @script_s3_path=s3://$$DATA_BUCKET_NAME/config/scripts/ && \ - echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Deploying server scripts to $$script_s3_path" 2>&1 \ + @config_s3_path=s3://${DATA_BUCKET_NAME}/${CONFIG_S3_PATH}/${SERVICE}/scripts/ && \ + echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Deploying server scripts to $$config_s3_path" 2>&1 \ | tee -a $$CFN_LOG_PATH && \ - aws s3 cp --recursive --quiet scripts/ $$script_s3_path + aws s3 cp --recursive --quiet scripts/ $$config_s3_path service.config.scripts.sync: service.config.scripts.deploy service.config.neo4j.deploy @document_name="$$(aws ssm get-parameter \ - --name "/$${APP_NAME}/$${STAGE}/$${AWS_REGION}/DatabaseSyncScriptsDocumentName" | jq -r '.Parameter.Value')" && \ + --name "/${APP_NAME}/${STAGE}/${AWS_REGION}/DatabaseSyncScriptsDocumentName" | jq -r '.Parameter.Value')" && \ res=$$(aws ssm send-command \ --document-name "$$document_name" \ --targets "Key=instanceids,Values=$${INSTANCE_ID}" \ - --comment "${STAGE}-${APP_NAME} sync scripts to database server") && \ + --comment "${STAGE}-${APP_NAME} sync scripts to database server" \ + --cloud-watch-output-config '{"CloudWatchOutputEnabled":true}') && \ command_id=$$(echo "$$res" | jq -r '.Command.CommandId') && \ echo "\033[0;34mWaiting for command $$command_id to complete on server $${INSTANCE_ID}\033[0m" && \ aws ssm wait command-executed \ --command-id "$$command_id" \ - --instance-id "$${INSTANCE_ID}" && \ + --instance-id "${INSTANCE_ID}" && \ aws ssm get-command-invocation \ --command-id "$$command_id" \ --instance-id "$${INSTANCE_ID}" | jq -r '.StatusDetails' | tee -a $$CFN_LOG_PATH @@ -34,14 +35,14 @@ service.config.scripts.sync: service.config.scripts.deploy service.config.neo4j. 
service.config.cloudwatch-agent.deploy: @mkdir -p amazon-cloudwatch-agent/tmp/ @cat amazon-cloudwatch-agent/amazon-cloudwatch-agent.json | sed "s/STAGE/$${STAGE}/g" | sed "s/APP_NAME/$${APP_NAME}/g" > amazon-cloudwatch-agent/tmp/_amazon-cloudwatch-agent.json - @config_s3_path=s3://$$DATA_BUCKET_NAME/config/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json && \ + @config_s3_path=s3://${DATA_BUCKET_NAME}/${CONFIG_S3_PATH}/${SERVICE}/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json && \ echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Deploying amazon-cloudwatch-agent config to $$config_s3_path" 2>&1 \ | tee -a $$CFN_LOG_PATH && \ aws s3 cp amazon-cloudwatch-agent/tmp/_amazon-cloudwatch-agent.json $$config_s3_path @rm -rf amazon-cloudwatch-agent/tmp/ service.config.neo4j.deploy: - @aws s3 cp --recursive --quiet neo4j/cypher/ s3://$$DATA_BUCKET_NAME/config/neo4j/cypher/ 2>&1 | tee -a $$CFN_LOG_PATH + @aws s3 cp --recursive --quiet neo4j/cypher/ s3://${DATA_BUCKET_NAME}/${CONFIG_S3_PATH}/${SERVICE}/neo4j/cypher/ 2>&1 | tee -a $$CFN_LOG_PATH service.config.deploy: service.config.scripts.deploy service.config.cloudwatch-agent.deploy service.config.neo4j.deploy @@ -50,7 +51,8 @@ service.ssl.renew-cert: --document-name "AWS-RunShellScript" \ --parameters "commands=[cd /home/ec2-user && sudo make ssl.renew-cert]" \ --targets "Key=instanceids,Values=$${INSTANCE_ID}" \ - --comment "${STAGE}-${APP_NAME} SSL certificate renewal utility") && \ + --comment "${STAGE}-${APP_NAME} SSL certificate renewal utility" \ + --cloud-watch-output-config '{"CloudWatchOutputEnabled":true}') && \ command_id=$$(echo "$$res" | jq -r '.Command.CommandId') && \ aws ssm wait command-executed \ --command-id "$$command_id" \ @@ -76,20 +78,23 @@ service.deploy: Neo4jAmiId="$${NEO4J_AMI_ID}" \ Neo4jDatabaseName="$${NEO4J_DATABASE_NAME}" \ Neo4jPassword="$${NEO4J_PASSWORD}" \ + ServiceName="${SERVICE}" \ + ConfigS3Path="$${CONFIG_S3_PATH}" \ CreateNeo4jUsers="$${CREATE_NEO4J_USERS}" \ DatabaseVolumeSize="$${DATABASE_VOLUME_SIZE}" \ HostDomain="$${HOST_DOMAIN}" \ Subdomain="$${SUBDOMAIN}" \ AdminEmail="$${ADMIN_EMAIL}" \ APOCVersion="$${APOC_VERSION}" \ - GDSVersion="$${GDS_VERSION}" 2>&1 | tee -a $$CFN_LOG_PATH || true + GDSVersion="$${GDS_VERSION}" service.backup: @res=$$(aws ssm send-command \ --document-name "AWS-RunShellScript" \ --parameters "commands=[cd /home/ec2-user && sudo make neo4j.backup]" \ --targets "Key=instanceids,Values=$${INSTANCE_ID}" \ - --comment "${STAGE}=${APP_NAME} backup service") && \ + --comment "${STAGE}=${APP_NAME} backup service" \ + --cloud-watch-output-config '{"CloudWatchOutputEnabled":true}') && \ command_id=$$(echo "$$res" | jq -r '.Command.CommandId') && \ aws ssm wait command-executed \ --command-id "$$command_id" \ @@ -116,7 +121,8 @@ service.restore: #from_path=s3:// --document-name "AWS-RunShellScript" \ --parameters "commands=[cd /home/ec2-user && sudo make neo4j.restore from_path=$$from_path]" \ --targets "Key=instanceids,Values=$${INSTANCE_ID}" \ - --comment "${STAGE}=${APP_NAME} restore service") && \ + --comment "${STAGE}=${APP_NAME} restore service" \ + --cloud-watch-output-config '{"CloudWatchOutputEnabled":true}') && \ command_id=$$(echo "$$res" | jq -r '.Command.CommandId') && \ aws ssm wait command-executed \ --command-id "$$command_id" \ @@ -139,6 +145,9 @@ service.connect: echo "Connecting to Neo4j EC2 instance at $$instance_ip" && \ ssh -o "IdentitiesOnly yes" -tt -i ${ROOT_DIR}/${EC2_KEY_PAIR_NAME}.pem ec2-user@$$instance_ip +service.start-session: + @res=$$(aws ssm 
get-parameters --names "/${APP_NAME}/${STAGE}/${AWS_REGION}/Neo4jDatabaseInstanceId" | jq -r '.Parameters[0].Value') && \ + aws ssm start-session --target "$$res" delete: ##=> Delete resources @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Deleting ${SERVICE} service" 2>&1 | tee -a $$CFN_LOG_PATH diff --git a/gfe-db/database/scripts/Makefile b/gfe-db/database/scripts/Makefile index d0fe3ebe..635cc5b2 100644 --- a/gfe-db/database/scripts/Makefile +++ b/gfe-db/database/scripts/Makefile @@ -69,7 +69,6 @@ neo4j.config.backup: @echo "Backing up Neo4j configuration to ${NEO4J_HOME}/conf/$$(date -u +'%Y-%m-%d-%H-%M')-neo4j.conf.bkp" @cp "${NEO4J_HOME}/conf/neo4j.conf" "${NEO4J_HOME}/conf/$$(date -u +'%Y-%m-%d-%H-%M')-neo4j.conf.bkp" -# TODO check if the APOC plugin is already installed for the version specified and skip if it is # https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/download/5.13.0/apoc-5.13.0-extended.jar neo4j.plugins.install-apoc-extended: @rm -f ${NEO4J_HOME}/plugins/apoc-*.jar @@ -77,7 +76,6 @@ neo4j.plugins.install-apoc-extended: @curl -L https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/download/${APOC_VERSION}/apoc-${APOC_VERSION}-extended.jar -O @mv apoc-${APOC_VERSION}-extended.jar ${NEO4J_HOME}/plugins/apoc-${APOC_VERSION}-extended.jar -# TODO check if the GDS plugin is already installed for the version specified and skip if it is # https://graphdatascience.ninja/neo4j-graph-data-science-2.5.5.zip neo4j.plugins.install-gds: @rm -f ${NEO4J_HOME}/plugins/neo4j-graph-data-science-*.jar @@ -98,7 +96,7 @@ neo4j.update-permissions: neo4j.init.download-scripts: @sudo -u neo4j mkdir -p ${NEO4J_HOME}/backups ${NEO4J_HOME}/cypher @echo "Fetching Cypher scripts from S3..." - @aws s3 cp --recursive s3://${DATA_BUCKET_NAME}/config/neo4j/cypher/ ${NEO4J_HOME}/cypher/ + @aws s3 cp --recursive s3://${DATA_BUCKET_NAME}/${CONFIG_S3_PATH}/${SERVICE}/neo4j/cypher/ ${NEO4J_HOME}/cypher/ neo4j.query.init: # public: neo4j+s://${SUBDOMAIN}.${HOST_DOMAIN}:7687; private: bolt://:7687 @echo "Executing initialization queries" diff --git a/gfe-db/database/scripts/load_db.sh b/gfe-db/database/scripts/load_db.sh index 4b79b291..e507cde1 100644 --- a/gfe-db/database/scripts/load_db.sh +++ b/gfe-db/database/scripts/load_db.sh @@ -7,6 +7,20 @@ if [ -z $EC2_USER_HOME ]; then echo "ERROR: EC2_USER_HOME not set" exit 1 fi +if [ -z $APP_NAME ]; then + echo "ERROR: APP_NAME not set" + exit 1 +fi + +if [ -z $STAGE ]; then + echo "ERROR: STAGE not set" + exit 1 +fi + +if [ -z $SERVICE_NAME ]; then + echo "ERROR: SERVICE_NAME not set" + exit 1 +fi if [[ -z $NEO4J_HOME ]]; then echo "$(date -u +'%Y-%m-%d %H:%M:%S.%3N') - Neo4j not found" @@ -21,7 +35,9 @@ RELEASE=$1 # Set paths NEO4J_CYPHER_PATH=$NEO4J_HOME/cypher NEO4J_IMPORT_PATH=$NEO4J_HOME/import -S3_NEO4J_CYPHER_PATH=config/neo4j/cypher +S3_NEO4J_CYPHER_PATH=config/$SERVICE_NAME/neo4j/cypher + +# TODO Get from state payload S3_CSV_PATH=data/$RELEASE/csv if [[ -z $AWS_REGION ]]; then @@ -57,16 +73,25 @@ fi # Get most recent Cypher scripts echo "$(date -u +'%Y-%m-%d %H:%M:%S.%3N') - Fetching most recent Cypher scripts" -aws s3 cp --recursive s3://$DATA_BUCKET_NAME/$S3_NEO4J_CYPHER_PATH/ $NEO4J_CYPHER_PATH +aws s3 cp --recursive s3://$DATA_BUCKET_NAME/$S3_NEO4J_CYPHER_PATH/ $NEO4J_CYPHER_PATH --quiet +# check error status of aws s3 cp and abort if not zero +[ $? 
-eq 0 ] || exit 1 +# TODO validate file was downloaded, abort if not, so that a failure signal can be sent to Step Functions # Download data to NEO4J_HOME/import echo "$(date -u +'%Y-%m-%d %H:%M:%S.%3N') - Downloading CSV data for release $RELEASE" -aws s3 cp --recursive s3://$DATA_BUCKET_NAME/$S3_CSV_PATH/ $NEO4J_IMPORT_PATH/ +aws s3 cp --recursive s3://$DATA_BUCKET_NAME/$S3_CSV_PATH/ $NEO4J_IMPORT_PATH/ --quiet # Update Cypher load query for correct release +# TODO Change load.cyp to load.cyp.template mkdir -p $NEO4J_CYPHER_PATH/tmp/$RELEASE/ + +# TODO Use Cypher params for RELEASE instead of sed cat $NEO4J_CYPHER_PATH/load.cyp | sed "s/RELEASE/$RELEASE/g" > $NEO4J_CYPHER_PATH/tmp/$RELEASE/load.$RELEASE.cyp +# check error status of sed and abort if not zero +[ -f $NEO4J_CYPHER_PATH/tmp/$RELEASE/load.$RELEASE.cyp ] || exit 1 + echo "$(date -u +'%Y-%m-%d %H:%M:%S.%3N') - Executing query" echo "****** Begin Cypher ******" printf "$(cat $NEO4J_CYPHER_PATH/tmp/$RELEASE/load.$RELEASE.cyp)\n" diff --git a/gfe-db/database/scripts/send_heartbeat.sh b/gfe-db/database/scripts/send_heartbeat.sh deleted file mode 100644 index 2c9d33f2..00000000 --- a/gfe-db/database/scripts/send_heartbeat.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -x - -# This script sends heartbeats back to the StepFunctions API during the task execution. - -while [ 1 ] -do - echo "$(date -u +'%Y-%m-%d %H:%M:%S.%3N') - Sending task heartbeat" - aws stepfunctions send-task-heartbeat \ - --task-token "$TASK_TOKEN" \ - --region $AWS_REGION - - # Send TaskSuccess token to StepFunctions - if [[ $? != "0" ]]; then - exit 1 - fi - - # TODO exit if StepFunctions returns activity timeout - - sleep $HEARTBEAT_INTERVAL -done diff --git a/gfe-db/database/scripts/start_task.sh b/gfe-db/database/scripts/start_task.sh index 9046d2ec..ded3f94f 100644 --- a/gfe-db/database/scripts/start_task.sh +++ b/gfe-db/database/scripts/start_task.sh @@ -1,100 +1,52 @@ #!/bin/bash -x +# Exit immediately if a command exits with a non-zero status set -e -# TODO remove the application logic to make this script agnostic +ERR_MSG=null -# Send task failure if script errors -send_result () { - if [[ $status = "SUCCESS" ]]; then - echo "$(date -u +'%Y-%m-%d %H:%M:%S.%3N') - Sending task success" - aws stepfunctions send-task-success \ - --task-token "$TASK_TOKEN" \ - --task-output "{\"status\":\"$status\"}" \ - --region $AWS_REGION - else - echo "$(date -u +'%Y-%m-%d %H:%M:%S.%3N') - Sending task failure" - aws stepfunctions send-task-failure \ - --task-token "$TASK_TOKEN" \ - --cause "$cause" \ - --error "$error" \ - --region $AWS_REGION - fi -} +source /home/ec2-user/env.sh -trap 'cause="Script failed due to error on line $LINENO. Please see logs in System Manager Run Command history for more details" && error=$? 
&& send_result && kill 0' ERR - -export AWS_REGION=$(curl --silent http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r '.region') - -export PARAMS=$1 -if [[ -z $PARAMS ]]; then - echo "$(date -u +'%Y-%m-%d %H:%M:%S.%3N') - No parameters found" +if [[ -z $APP_NAME ]]; then + ERR_MSG="APP_NAME environment variable not set" + echo $ERR_MSG >&2 exit 1 -else - echo "$(date -u +'%Y-%m-%d %H:%M:%S.%3N') - Found parameters:" - echo "$PARAMS" fi -export ACTIVITY_ARN=$(echo $PARAMS | jq -r '.params.activity_arn') - -# TODO can now source APP_NAME and STAGE from env.sh -export APP_NAME=$(echo $PARAMS | jq -r '.params.app_name') - -echo "ACTIVITY_ARN=$ACTIVITY_ARN" -echo "APP_NAME=$APP_NAME" - -# Poll StepFunctions API for new activities -echo "$(date -u +'%Y-%m-%d %H:%M:%S.%3N') - Polling for new activities..." -export ACTIVITY=$(aws stepfunctions get-activity-task \ - --activity-arn $ACTIVITY_ARN \ - --worker-name $APP_NAME \ - --region $AWS_REGION) - -echo "$(date -u +'%Y-%m-%d %H:%M:%S.%3N') - Activity found" - -export TASK_TOKEN=$(echo $ACTIVITY | jq -r '.taskToken') -export TASK_INPUT=$(echo $ACTIVITY | jq -r '.input') +if [[ -z $STAGE ]]; then + ERR_MSG="STAGE environment variable not set" + echo $ERR_MSG >&2 + exit 1 +fi -echo "TASK_TOKEN=$TASK_TOKEN" -echo "TASK_INPUT=$TASK_INPUT" +# check that AWS_REGION is set from the environment +if [[ -z $AWS_REGION ]]; then + export AWS_REGION=$(curl --silent http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r '.region') +fi -export RELEASE=$(echo $TASK_INPUT | jq -r '.RELEASES') -export ALIGN=$(echo $TASK_INPUT | jq -r '.ALIGN') -export KIR=$(echo $TASK_INPUT | jq -r '.KIR') +# Check for load event argument from command line +if [[ -z $1 ]]; then + ERR_MSG="No load event provided" + echo "$(date -u +'%Y-%m-%d %H:%M:%S.%3N') - $ERR_MSG" >&2 + exit 1 +fi -echo "RELEASE=$RELEASE" -echo "ALIGN=$ALIGN" -echo "KIR=$KIR" +load_event=$1 +# Log the load event +echo "$(date -u +'%Y-%m-%d %H:%M:%S.%3N') - Load event: $load_event" -# Check for release argument -if [[ -z $RELEASE ]]; then - echo "$(date -u +'%Y-%m-%d %H:%M:%S.%3N') - Release version not found" - kill -1 $$ +release=$(echo "$load_event" | jq -r '.sqs.Body.input.version') +if [[ -z $release || "$release" == "null" || ! $release =~ ^[0-9]{1,4}$ ]]; then + ERR_MSG="Release version \"$release\" not found, is \"null\", or is not a 1-4 digit integer" + echo "$(date -u +'%Y-%m-%d %H:%M:%S.%3N') - $ERR_MSG" >&2 + exit 1 else - echo "$(date -u +'%Y-%m-%d %H:%M:%S.%3N') - Starting load process for $RELEASE" + echo "$(date -u +'%Y-%m-%d %H:%M:%S.%3N') - Starting load process for $release" fi -# TODO: parameterize heartbeat and set interval / 2 -export HEARTBEAT_INTERVAL=30 -bash send_heartbeat.sh & -send_heartbeat_pid=$! - # Run task - invoke load script -bash load_db.sh $RELEASE +bash load_db.sh $release TASK_EXIT_STATUS=$? -echo "$(date -u +'%Y-%m-%d %H:%M:%S.%3N') - Task exit status: $TASK_EXIT_STATUS" - -# Send TaskSuccess token to StepFunctions -if [[ $TASK_EXIT_STATUS != "0" ]]; then - status="FAILED" - error="$TASK_EXIT_STATUS" - cause="Task failed due to error on line $LINENO. Please see logs in System Manager Run Command history for more details." 
- send_result - kill 0 -else - status="SUCCESS" - send_result - kill $send_heartbeat_pid -fi -exit 0 +# Exit with the status of the load_db.sh script +exit $TASK_EXIT_STATUS \ No newline at end of file diff --git a/gfe-db/database/template.yaml b/gfe-db/database/template.yaml index a3f18866..5a8ff3d9 100644 --- a/gfe-db/database/template.yaml +++ b/gfe-db/database/template.yaml @@ -15,8 +15,8 @@ Parameters: usePrivateSubnet: Type: String AllowedValues: - - 'true' - - 'false' + - "true" + - "false" DataBucketName: Type: String Neo4jAmiId: @@ -45,13 +45,17 @@ Parameters: Type: String GDSVersion: Type: String + ConfigS3Path: + Type: String + ServiceName: + Type: String Conditions: UsePrivateSubnet: !Equals - !Ref usePrivateSubnet - - 'true' + - "true" UsePublicSubnet: !Equals - !Ref usePrivateSubnet - - 'false' + - "false" Resources: Neo4jCredentialsSecret: Type: AWS::SecretsManager::Secret @@ -70,11 +74,11 @@ Resources: Value: !Ref Neo4jCredentialsSecret Neo4jDatabaseInstance: Type: AWS::EC2::Instance - CreationPolicy: - ResourceSignal: - Timeout: PT5M + # CreationPolicy: + # ResourceSignal: + # Timeout: PT5M Properties: - KeyName: !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/EC2KeyPairName}}' + KeyName: !Sub "{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/EC2KeyPairName}}" DisableApiTermination: false ImageId: !Ref Neo4jAmiId InstanceType: m5.large @@ -89,8 +93,8 @@ Resources: - AssociatePublicIpAddress: false DeviceIndex: '0' GroupSet: - - !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/Neo4jDatabaseSecurityGroupId}}' - SubnetId: !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/Neo4jSubnetId}}' + - !Sub "{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/Neo4jDatabaseSecurityGroupId}}" + SubnetId: !Sub "{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/Neo4jSubnetId}}" UserData: !Base64 Fn::Sub: | #!/bin/bash -x @@ -160,7 +164,7 @@ Resources: echo "INFO: Installing CloudWatch Logs Agent" mkdir -p /usr/share/collectd/ touch /usr/share/collectd/types.db - aws s3 cp --quiet s3://${DataBucketName}/config/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json + aws s3 cp --quiet s3://${DataBucketName}/config/${ServiceName}/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json [ $(amazon-cloudwatch-agent-ctl -a status | jq -r '.status') = running ] && echo "INFO: CloudWatch Logs Agent is running" || \ (msg="ERROR: CloudWatch Logs Agent did not initialize correctly" && echo $msg && cfn_signal 1 "$msg" && exit 1) @@ -183,6 +187,8 @@ Resources: echo "NEO4J_HOME=$NEO4J_HOME" >> $EC2_USER_HOME/env.sh echo "STAGE=${Stage}" >> $EC2_USER_HOME/env.sh echo "APP_NAME=${AppName}" >> $EC2_USER_HOME/env.sh + echo "CONFIG_S3_PATH=${ConfigS3Path}" >> $EC2_USER_HOME/env.sh + echo "SERVICE_NAME=${ServiceName}" >> $EC2_USER_HOME/env.sh echo "USE_PRIVATE_SUBNET=$USE_PRIVATE_SUBNET" >> $EC2_USER_HOME/env.sh echo "AWS_REGION=us-east-1" >> $EC2_USER_HOME/env.sh echo "DATA_BUCKET_NAME=${DataBucketName}" >> $EC2_USER_HOME/env.sh @@ -332,7 +338,7 @@ Resources: echo "INFO: Initiliazing Neo4j database" install_neo4j echo "INFO: Downloading scripts" - aws s3 cp --quiet --recursive s3://${!DATA_BUCKET_NAME}/config/scripts/ $EC2_USER_HOME/ + aws s3 cp --quiet --recursive 
s3://${!DATA_BUCKET_NAME}/${!CONFIG_S3_PATH}/${!SERVICE_NAME}/scripts/ $EC2_USER_HOME/ cd $EC2_USER_HOME && make neo4j CREATE_NEO4J_USERS=${CreateNeo4jUsers} if [ ! -z "$CREATE_NEO4J_USERS" ]; then @@ -372,7 +378,7 @@ Resources: Policies: - PolicyName: !Sub ${Stage}-${AppName}-Neo4jDatabasePolicy PolicyDocument: - Version: '2012-10-17' + Version: "2012-10-17" Statement: - Effect: Allow Action: @@ -386,8 +392,8 @@ Resources: - s3:PutObjectAcl - s3:GetEncryptionConfiguration Resource: - - !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketArn}}' - - !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketArn}}/*' + - !Sub "{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketArn}}" + - !Sub "{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketArn}}/*" - Effect: Allow Action: - secretsmanager:GetResourcePolicy @@ -408,8 +414,10 @@ Resources: - states:SendTaskHeartbeat - states:SendTaskSuccess - states:SendTaskFailure + # TODO in order to restrict permissions, the activity should be deployed on the infra layer so it can be referenced + # (right now it's in the pipeline layer) Resource: - - '*' + - "*" - Effect: Allow Action: - ec2:AssociateAddress @@ -425,14 +433,6 @@ Resources: Properties: Roles: - !Ref Neo4jDatabaseInstanceRole - - # # TODO move to database init script - # UsePublicSubnetConditionNeo4jDatabaseElasticIpAssociation: - # Type: AWS::EC2::EIPAssociation - # Condition: UsePublicSubnet - # Properties: - # InstanceId: !Ref Neo4jDatabaseInstance - # AllocationId: !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/Neo4jDatabaseEndpointAllocationId}}' Neo4jDatabaseInstanceIdParameter: Type: AWS::SSM::Parameter Properties: @@ -481,7 +481,7 @@ Resources: TargetType: /AWS::EC2::Instance UpdateMethod: Replace Content: - schemaVersion: '2.2' + schemaVersion: "2.2" description: Neo4j to S3 backup parameters: commandLine: @@ -555,7 +555,7 @@ Resources: Values: - !Ref Neo4jDatabaseInstance WindowId: !Ref Neo4jBackupMaintenanceWindow - DatabaseSyncScriptsDocument: + DatabaseSyncScriptsDocument: Type: AWS::SSM::Document Properties: DocumentType: "Command" @@ -573,7 +573,7 @@ Resources: type: "StringMap" description: !Sub "Downloads all files under the ${AppName} scripts prefix" default: - path: !Sub 'https://{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketName}}.s3.amazonaws.com/config/scripts/' + path: !Sub 'https://{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketName}}.s3.amazonaws.com/config/${ServiceName}/scripts/' copyCypherScripts: type: "String" description: "Copies Cypher scripts to the database" @@ -581,7 +581,11 @@ Resources: setUserPermissions: type: "String" description: "Deletes ETag files created by SSM Agent and set user permissions." - default: 'rm *.etag && find . \( -name "*.sh" -o -name "Makefile" -o -name "init" \) -exec chown ec2-user {} \; && find . \( -name "*.sh" -o -name "Makefile" -o -name "init" \) -exec chgrp ec2-user {} \;' + default: 'find . \( -name "*.sh" -o -name "Makefile" -o -name "init" \) -exec chown ec2-user {} \; && find . \( -name "*.sh" -o -name "Makefile" -o -name "init" \) -exec chgrp ec2-user {} \;' + cleanUp: + type: "String" + description: "Deletes ETag files created by SSM Agent." 
+ default: 'rm *.etag 2>/dev/null || true' workingDirectory: type: "String" description: "Working directory" @@ -599,13 +603,14 @@ Resources: destinationPath: "{{ workingDirectory }}" - action: "aws:runShellScript" name: "runShellScript" - inputs: + inputs: runCommand: - "" - "directory=$(pwd)" - "export PATH=$PATH:$directory" - "cd {{ workingDirectory }} && {{ copyCypherScripts }}" - - " {{ setUserPermissions }} " + - " {{ setUserPermissions }} " + - "{{ cleanUp }}" - "" workingDirectory: "{{ workingDirectory }}" timeoutSeconds: "{{ executionTimeout }}" @@ -613,7 +618,26 @@ Resources: Type: AWS::SSM::Parameter Properties: Type: String - Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/DatabaseSyncScriptsDocumentName' + Name: !Sub "/${AppName}/${Stage}/${AWS::Region}/DatabaseSyncScriptsDocumentName" Description: "Name of SSM document for syncing shell scripts to the database" Value: !Ref DatabaseSyncScriptsDocument - \ No newline at end of file + GfedbDatabaseParamMappingsParameter: + Type: AWS::SSM::Parameter + Properties: + Type: String + Name: !Sub "/${AppName}/${Stage}/${AWS::Region}/GfedbDatabaseParamMappings" + Description: !Sub "SSM Parameter and SecretsManager parameter paths for ${AppName} database layer" + Value: !Sub | + { + "ssm": [ + "/${AppName}/${Stage}/${AWS::Region}/Neo4jCredentialsSecretArn", + "/${AppName}/${Stage}/${AWS::Region}/Neo4jDatabaseSecurityGroupName", + "/${AppName}/${Stage}/${AWS::Region}/Neo4jDatabaseInstanceId", + "/${AppName}/${Stage}/${AWS::Region}/Neo4jUri", + "/${AppName}/${Stage}/${AWS::Region}/Neo4jBackupDocumentName", + "/${AppName}/${Stage}/${AWS::Region}/Neo4jBackupMaintenanceWindowId" + ], + "secretsmanager": [ + "/${AppName}/${Stage}/${AWS::Region}/Neo4jCredentials" + ] + } diff --git a/gfe-db/infrastructure/Makefile b/gfe-db/infrastructure/Makefile index 5cc45773..94903c2c 100644 --- a/gfe-db/infrastructure/Makefile +++ b/gfe-db/infrastructure/Makefile @@ -43,7 +43,6 @@ ifeq ($(USE_PRIVATE_SUBNET),true) else ifeq ($(USE_PRIVATE_SUBNET),false) $(MAKE) service.deploy.update-dns endif - $(MAKE) service.monitoring.create-subscriptions service.deploy.update-dns: @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Updating DNS records" 2>&1 | tee -a $${CFN_LOG_PATH} @@ -71,14 +70,18 @@ service.key-pair.create: ##=> Checks if the key pair already exists and creates echo "Creating EC2 key pair \"${EC2_KEY_PAIR_NAME}\"" && \ aws ec2 create-key-pair --key-name ${EC2_KEY_PAIR_NAME} | jq -r '.KeyMaterial' > ${ROOT_DIR}/${EC2_KEY_PAIR_NAME}.pem && \ echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Storing key pair ${EC2_KEY_PAIR_NAME} in Secrets Manager" 2>&1 | tee -a $$CFN_LOG_PATH && \ - aws secretsmanager create-secret --name "/$${APP_NAME}/$${STAGE}/$${AWS_REGION}/EC2KeyPair" \ + res=$$(aws secretsmanager create-secret --name "/$${APP_NAME}/$${STAGE}/$${AWS_REGION}/EC2KeyPair" \ --description "EC2 key pair for $${STAGE}-$${APP_NAME} database" \ - --secret-string "$$(cat ${ROOT_DIR}/${EC2_KEY_PAIR_NAME}.pem)" 2>&1 | tee -a $$CFN_LOG_PATH || true && \ - aws ssm put-parameter \ + --secret-string "$$(cat ${ROOT_DIR}/${EC2_KEY_PAIR_NAME}.pem)") && \ + echo $$res | jq -r && \ + res=$$(aws ssm put-parameter \ --name "/$${APP_NAME}/$${STAGE}/$${AWS_REGION}/EC2KeyPairName" \ --type "String" \ --value "${EC2_KEY_PAIR_NAME}" \ - --overwrite 2>&1 | tee -a $$CFN_LOG_PATH || true + --overwrite) && \ + echo $$res | jq -r && \ + echo "\033[0;34mSuccessfully created EC2 key pair\033[0m" || \ + echo "\033[0;31mFailed to create EC2 key pair\033[0m" service.deploy: @aws 
cloudformation deploy \ @@ -192,20 +195,27 @@ service.access-services.vpc-endpoints.delete: $(MAKE) -C access-services/vpc-endpoints delete service.delete: - @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Deleting CloudFormation" 2>&1 | tee -a $${CFN_LOG_PATH} + @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Deleting ${SERVICE} CloudFormation" 2>&1 | tee -a $${CFN_LOG_PATH} @aws cloudformation delete-stack \ --stack-name "$${STAGE}-$${APP_NAME}-${SERVICE}" 2>&1 | tee -a $${CFN_LOG_PATH} || true && \ - aws cloudformation wait stack-delete-complete \ - --stack-name "$${STAGE}-$${APP_NAME}-${SERVICE}" 2>&1 | tee -a $${CFN_LOG_PATH} || true + res=$$(aws cloudformation wait stack-delete-complete \ + --stack-name "$${STAGE}-$${APP_NAME}-${SERVICE}") && \ + echo $$res | jq -r && \ + echo "\033[0;34mSuccessfully deleted ${SERVICE} CloudFormation\033[0m" || \ + echo "\033[0;31mFailed to delete ${SERVICE} CloudFormation\033[0m" service.key-pair.delete: - @aws ec2 delete-key-pair --key-name "${EC2_KEY_PAIR_NAME}" 2>&1 | tee -a $$CFN_LOG_PATH || true && \ + @res=$$(aws ec2 delete-key-pair --key-name "${EC2_KEY_PAIR_NAME}") && \ + echo $$res | jq -r && \ aws ssm delete-parameter --name "/$${APP_NAME}/$${STAGE}/$${AWS_REGION}/EC2KeyPairName" 2>&1 | tee -a $$CFN_LOG_PATH || true && \ mv ${ROOT_DIR}/${EC2_KEY_PAIR_NAME}.pem ${ROOT_DIR}/deprecated-key-$${INSTANCE_ID}-$$(gdate -u +'%Y-%m-%d-%H-%M').pem || true && \ echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Deleting key pair ${EC2_KEY_PAIR_NAME} in Secrets Manager" 2>&1 | tee -a $$CFN_LOG_PATH && \ - aws secretsmanager delete-secret \ + res=$$(aws secretsmanager delete-secret \ --secret-id "/$${APP_NAME}/$${STAGE}/$${AWS_REGION}/EC2KeyPair" \ - --force-delete-without-recovery 2>&1 | tee -a $$CFN_LOG_PATH || true + --force-delete-without-recovery) && \ + echo $$res | jq -r && \ + echo "\033[0;34mSuccessfully deleted key pair ${EC2_KEY_PAIR_NAME}\033[0m" || \ + echo "\033[0;31mFailed to delete key pair ${EC2_KEY_PAIR_NAME}\033[0m" service.parameters.delete: @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Deleting SSM parameters" 2>&1 | tee -a ${CFN_LOG_PATH} diff --git a/gfe-db/infrastructure/access-services/bastion-server/Makefile b/gfe-db/infrastructure/access-services/bastion-server/Makefile index 996cabb0..2997b67f 100644 --- a/gfe-db/infrastructure/access-services/bastion-server/Makefile +++ b/gfe-db/infrastructure/access-services/bastion-server/Makefile @@ -4,7 +4,6 @@ target: $(info ${HELP_MESSAGE}) @exit 0 -# TODO parameterize IP address for security group deploy: @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Deploying ${SERVICE} service" 2>&1 | tee -a $$CFN_LOG_PATH $(MAKE) service.deploy @@ -80,11 +79,14 @@ service.revoke-security-group-ingress: --name "/$${APP_NAME}/$${STAGE}/$${AWS_REGION}/BastionHostSecurityGroupId" | jq -r '.Parameter.Value')" && \ neo4j_database_security_group_id="$$(aws ssm get-parameter \ --name "/$${APP_NAME}/$${STAGE}/$${AWS_REGION}/Neo4jDatabaseSecurityGroupId" | jq -r '.Parameter.Value')" && \ - aws ec2 revoke-security-group-ingress \ + res=$$(aws ec2 revoke-security-group-ingress \ --group-id "$$neo4j_database_security_group_id" \ --protocol tcp \ --port 22 \ - --source-group "$$bastion_host_security_group_id" 2>&1 | tee -a $$CFN_LOG_PATH || true + --source-group "$$bastion_host_security_group_id") && \ + echo $$res | jq -r && \ + echo "\033[0;34mBastion host security group is no longer associated with Neo4j database security group\033[0m" || \ + echo "\033[0;31mFailed to remove bastion host security group from Neo4j 
database security group\033[0m" service.delete: service.revoke-security-group-ingress @aws cloudformation delete-stack \ diff --git a/gfe-db/infrastructure/template.yaml b/gfe-db/infrastructure/template.yaml index 52201dae..29102c88 100644 --- a/gfe-db/infrastructure/template.yaml +++ b/gfe-db/infrastructure/template.yaml @@ -39,7 +39,7 @@ Parameters: GitHubPersonalAccessToken: Type: String NoEcho: true - + Conditions: # Important: All stacks (infra, database, pipeline) should use the same conditions CreateVpc: !Equals [!Ref createVpc, 'true'] @@ -72,12 +72,11 @@ Mappings: AvailabilityZone: eu-west-3a Resources: - StageParameter: Type: AWS::SSM::Parameter Properties: Type: String - Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/Stage' + Name: !Sub "/${AppName}/${Stage}/${AWS::Region}/Stage" Description: "Stage of production" Value: !Ref Stage @@ -85,7 +84,7 @@ Resources: Type: AWS::SSM::Parameter Properties: Type: String - Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/AppName' + Name: !Sub "/${AppName}/${Stage}/${AWS::Region}/AppName" Description: "Name of application" Value: !Ref AppName @@ -106,17 +105,18 @@ Resources: EnableDnsSupport: true Tags: - Key: Name - Value: !Sub '${Stage}-${AppName}-${AWS::Region}-vpc' - + Value: !Sub "${Stage}-${AppName}-${AWS::Region}-vpc" + CreateVpcConditionPublicSubnet: Type: AWS::EC2::Subnet Condition: CreateVpc Properties: CidrBlock: 10.0.0.0/24 VpcId: !Ref CreateVpcConditionVpc - AvailabilityZone: !FindInMap [AvailabilityZoneMap, !Ref AWS::Region, AvailabilityZone] + AvailabilityZone: + !FindInMap [AvailabilityZoneMap, !Ref AWS::Region, AvailabilityZone] MapPublicIpOnLaunch: true - + CreateVpcPrivateConditionPrivateSubnet: Type: AWS::EC2::Subnet Condition: CreateVpcPrivate @@ -124,11 +124,11 @@ Resources: CidrBlock: 10.0.1.0/24 VpcId: !Ref CreateVpcConditionVpc AvailabilityZone: !GetAtt CreateVpcConditionPublicSubnet.AvailabilityZone - + CreateVpcConditionInternetGateway: Type: AWS::EC2::InternetGateway Condition: CreateVpc - + CreateVpcConditionPublicRouteTable: Type: AWS::EC2::RouteTable Condition: CreateVpc @@ -162,7 +162,7 @@ Resources: Properties: VpcId: !Ref CreateVpcConditionVpc InternetGatewayId: !Ref CreateVpcConditionInternetGateway - + CreateVpcConditionPublicRoute: Type: AWS::EC2::Route Condition: CreateVpc @@ -172,7 +172,7 @@ Resources: RouteTableId: !Ref CreateVpcConditionPublicRouteTable DestinationCidrBlock: 0.0.0.0/0 GatewayId: !Ref CreateVpcConditionInternetGateway - + CreateVpcConditionPublicSubnetRouteTableAssociation: Type: AWS::EC2::SubnetRouteTableAssociation Condition: CreateVpc @@ -195,41 +195,41 @@ Resources: Type: AWS::SSM::Parameter Properties: Type: String - Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/VpcId' + Name: !Sub "/${AppName}/${Stage}/${AWS::Region}/VpcId" Description: !Sub "Name of VPC network for ${AppName}" - Value: !If + Value: !If - CreateVpc - !Ref CreateVpcConditionVpc - !Ref VpcId - + PublicSubnetIdParameter: Type: AWS::SSM::Parameter Properties: Type: String - Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/PublicSubnetId' + Name: !Sub "/${AppName}/${Stage}/${AWS::Region}/PublicSubnetId" Description: !Sub "Public Subnet for the ${AppName} Neo4j server" Value: !If - CreateVpc - !Ref CreateVpcConditionPublicSubnet - !Ref PublicSubnetId - + UsePrivateSubnetConditionPrivateSubnetIdParameter: Type: AWS::SSM::Parameter Condition: UsePrivateSubnet Properties: Type: String - Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/PrivateSubnetId' + Name: !Sub "/${AppName}/${Stage}/${AWS::Region}/PrivateSubnetId" 
Description: !Sub "Private Subnet for the ${AppName} Neo4j server" Value: !If - CreateVpcPrivate - !Ref CreateVpcPrivateConditionPrivateSubnet - !Ref PrivateSubnetId - + Neo4jSubnetIdParameter: Type: AWS::SSM::Parameter Properties: Type: String - Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/Neo4jSubnetId' + Name: !Sub "/${AppName}/${Stage}/${AWS::Region}/Neo4jSubnetId" Description: !Sub "Public Subnet for the ${AppName} Neo4j server" Value: !If - CreateVpc @@ -241,7 +241,7 @@ Resources: - UsePrivateSubnet - !Ref PrivateSubnetId - !Ref PublicSubnetId - + DataBucket: Type: AWS::S3::Bucket Properties: @@ -251,23 +251,23 @@ Resources: Type: AWS::SSM::Parameter Properties: Type: String - Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/DataBucketName' + Name: !Sub "/${AppName}/${Stage}/${AWS::Region}/DataBucketName" Description: "Name of gfe-db data bucket" Value: !Ref DataBucket - + DataBucketArnParameter: Type: AWS::SSM::Parameter Properties: Type: String - Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/DataBucketArn' + Name: !Sub "/${AppName}/${Stage}/${AWS::Region}/DataBucketArn" Description: "ARN of gfe-db data bucket" Value: !GetAtt DataBucket.Arn - + DataBucketRegionalDomainNameParameter: Type: AWS::SSM::Parameter Properties: Type: String - Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/DataBucketRegionalDomainName' + Name: !Sub "/${AppName}/${Stage}/${AWS::Region}/DataBucketRegionalDomainName" Description: "S3 Bucket Regional Domain name for application bucket" Value: !GetAtt DataBucket.RegionalDomainName @@ -300,7 +300,7 @@ Resources: Neo4jDatabaseSecurityGroup: Type: AWS::EC2::SecurityGroup Properties: - GroupName: !Sub '${Stage}-${AppName}-neo4j-sg' + GroupName: !Sub "${Stage}-${AppName}-neo4j-sg" GroupDescription: Security group for the GFE database VpcId: !If - CreateVpc @@ -339,13 +339,13 @@ Resources: Description: Update IP to use MyIP Tags: - Key: Name - Value: !Sub '${Stage}-${AppName}-neo4j-sg' + Value: !Sub "${Stage}-${AppName}-neo4j-sg" Neo4jDatabaseSecurityGroupIdParameter: Type: AWS::SSM::Parameter Properties: Type: String - Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/Neo4jDatabaseSecurityGroupId' + Name: !Sub "/${AppName}/${Stage}/${AWS::Region}/Neo4jDatabaseSecurityGroupId" Description: "Name of the Neo4jDatabaseSecurityGroup" Value: !Ref Neo4jDatabaseSecurityGroup @@ -365,15 +365,15 @@ Resources: Domain: vpc Tags: - Key: Name - Value: !Sub '${Stage}-${AppName}-neo4j' + Value: !Sub "${Stage}-${AppName}-neo4j" UsePublicSubnetConditionNeo4jDatabaseEndpointParameter: Type: AWS::SSM::Parameter Condition: UsePublicSubnet Properties: Type: String - Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/Neo4jDatabaseEndpoint' - Description: !Sub 'Endpoint for ${AppName} Neo4j server' + Name: !Sub "/${AppName}/${Stage}/${AWS::Region}/Neo4jDatabaseEndpoint" + Description: !Sub "Endpoint for ${AppName} Neo4j server" Value: !Ref UsePublicSubnetConditionNeo4jDatabaseElasticIp UsePublicSubnetConditionNeo4jDatabaseEndpointAllocationIdParameter: @@ -381,72 +381,26 @@ Resources: Condition: UsePublicSubnet Properties: Type: String - Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/Neo4jDatabaseEndpointAllocationId' - Description: !Sub 'AllocationId for ${AppName} Neo4j server static IP' + Name: !Sub "/${AppName}/${Stage}/${AWS::Region}/Neo4jDatabaseEndpointAllocationId" + Description: !Sub "AllocationId for ${AppName} Neo4j server static IP" Value: !GetAtt UsePublicSubnetConditionNeo4jDatabaseElasticIp.AllocationId - DataPipelineErrorsTopicPolicy: - Type: AWS::SNS::TopicPolicy - 
Properties: - Topics: - - !Ref DataPipelineErrorsTopic - PolicyDocument: - Version: 2012-10-17 - Statement: - - Effect: Allow - Principal: - Service: cloudwatch.amazonaws.com - Action: sns:Publish - Resource: !Ref DataPipelineErrorsTopic - Condition: - StringEquals: - aws:SourceAccount: !Ref AWS::AccountId - - DataPipelineErrorsTopic: - Type: AWS::SNS::Topic - Properties: - DisplayName: !Sub "${AppName} Data Pipeline Errors" - Subscription: - - Endpoint: !Ref AdminEmail - Protocol: email - - DataPipelineErrorsTopicArnParameter: - Type: AWS::SSM::Parameter - Properties: - Type: String - Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/DataPipelineErrorsTopicArn' - Description: !Sub 'ARN for ${AppName} Data Pipeline Errors SNS topic' - Value: !Ref DataPipelineErrorsTopic - - DataPipelineExecutionTopicPolicy: - Type: AWS::SNS::TopicPolicy - Properties: - Topics: - - !Ref DataPipelineExecutionTopic - PolicyDocument: - Version: 2012-10-17 - Statement: - - Effect: Allow - Principal: - Service: cloudwatch.amazonaws.com - Action: sns:Publish - Resource: !Ref DataPipelineExecutionTopic - Condition: - StringEquals: - aws:SourceAccount: !Ref AWS::AccountId - - DataPipelineExecutionTopic: - Type: AWS::SNS::Topic - Properties: - DisplayName: !Sub "${AppName} Data Pipeline Errors" - Subscription: - - Endpoint: !Ref AdminEmail - Protocol: email - - DataPipelineExecutionTopicArnParameter: + GfedbInfrastructureParamMappingsParameter: Type: AWS::SSM::Parameter Properties: Type: String - Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/DataPipelineExecutionTopicArn' - Description: !Sub 'ARN for ${AppName} Data Pipeline Errors SNS topic' - Value: !Ref DataPipelineExecutionTopic + Name: !Sub "/${AppName}/${Stage}/${AWS::Region}/GfedbInfrastructureParamMappings" + Description: !Sub "SSM Parameter and SecretsManager parameter paths for ${AppName} infrastructure layer" + Tier: Standard + Value: !Sub | + { + "ssm": [ + "/${AppName}/${Stage}/${AWS::Region}/VpcId", + "/${AppName}/${Stage}/${AWS::Region}/PublicSubnetId", + "/${AppName}/${Stage}/${AWS::Region}/DataBucketName", + "/${AppName}/${Stage}/${AWS::Region}/DataBucketArn", + "/${AppName}/${Stage}/${AWS::Region}/Neo4jDatabaseEndpoint", + "/${AppName}/${Stage}/${AWS::Region}/Neo4jDatabaseEndpointAllocationId" + ] + } + \ No newline at end of file diff --git a/gfe-db/pipeline/Makefile b/gfe-db/pipeline/Makefile index 17572b7e..56f84ae1 100644 --- a/gfe-db/pipeline/Makefile +++ b/gfe-db/pipeline/Makefile @@ -1,17 +1,18 @@ SERVICE := pipeline +SCRIPTS_DIR := ${ROOT_DIR}/${APP_NAME}/${SERVICE}/scripts target: $(info ${HELP_MESSAGE}) @exit 0 # TODO: Don't deploy jobs if pipeline stack fails to create (exit Make) -deploy: +service.deploy: @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Deploying ${SERVICE} service" 2>&1 | tee -a $$CFN_LOG_PATH $(MAKE) service.config.deploy - $(MAKE) service.deploy + $(MAKE) service.functions.deploy $(MAKE) service.jobs.deploy -service.deploy: +service.functions.deploy: @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Deploying ${SERVICE} - functions" 2>&1 | tee -a $$CFN_LOG_PATH @sam build \ --region "$${AWS_REGION}" \ @@ -25,53 +26,153 @@ service.deploy: --no-fail-on-empty-changeset \ --region "$${AWS_REGION}" \ --template-file packaged.yaml \ + --resolve-s3 \ --stack-name "$${STAGE}-$${APP_NAME}-${SERVICE}" \ --tags stage="$${STAGE}" app="$${APP_NAME}" service="${SERVICE}" branch="$$(git branch --show-current)" commit=$$(git rev-parse HEAD) \ --capabilities CAPABILITY_IAM \ --parameter-overrides \ Stage="$${STAGE}" \ 
AppName="$${APP_NAME}" \ + ServiceName="${SERVICE}" \ createVpc="$${CREATE_VPC}" \ usePrivateSubnet="$${USE_PRIVATE_SUBNET}" \ + ConfigS3Path="$${CONFIG_S3_PATH}" \ GitHubRepositoryOwner="${GITHUB_REPOSITORY_OWNER}" \ GitHubRepositoryName="${GITHUB_REPOSITORY_NAME}" \ - InvokePipelineFunctionName="$${STAGE}"-"$${APP_NAME}"-"$$(cat functions/environment.json | jq -r '.Functions.InvokePipeline.FunctionConfiguration.FunctionName')" \ - InvokePipelineFunctionSchedule="$$(cat functions/environment.json | jq -r '.Functions.InvokePipeline.InvokePipelineFunctionSchedule')" \ - InvokePipelineFunctionMemorySize="$$(cat functions/environment.json | jq -r '.Functions.InvokePipeline.FunctionConfiguration.MemorySize')" \ - InvokePipelineFunctionTimeout="$$(cat functions/environment.json | jq -r '.Functions.InvokePipeline.FunctionConfiguration.Timeout')" \ - PipelineStatePath="$$(cat functions/environment.json | jq -r '.Functions.InvokePipeline.FunctionConfiguration.Environment.Variables.PIPELINE_STATE_PATH')" \ - PipelineParamsPath="$$(cat functions/environment.json | jq -r '.Functions.InvokePipeline.FunctionConfiguration.Environment.Variables.PIPELINE_PARAMS_PATH')" \ - ExecuteValidationQueriesFunctionName="$${STAGE}"-"$${APP_NAME}"-"$$(cat functions/environment.json | jq -r '.Functions.ExecuteValidationQueries.FunctionConfiguration.FunctionName')" \ - ExecuteValidationQueriesFunctionMemorySize="$$(cat functions/environment.json | jq -r '.Functions.ExecuteValidationQueries.FunctionConfiguration.MemorySize')" \ - ExecuteValidationQueriesFunctionTimeout="$$(cat functions/environment.json | jq -r '.Functions.ExecuteValidationQueries.FunctionConfiguration.Timeout')" \ - InvokeBackupScriptFunctionName="$${STAGE}"-"$${APP_NAME}"-"$$(cat functions/environment.json | jq -r '.Functions.InvokeBackupScript.FunctionConfiguration.FunctionName')" \ - InvokeLoadScriptFunctionName="$${STAGE}"-"$${APP_NAME}"-"$$(cat functions/environment.json | jq -r '.Functions.InvokeLoadScript.FunctionConfiguration.FunctionName')" \ - ValidateBuildOutputFunctionName="$${STAGE}"-"$${APP_NAME}"-"$$(cat functions/environment.json | jq -r '.Functions.ValidateBuildOutput.FunctionConfiguration.FunctionName')" \ - DisableBackupFunctionName="$${STAGE}"-"$${APP_NAME}"-"$$(cat functions/environment.json | jq -r '.Functions.DisableBackup.FunctionConfiguration.FunctionName')" \ + GitHubPersonalAccessToken="$$GITHUB_PERSONAL_ACCESS_TOKEN" \ ECRBaseUri="${ECR_BASE_URI}" \ BuildServiceRepositoryName="${BUILD_REPOSITORY_NAME}" \ FeatureServiceUrl="${FEATURE_SERVICE_URL}" \ - Ec2KeyPairName="${EC2_KEY_PAIR_NAME}" \ - 2>&1 | tee -a $$CFN_LOG_PATH || true + Ec2KeyPairName="${EC2_KEY_PAIR_NAME}" service.jobs.deploy: $(MAKE) -C jobs/ deploy +# TODO handle virtual environment creation and activation +service.state.build: + @${PYTHON} ${SCRIPTS_DIR}/state/build.py ${ROOT_DIR}/${APP_NAME}/${SERVICE}/config/ + +service.state.load: + @${PYTHON} ${SCRIPTS_DIR}/state/load.py ${ROOT_DIR}/${APP_NAME}/${SERVICE}/config/ + +# TODO parameterize S3 config path and export as environment variable to recall in database shell scripts +# TODO integrate and automate the build/load process for source config and execution state service.config.deploy: $(MAKE) service.config.pipeline-params.deploy - + service.config.pipeline-params.deploy: - @config_s3_path=s3://$$DATA_BUCKET_NAME/config/pipeline/ && \ + @config_s3_path=s3://${DATA_BUCKET_NAME}/${CONFIG_S3_PATH}/${SERVICE} && \ echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Deploying config to $$config_s3_path" 2>&1 | tee -a 
$$CFN_LOG_PATH && \ aws s3 cp --recursive config/ $$config_s3_path 2>&1 | tee -a $$CFN_LOG_PATH +service.statemachine.update-pipeline.stop: + @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Stopping UpdatePipeline state machine" 2>&1 | tee -a $${CFN_LOG_PATH} + @state_machine_arn=$$(aws ssm get-parameter \ + --name "/$${APP_NAME}/$${STAGE}/$${AWS_REGION}/UpdatePipelineStateMachineArn" \ + --query "Parameter.Value" \ + --output text) && \ + executions=$$(aws stepfunctions list-executions \ + --state-machine-arn "$$state_machine_arn" \ + --status-filter RUNNING \ + --query "executions[*].executionArn" \ + --output text) && \ + if [ -n "$$executions" ]; then \ + for execution in $$executions; do \ + res=$$(aws stepfunctions stop-execution --execution-arn "$$execution") && \ + echo "$$res" | jq -r && \ + echo "\033[0;32mStopped execution:\033[0m" && \ + echo "\033[0;32m$$execution\033[0m"; \ + done; \ + else \ + echo "\033[0;32mNo running executions found for UpdatePipeline state machine\033[0m"; \ + fi + +service.statemachine.load-concurrency-manager.stop: + @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Stopping LoadConcurrencyManager state machine" 2>&1 | tee -a $${CFN_LOG_PATH} + @state_machine_arn=$$(aws ssm get-parameter \ + --name "/$${APP_NAME}/$${STAGE}/$${AWS_REGION}/LoadConcurrencyManagerStateMachineArn" \ + --query "Parameter.Value" \ + --output text) && \ + executions=$$(aws stepfunctions list-executions \ + --state-machine-arn "$$state_machine_arn" \ + --status-filter RUNNING \ + --query "executions[*].executionArn" \ + --output text) && \ + if [ -n "$$executions" ]; then \ + for execution in $$executions; do \ + res=$$(aws stepfunctions stop-execution --execution-arn "$$execution") && \ + echo "$$res" | jq -r && \ + echo "\033[0;32mStopped execution:\033[0m" && \ + echo "\033[0;32m$$execution\033[0m"; \ + done; \ + else \ + echo "\033[0;32mNo running executions found for LoadConcurrencyManager state machine\033[0m"; \ + fi + +service.queue.gfe-db-load.purge: + @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Purging GfeDbLoadQueue" 2>&1 | tee -a $${CFN_LOG_PATH} + @queue_url=$$(aws ssm get-parameter \ + --name "/$${APP_NAME}/$${STAGE}/$${AWS_REGION}/GfeDbLoadQueueUrl" \ + --query "Parameter.Value" \ + --output text) && \ + res=$$(aws sqs purge-queue --queue-url "$$queue_url") && \ + echo "$$res" | jq -r 2>&1 | tee -a $${CFN_LOG_PATH} && \ + echo "\033[0;32mGfeDbLoadQueue purged successfully\033[0m" + +service.queue.gfe-db-processing.purge: + @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Purging GfeDbProcessingQueue" 2>&1 | tee -a $${CFN_LOG_PATH} + @queue_url=$$(aws ssm get-parameter \ + --name "/$${APP_NAME}/$${STAGE}/$${AWS_REGION}/GfeDbProcessingQueueUrl" \ + --query "Parameter.Value" \ + --output text) && \ + res=$$(aws sqs purge-queue --queue-url "$$queue_url") && \ + echo "$$res" | jq -r 2>&1 | tee -a $${CFN_LOG_PATH} && \ + echo "\033[0;32mGfeDbProcessingQueue purged successfully\033[0m" + +service.alarm.update-pipeline-execution.status: + @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Fetching status of UpdatePipelineStateMachineExecutionAlarm" 2>&1 | tee -a $${CFN_LOG_PATH} + @alarm_name=$$(aws ssm get-parameter \ + --name "/$${APP_NAME}/$${STAGE}/$${AWS_REGION}/UpdatePipelineStateMachineExecutionAlarmName" \ + --query "Parameter.Value" \ + --output text) && \ + alarm_status=$$(aws cloudwatch describe-alarms \ + --alarm-names "$$alarm_name" \ + --query "MetricAlarms[0].StateValue" \ + --output text) && \ + echo "\033[0;32mUpdatePipelineStateMachineExecutionAlarm status: 
$$alarm_status\033[0m" 2>&1 | tee -a $${CFN_LOG_PATH} + +service.alarm.update-pipeline-execution.wait: # arg: poll_interval (default: 10) + @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Waiting for UpdatePipelineStateMachineExecutionAlarm status change" 2>&1 | tee -a $${CFN_LOG_PATH} + @poll_interval=$${1:-10}; \ + alarm_name=$$(aws ssm get-parameter \ + --name "/$${APP_NAME}/$${STAGE}/$${AWS_REGION}/UpdatePipelineStateMachineExecutionAlarmName" \ + --query "Parameter.Value" \ + --output text) && \ + initial_status=$$(aws cloudwatch describe-alarms \ + --alarm-names "$$alarm_name" \ + --query "MetricAlarms[0].StateValue" \ + --output text) && \ + echo "Initial status: $$initial_status" && \ + while true; do \ + current_status=$$(aws cloudwatch describe-alarms \ + --alarm-names "$$alarm_name" \ + --query "MetricAlarms[0].StateValue" \ + --output text) && \ + if [ "$$current_status" != "$$initial_status" ]; then \ + echo "\033[0;32mAlarm status changed to: $$current_status\033[0m" 2>&1 | tee -a $${CFN_LOG_PATH} && \ + break; \ + fi; \ + echo "Current status: $$current_status. Waiting for $$poll_interval seconds..." && \ + sleep $$poll_interval; \ + done + service.delete: @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Deleting ${SERVICE} service" 2>&1 | tee -a $$CFN_LOG_PATH $(MAKE) service.jobs.delete $(MAKE) service.functions.delete service.functions.delete: - @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Deleting CloudFormation" 2>&1 | tee -a $$CFN_LOG_PATH + @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Deleting ${SERVICE} CloudFormation" 2>&1 | tee -a $$CFN_LOG_PATH @aws cloudformation delete-stack \ --stack-name "$${STAGE}-$${APP_NAME}-${SERVICE}" 2>&1 | tee -a $$CFN_LOG_PATH || true && \ aws cloudformation wait stack-delete-complete \ diff --git a/gfe-db/pipeline/config/IMGTHLA-repository-state.json b/gfe-db/pipeline/config/IMGTHLA-repository-state.json deleted file mode 100644 index 4083380e..00000000 --- a/gfe-db/pipeline/config/IMGTHLA-repository-state.json +++ /dev/null @@ -1,53 +0,0 @@ -{ - "timestamp": "2021-12-09 02:36:59", - "repository_url": "https://github.com/ANHIG/IMGTHLA", - "releases": [ - "3100", - "3110", - "3120", - "3130", - "3140", - "3150", - "3160", - "3170", - "3180", - "3190", - "3200", - "3210", - "3220", - "3230", - "3240", - "3250", - "3260", - "3270", - "3280", - "3290", - "3300", - "3310", - "3320", - "3330", - "3340", - "3350", - "3360", - "3370", - "3380", - "3390", - "3400", - "3410", - "3420", - "3430", - "3440", - "3450", - "3460", - "3470", - "3480", - "3490", - "3500", - "3510", - "3520", - "3530", - "3540", - "3550", - "3560" - ] -} diff --git a/gfe-db/pipeline/config/pipeline-input.json b/gfe-db/pipeline/config/pipeline-input.json deleted file mode 100644 index cb51a1ed..00000000 --- a/gfe-db/pipeline/config/pipeline-input.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "align": false, - "kir": false, - "mem_profile": false, - "limit": "" -} diff --git a/gfe-db/pipeline/config/source-config.json b/gfe-db/pipeline/config/source-config.json new file mode 100644 index 00000000..24d98014 --- /dev/null +++ b/gfe-db/pipeline/config/source-config.json @@ -0,0 +1,63 @@ +{ + "created_utc": "2024-09-27T23:49:29.030Z", + "updated_utc": "2024-09-27T23:49:29.030Z", + "repositories": { + "ANHIG/IMGTHLA": { + "owner": "ANHIG", + "name": "IMGTHLA", + "description": "Github for files currently published in the IPD-IMGT/HLA FTP Directory hosted at the European Bioinformatics Institute", + "url": "https://github.com/ANHIG/IMGTHLA", + "tracked_assets": { + "description": 
"Changes to these files trigger processing", + "values": [ + "hla.dat", + "msf/" + ] + }, + "target_metadata_config": { + "description": "Regex patterns to extract the release version metadata from the text content of the asset", + "items": [ + { + "description": "Extract release version metadata for commits from 3a71348 to current", + "asset_path": "alignments/V_nuc.txt", + "metadata_regex": "[1-9]\\d{0,1}\\.[0-9]\\d{0,2}\\.[0-9](?:\\.\\d{1,2})?(?=\\s|$)" + }, + { + "description": "Extract release version metadata for commits from 8632b0d to 3645f26", + "asset_path": "aligments/V_nuc.txt", + "metadata_regex": "[1-9]\\d{0,1}\\.[0-9]\\d{0,2}\\.[0-9](?:\\.\\d{1,2})?(?=\\s|$)" + }, + { + "description": "Extract release version metadata for commits from af54d28 to 9d8f585", + "asset_path": "Alignments/V_nuc.txt", + "metadata_regex": "[1-9]\\d{0,1}\\.[0-9]\\d{0,2}\\.[0-9](?:\\.\\d{1,2})?(?=\\s|$)" + }, + { + "description": "Extract release version metadata for all commits including and before 08e0ef9", + "asset_path": "V_nuc.txt", + "metadata_regex": "[1-9]\\d{0,1}\\.[0-9]\\d{0,2}\\.[0-9](?:\\.\\d{1,2})?(?=\\s|$)" + } + ] + }, + "excluded_commit_shas": { + "description": "Commits to exclude from processing", + "values": [ + "08e0ef9f5c6aade40df681821a0b9caef439fe3a", + "6ad21b61dee3689c5ae68370d635c5ede483c851", + "79d13ceb388eb9dacc9e166be18cce9373f7fd1d", + "9f35f8fe8a2e25bb076e588e65389cac16a8ed2f", + "785c913f2d42abd68bcdf630ce2f58ee9b9c2579", + "efc06e88b56d1e6e44661ec45f192dc1186a30ad" + ] + }, + "default_input_parameters": { + "align": false, + "kir": false, + "mem_profile": false, + "limit": -1, + "use_existing_build": false, + "skip_load": false + } + } + } +} \ No newline at end of file diff --git a/gfe-db/pipeline/functions/check_source_update/__init__.py b/gfe-db/pipeline/functions/check_source_update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/gfe-db/pipeline/functions/check_source_update/app.py b/gfe-db/pipeline/functions/check_source_update/app.py new file mode 100644 index 00000000..4aff45ef --- /dev/null +++ b/gfe-db/pipeline/functions/check_source_update/app.py @@ -0,0 +1,504 @@ +""" +Checks a GitHub repository against app state for new commits and triggers data ingestion. This function processes +only the releases that it finds. To process specific releases, use a different method. + +The execution state table is used to track the state of the application. It uses a composite key of +- commit__sha (hash or primary key) +- execution__version (range or sort key) + +Only the most recent commit for the release is processed. Processing takes place if these conditions are met: +1. There is a new commit sha for a release +2. One of the tracked files (assets) has been changed. (Tracked files contain updates to the data, eg. 
`.dat`) + +""" +import os +if __name__ != "app": + import sys + sys.path.append(os.environ["GFEDBMODELS_PATH"]) +import logging +from decimal import Decimal +from datetime import datetime, timedelta +import time +import json +from pygethub import list_branches, GitHubPaginator +from gfedbmodels.constants import session, pipeline +from gfedbmodels.types import ( + version_is_valid, + str_to_datetime, + str_from_datetime, + InputParameters, + ExecutionStatus, + ExecutionStateItem, + RepositoryConfig, + Commit, + ExecutionDetailsConfig, + ExecutionPayloadItem, + ExecutionState +) +from gfedbmodels.utils import ( + get_utc_now, + restore_nested_json, + list_commits, + flatten_json, + filter_null_fields, + select_keys, + get_commit +) +from gfedbmodels.ingest import ( + read_source_config, + get_release_version_for_commit +) +from constants import ( + PIPELINE_SOURCE_CONFIG_S3_PATH, + GITHUB_REPOSITORY_OWNER, + GITHUB_REPOSITORY_NAME, + execution_state_table_name, + data_bucket_name, + gfedb_processing_queue_url, + execution_state_table_fields, +) + +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +STAGE = os.environ["STAGE"] +APP_NAME = os.environ["APP_NAME"] +GITHUB_PERSONAL_ACCESS_TOKEN = pipeline.secrets.GitHubPersonalAccessToken + +s3 = session.client("s3") +dynamodb = session.resource("dynamodb") +queue = session.resource("sqs") +gfedb_processing_queue = queue.Queue(gfedb_processing_queue_url) + +logger.info( + f"Fetching source config from {data_bucket_name}/{PIPELINE_SOURCE_CONFIG_S3_PATH}" +) + +# Get data source configuration +source_config = read_source_config( + s3_client=s3, + bucket=data_bucket_name, + key=PIPELINE_SOURCE_CONFIG_S3_PATH +) + +source_repo_config = source_config.repositories[f"{GITHUB_REPOSITORY_OWNER}/{GITHUB_REPOSITORY_NAME}"] +default_input_parameters = source_repo_config.default_input_parameters + + +# TODO validate commits against tracked source files requiring ingestion +def lambda_handler(event, context): + + utc_now = get_utc_now() + invocation_id = context.aws_request_id + + logger.info(f"Invocation Id: {invocation_id}") + logger.info(json.dumps(event)) + + is_user_event = True if "releases" in event else False + + try: + ### Sync App State with Repo State ### + + logger.info(f"Fetching execution state from {execution_state_table_name}") + table = dynamodb.Table(execution_state_table_name) + + # Get all items from app state table + execution_state = get_execution_state(table) + execution_state_items = execution_state.items + + # 2) Get the repository state from the GitHub API + paginator = GitHubPaginator(GITHUB_PERSONAL_ACCESS_TOKEN) + branch_pages = paginator.get_paginator( + list_branches, + owner=GITHUB_REPOSITORY_OWNER, + repo=GITHUB_REPOSITORY_NAME, + user_agent="nmdp-bioinformatics-gfe-db-update-pipeline/1.0", + ) + all_branches = list(branch_pages) + + repo_state = build_execution_state(all_branches, utc_now) + repo_state_items = repo_state.items + + # 3) Compare the app state with the repo state to find new commits + + # Extract commit sha, release version into tuples from both the app and repo states for set operations + app_state_commits = set([(item.commit.sha, item.execution.version) for item in execution_state_items]) + repo_state_commits = set([(item.commit.sha, item.execution.version) for item in repo_state_items]) + + # get the difference between the two states + new_items = [] + if app_state_commits != repo_state_commits: + new_app_state_commits = repo_state_commits - app_state_commits + + # update the outdated records in app 
state with the new records from repo state + logger.info(f"Updating execution state with new commits: {new_app_state_commits}") + + # get the new records from the repo state + new_items.extend([item for item in repo_state_items if (item.commit.sha, item.execution.version) in new_app_state_commits]) + + # insert the new records into the remote app state + items = format_execution_state_items(new_items) + for item in items: + table.put_item(Item=item) + + # insert the new records into the local app state + execution_state_items.extend(new_items) + + synced_execution_state_items = sorted( + execution_state_items, key=lambda x: x.commit.date_utc, reverse=False + ) + + except Exception as e: + import traceback + message = f"Error syncing app state: {e}\n{traceback.format_exc()}\n{json.dumps(event)}" + logger.error(message) + raise Exception(message) + + ### Process New and User Requested Release Versions ### + unprocessed_execution_state_items_with_params = [] + unprocessed_commits = set() + + # Parse event for user input + user_items = [] + user_input_parameters = None + if is_user_event: + + # Get the state items for each release given by the user + user_releases = [int(release) for release in event["releases"].split(",")] + user_items = list(filter( + lambda item: item.execution.version in user_releases, + synced_execution_state_items + )) + + user_input_parameters = InputParameters(**event) + + # Remove duplicate releases before combining new and user items + if bool(new_items and user_items): + user_commits = set([(item.commit.sha, item.execution.version) for item in user_items]) + + # Remove duplicate release versions + new_item_commits = list(set(new_app_state_commits) - set(user_commits)) + new_items = [ item for item in new_items if (item.commit.sha, item.execution.version) in new_item_commits ] + + unprocessed_execution_state_items_with_params.extend( + [ (default_input_parameters, item) for item in new_items + user_items ] + ) + + # Combine the new items and user items paired with respective input parameters + if bool(new_items and not user_items): + unprocessed_execution_state_items_with_params.extend( + [ (default_input_parameters, item) for item in new_items ] + ) + if bool(not new_items and user_items): + unprocessed_execution_state_items_with_params.extend( + [ (user_input_parameters, item) for item in user_items ] + ) + + # Return if there are no new or user requested items + if not unprocessed_execution_state_items_with_params: + message = "No new commits found" + logger.info(message) + return { + "statusCode": 200, + "body": json.dumps({"message": message}), + } + + logger.info("Updating execution state for pending release versions") + + try: + # Update the status of unprocessed items to PENDING + new_execution_state = [ + update_execution_state_item( + execution_state_item=item, + invocation_id=invocation_id, + status=ExecutionStatus.PENDING, + timestamp=utc_now, + input_parameters=input_params, + version=item.execution.version + ) + for input_params, item in unprocessed_execution_state_items_with_params + ] + + # validate that at least one commit is pending, otherwise raise an error + if not any([item.execution.status == ExecutionStatus.PENDING for item in new_execution_state]): + message = "Commits were found but none are marked PENDING." 
+ logger.error(message) + raise Exception(message) + + # 2) Preprocess the records for the state table (DynamoDB payload) + items = format_execution_state_items(new_execution_state) + + logger.info(f'Adding items to state table: {json.dumps(items, indent=2)}') + + # 3) Load new commit records to the state table + if len(items) > 0: + # Sort by execution__version key + items = sorted(items, key=lambda x: x["execution__version"], reverse=False) + + with table.batch_writer() as batch: + logger.info( + f"Loading {len(items)} items to {execution_state_table_name}" + ) + for item in items: + batch.put_item(Item=item) + logger.info(f"{len(items)} items loaded to {execution_state_table_name}") + else: + raise Exception("Commits were found but the DynamoDB payload is empty") + + # 4) Send pending commits to the state machine for build and load + execution_payload = [ + ExecutionPayloadItem.from_execution_state_item(item).model_dump() + for item in new_execution_state + ] + execution_payload = sorted( + execution_payload, key=lambda x: x["version"], reverse=False + ) + + # Send the payload to the processing queue for the state machine + for item in execution_payload: + # add group ID and message deduplication ID to the message + gfedb_processing_queue.send_message( + MessageGroupId=f'{STAGE}-{APP_NAME}', + MessageDeduplicationId=str(item["version"]), + MessageBody=json.dumps(item) + ) + + # wait n seconds so that the messages are processed in order + time.sleep(5) + + message = f"Queued {len(execution_payload)} release(s) for processing" + logger.info(message) + return { + "statusCode": 200, + "body": json.dumps({ + "message": message, + "payload": execution_payload + }), + } + + except Exception as e: + import traceback + message = f"Error processing releases: {e}\n{traceback.format_exc()}\n{json.dumps(event)}" + logger.error(message) + raise Exception(message) + + +def generate_execution_id(sha: str, timestamp: str, version: int = None) -> str: + """Generate an execution id for the state machine execution with format: + {version}_{commit_sha}_{YYYYMMDD_HHMMSS} + + Args: + message (dict): Message from SQS queue + + Returns: + str: Execution id + """ + return "_".join( + [ + str(version), + sha, #execution_state_item.commit.sha, + str_to_datetime(timestamp).strftime("%Y%m%d_%H%M%S"), + ] + ) + +# @cache_pickle +def get_execution_state(table, sort_column="commit__date_utc", reverse_sort=True): + # Retrieve execution state from table + items = table.scan()["Items"] + + if not items: + message = "No execution items found. Please populate the state table." 
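For context on the table layout described in the module docstring above (commit__sha as the hash/partition key, execution__version as the range/sort key), here is a minimal boto3 sketch of reading every tracked execution for a single commit with a Query rather than the full-table Scan used by get_execution_state. This is an illustration only, not part of the patch: the table name is hypothetical (the real one comes from pipeline.params.GfeDbExecutionStateTableName) and the commit sha is borrowed from the sample event payloads elsewhere in this diff.

import boto3
from boto3.dynamodb.conditions import Key

# Hypothetical table name for illustration; the deployed name is resolved from SSM at runtime.
table = boto3.resource("dynamodb").Table("dev-gfe-db-execution-state")

# Query by the hash key alone: one item is returned per execution__version
# recorded for this commit sha.
response = table.query(
    KeyConditionExpression=Key("commit__sha").eq("522c1fdd2b79fa7caa628ec48f6654342cb77045")
)
items = response["Items"]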
+ logger.error(message) + raise Exception(message) + + items = [ + {k: int(v) if isinstance(v, Decimal) else v for k, v in item.items()} + for item in items + ] + items = sorted(items, key=lambda x: x[sort_column], reverse=reverse_sort) + + # TODO Deserialize and repack the items + execution_state_items = [ + ExecutionStateItem(**restore_nested_json(item, split_on="__")) for item in items + ] + + execution_state = ExecutionState( + **{ + "created_utc": get_utc_now(), + "items": execution_state_items, + } + ) + + return execution_state + + +# @cache_json +# TODO return Commit class to make sure data is correct +def get_most_recent_commits(execution_state): + # 1) Get the most recent commit date from DynamoDB using max(), add one second to it so the same commit is not returned (because of timestamp overlap) + last_commit_date = max( + [str_to_datetime(item.commit.date_utc) for item in execution_state] + ) + + # add minor offset to avoid duplicate commits + since = str_from_datetime(last_commit_date + timedelta(seconds=1)) + + # 2) Get the most recent commits from GitHub using since= parameter + return list_commits(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME, since=since, token=GITHUB_PERSONAL_ACCESS_TOKEN) + + +def select_most_recent_commit_for_release(commits: list[ExecutionStateItem], select_release_versions: list[int] = None) -> list[ExecutionStateItem]: + + # Parameterize for user input (chosen releases new or old) vs scheduled event (all new releases) + if select_release_versions: + release_versions = list(set([item.execution.version for item in commits if item.execution.version in select_release_versions])) + else: + # group by release version and get most recent by commit date (max date_utc) + release_versions = list(set([item.execution.version for item in commits])) + + return [ + { + version: max( + [item for item in commits if item.execution.version == version], + key=lambda x: x.commit.date_utc, + ) + } + for version in release_versions + ] + +def update_execution_state_item( + execution_state_item: ExecutionStateItem, + invocation_id: str, + status: str, + timestamp: str, + input_parameters: dict = None, + version: int = None +) -> ExecutionStateItem: + + execution_state_item.execution.invocation_id = invocation_id + execution_state_item.execution.status = status + execution_state_item.updated_utc = timestamp + + if input_parameters is not None and status == ExecutionStatus.PENDING: + execution_state_item.execution.id = generate_execution_id( + sha=execution_state_item.commit.sha, + timestamp=execution_state_item.updated_utc, + version=version + ) + execution_state_item.execution.input_parameters = input_parameters + # TODO Update format to s3:///data/csv/' for csv and s3:///data/dat/' for hla.dat for Glue Catalog + execution_state_item.execution.s3_path = ( + f"s3://{data_bucket_name}/data/{execution_state_item.execution.version}" + ) + + # Reset error if present from previous executions + if execution_state_item.error is not None or execution_state_item.execution.date_utc is not None: + execution_state_item.error = None + execution_state_item.execution.date_utc = None + + + return execution_state_item + +def format_execution_state_items(new_execution_state: list[ExecutionStateItem]) -> list[dict]: + return [ + filter_null_fields( + flatten_json( + data=item.model_dump(), + sep="__", + select_fields=[ + item.replace(".", "__") for item in execution_state_table_fields + ], + ) + ) + for item in new_execution_state + ] + + +def get_branch_commits(branches: list[dict]) -> 
list[ExecutionStateItem]: + + # For each entry in all-branches, get the commit data and build the execution state item + execution_state_items = [] + + for item in branches: + + if not version_is_valid(item["name"], return_bool=True): + continue + + release_version = item["name"] + sha = item["commit"]["sha"] + + logger.info(f"Retrieving data for {sha}") + commit_json = get_commit( + GITHUB_REPOSITORY_OWNER, + GITHUB_REPOSITORY_NAME, + GITHUB_PERSONAL_ACCESS_TOKEN, + sha, + ) + assert sha == commit_json["sha"] + + commit = Commit( + sha=commit_json["sha"], + date_utc=commit_json["commit"]["author"]["date"], + message=commit_json["commit"]["message"], + html_url=commit_json["html_url"], + ) + + execution_state_item = ExecutionStateItem( + created_utc=get_utc_now(), + updated_utc=get_utc_now(), + commit=commit, + execution=ExecutionDetailsConfig( + version=release_version, + status="NOT_PROCESSED", + date_utc=None, + input_parameters=None, + ), + # error=None, + # s3_path=None, + repository=RepositoryConfig( + **select_keys( + source_config.repositories[ + GITHUB_REPOSITORY_OWNER + "/" + GITHUB_REPOSITORY_NAME + ].model_dump(), + ["owner", "name", "url"], + ) + ), + ) + execution_state_items.append(execution_state_item) + + return execution_state_items + + + +def build_execution_state(branches, utc_now=None): + + utc_now = utc_now or get_utc_now() + + # Create ExecutionStateItems array from branch/commit sha pairs + execution_state_items = get_branch_commits(branches) + + # Sort execution state items by date descending + execution_state_items = sorted( + execution_state_items, key=lambda x: x.commit.date_utc, reverse=True + ) + + # Package records as ExecutionState object to seed table + execution_state = ExecutionState( + **{ + "created_utc": utc_now, + "items": execution_state_items, + } + ) + + return execution_state + +if __name__ == "__main__": + from pathlib import Path + + # event = json.loads((Path(__file__).parent / "schedule-event.json").read_text()) + event = json.loads((Path(__file__).parent / "user-event.json").read_text()) + + class MockContext: + aws_request_id = "1234" + + lambda_handler(event, MockContext()) diff --git a/gfe-db/pipeline/functions/check_source_update/constants.py b/gfe-db/pipeline/functions/check_source_update/constants.py new file mode 100644 index 00000000..31b2870e --- /dev/null +++ b/gfe-db/pipeline/functions/check_source_update/constants.py @@ -0,0 +1,23 @@ +import os +from gfedbmodels.constants import ( + session, + infra, + pipeline +) + +# Environment +PIPELINE_SOURCE_CONFIG_S3_PATH = os.environ["PIPELINE_SOURCE_CONFIG_S3_PATH"] +data_bucket_name = infra.params.DataBucketName +( + GITHUB_REPOSITORY_OWNER, + GITHUB_REPOSITORY_NAME, + execution_state_table_name, + gfedb_processing_queue_url, + execution_state_table_fields +) = ( + pipeline.params.GitHubSourceRepository["owner"], + pipeline.params.GitHubSourceRepository["name"], + pipeline.params.GfeDbExecutionStateTableName, + pipeline.params.GfeDbProcessingQueueUrl, + pipeline.params.GfeDbExecutionStateTableFields +) diff --git a/gfe-db/pipeline/functions/invoke_pipeline/event-use-existing-true.json b/gfe-db/pipeline/functions/check_source_update/error-event.json similarity index 68% rename from gfe-db/pipeline/functions/invoke_pipeline/event-use-existing-true.json rename to gfe-db/pipeline/functions/check_source_update/error-event.json index 144687ca..90055834 100644 --- a/gfe-db/pipeline/functions/invoke_pipeline/event-use-existing-true.json +++ 
b/gfe-db/pipeline/functions/check_source_update/error-event.json @@ -1,8 +1,8 @@ { "align": false, "kir": false, - "limit": "", - "releases": "310", + "limit": "1000", + "releases": "3530,3540,3550", "mem_profile": false, "use_existing_build": true, "skip_load": false diff --git a/gfe-db/pipeline/functions/check_source_update/requirements.txt b/gfe-db/pipeline/functions/check_source_update/requirements.txt new file mode 100644 index 00000000..e69de29b diff --git a/gfe-db/pipeline/functions/invoke_pipeline/schedule-event.json b/gfe-db/pipeline/functions/check_source_update/schedule-event.json similarity index 56% rename from gfe-db/pipeline/functions/invoke_pipeline/schedule-event.json rename to gfe-db/pipeline/functions/check_source_update/schedule-event.json index b356dfbd..51b28935 100644 --- a/gfe-db/pipeline/functions/invoke_pipeline/schedule-event.json +++ b/gfe-db/pipeline/functions/check_source_update/schedule-event.json @@ -1,13 +1,13 @@ { "version": "0", - "id": "e58ee31d-5d5e-7a98-f497-aa0a04e81cb1", + "id": "7bbfa494-9a2d-630b-1f09-aa1079f18fc6", "detail-type": "Scheduled Event", "source": "aws.events", - "account": "", - "time": "2023-07-30T12:00:00Z", + "account": ":rule/dev-gfe-db-pipeline-InvokePipelineFunctionTrigger-A2S8FGQJMH5D" + "arn:aws:events:us-east-1::rule/dev-gfe-db-pipeline-CheckSourceUpdateFunctionTrigg-1NIOIDD25B8MJ" ], "detail": {} } \ No newline at end of file diff --git a/gfe-db/pipeline/functions/check_source_update/user-event.json b/gfe-db/pipeline/functions/check_source_update/user-event.json new file mode 100644 index 00000000..b1aed161 --- /dev/null +++ b/gfe-db/pipeline/functions/check_source_update/user-event.json @@ -0,0 +1,8 @@ +{ + "align": false, + "kir": false, + "limit": 1000, + "releases": "3540,3550", + "mem_profile": false, + "use_existing_build": true +} \ No newline at end of file diff --git a/gfe-db/pipeline/functions/environment.json b/gfe-db/pipeline/functions/environment.json deleted file mode 100644 index 8ad13c9c..00000000 --- a/gfe-db/pipeline/functions/environment.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "Functions": { - "InvokePipeline": { - "InvokePipelineFunctionSchedule": "cron(0\\ 12\\ *\\ *\\ ?\\ *)", - "FunctionConfiguration": { - "FunctionName": "invoke-pipeline", - "MemorySize": 256, - "Timeout": 60, - "Environment": { - "Variables": { - "PIPELINE_STATE_PATH": "config/pipeline/IMGTHLA-repository-state.json", - "PIPELINE_PARAMS_PATH": "config/pipeline/pipeline-input.json" - } - } - } - }, - "ExecuteValidationQueries": { - "FunctionConfiguration": { - "FunctionName": "execute-validation-queries", - "MemorySize": 256, - "Timeout": 60 - } - }, - "InvokeBackupScript": { - "FunctionConfiguration": { - "FunctionName": "invoke-backup-script" - } - }, - "InvokeLoadScript": { - "FunctionConfiguration": { - "FunctionName": "invoke-load-script" - } - }, - "ValidateBuildOutput": { - "FunctionConfiguration": { - "FunctionName": "validate-build-output" - } - }, - "DisableBackup": { - "FunctionConfiguration": { - "FunctionName": "disable-backup" - } - } - } -} \ No newline at end of file diff --git a/gfe-db/pipeline/functions/evaluate_query_results/app.py b/gfe-db/pipeline/functions/evaluate_query_results/app.py new file mode 100644 index 00000000..33bcbafc --- /dev/null +++ b/gfe-db/pipeline/functions/evaluate_query_results/app.py @@ -0,0 +1,121 @@ +""" +Query result evaluation: +For post-load invocations the script appends additional metadata to the payload to indicate whether the load was successful. 
+The success condition is defined by the following criteria: +- The number of nodes in the database is greater than the number of nodes in the pre-load invocation. +- The post-load release version for the State Machine execution matches the release version in the database. +- The post-load number of unique release versions is greater than the number of unique release versions in the pre-load invocation. +- If a limit is specified, the number of GFE nodes for the specific release matches the limit +""" +import os +import logging +import json + +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +AWS_REGION = os.environ["AWS_REGION"] +STAGE = os.environ["STAGE"] +APP_NAME = os.environ["APP_NAME"] + +def lambda_handler(event, context): + + try: + release_version = event['input']['version'] + query_results = event['validations']['queries'] + + release_version = event['input']['version'] + + # Initialize errors array + errors = [] + + # Release has been added to the database + unique_releases_in_db_pre_load = sorted([ item['release_version'] for item in query_results['pre']['has_ipd_allele_release_counts'] ]) + unique_releases_in_db_post_load = sorted([ item['release_version'] for item in query_results['post']['has_ipd_allele_release_counts'] ]) + # is_release_version_loaded = set(unique_releases_in_db_post_load) - set(unique_releases_in_db_pre_load) == set([int(release_version)]) or \ + + is_release_version_already_loaded = release_version in unique_releases_in_db_pre_load + is_release_version_loaded = release_version in unique_releases_in_db_post_load + if not is_release_version_loaded: + errors.append("Release version not loaded") + + # Number of nodes in the database has increased + node_counts_pre_load = sum(sorted([ item['count'] for item in query_results['pre']['node_counts'] ])) + node_counts_post_load = sum(sorted([ item['count'] for item in query_results['post']['node_counts'] ])) + have_node_counts_increased = node_counts_post_load > node_counts_pre_load + if not have_node_counts_increased and not is_release_version_already_loaded: + errors.append("Node count has not increased") + + # # Number of unique release versions in the database has increased by one + # num_unique_releases_in_db_post_load = len(unique_releases_in_db_post_load) + # num_unique_releases_in_db_pre_load = len(unique_releases_in_db_pre_load) + # has_unique_release_count_increased_by_1 = num_unique_releases_in_db_post_load == num_unique_releases_in_db_pre_load + 1 + # if not has_unique_release_count_increased_by_1 and not is_release_version_already_loaded: + # errors.append("Unique release count has not increased by 1") + + # TODO 1/19/24 - Allow for the same release version to be loaded multiple times without failing the load validation + if is_release_version_already_loaded: + is_load_successful = ( + is_release_version_loaded + ) + else: + is_load_successful = ( + is_release_version_loaded + and have_node_counts_increased + # and has_unique_release_count_increased_by_1 + ) + + payload = { + "is_load_successful": { + "value": is_load_successful, + "details": { + "unique_releases_in_db_pre_load": unique_releases_in_db_pre_load, + "unique_releases_in_db_post_load": unique_releases_in_db_post_load + } + }, + } + + # TODO Validate based on node metadata if is_release_version_already_loaded is True + # These validations will cause a failure if the release version has already been loaded + if not is_release_version_already_loaded: + + payload["have_node_counts_increased"] = { + "value": 
have_node_counts_increased, + "details": { + "node_counts_pre_load": node_counts_pre_load, + "node_counts_post_load": node_counts_post_load + }, + } + + # payload["has_unique_release_count_increased_by_1"] = { + # "value": has_unique_release_count_increased_by_1, + # "details": { + # "num_unique_releases_in_db_pre_load": num_unique_releases_in_db_pre_load, + # "num_unique_releases_in_db_post_load": num_unique_releases_in_db_post_load + # } + # } + + if errors: + payload["errors"] = errors + + logger.info(json.dumps(payload)) + + return payload + + except Exception as e: + import traceback + message = f"Error evaluating query results: {e}\n{traceback.format_exc()}" + logger.error(message) + logger.error(f"Event: {json.dumps(event)}") + raise Exception(message) + +if __name__ == "__main__": + from pathlib import Path + + event_path = Path(__file__).parent / "event.json" + event_path = Path(__file__).parent / "3520.json" + + with open(event_path, "r") as file: + event = json.load(file) + + lambda_handler(event,"") diff --git a/gfe-db/pipeline/functions/evaluate_query_results/event.json b/gfe-db/pipeline/functions/evaluate_query_results/event.json new file mode 100644 index 00000000..489e7d3f --- /dev/null +++ b/gfe-db/pipeline/functions/evaluate_query_results/event.json @@ -0,0 +1,311 @@ +{ + "input": { + "id": "3540_522c1fdd2b79fa7caa628ec48f6654342cb77045_20240120_013435", + "version": 3540, + "commit_sha": "522c1fdd2b79fa7caa628ec48f6654342cb77045", + "input_parameters": { + "align": false, + "kir": false, + "mem_profile": false, + "limit": 1000, + "use_existing_build": false, + "skip_load": false + }, + "s3_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3540", + "receipt_handle": "AQEBE3msVR2U3ipnQVP9PmkEAuAG1QEsnWos4CO639jAxiNLmiXlE6nenceToY6xq56AK/Cy10tnKeBADWfKcEoXd8Fj3PvRRunazOvO9RjJSLqKIcz+y5F9BnHqM3UzhjGaJw9/siOa6BugbXKKjuChejwmxh3FEKO7Ot6J7xGaZF4xewyimL0YLi9VWprzynoX9Lva5J5Ej3ozOf9856uEckFAwzV1Isg6qVWcgsPzt6aToefnwBxZbvsEz0o9MAjtbfEcAoJWdiJUAdXzF+W/uqipuh+u6ZzedHByVMK+SiDSGJ+nED01A1FIrRUQJsHsEMvEJUNqF80YknVru4CS5INTRP/sm0N6KBy5ozQweSHTOaZ+0ECPS1Y/StmtS/WpxdJ2yrXcCxGEYPVChSBhOA7BrpD5DOE9VJAGYip10vCDtJ27JOWK4naNlQz2JZ+L" + }, + "validations": { + "queries": { + "pre": { + "node_counts": [ + { + "node": "GFE", + "count": 1009 + }, + { + "node": "IPD_Accession", + "count": 1009 + }, + { + "node": "IPD_Allele", + "count": 1009 + }, + { + "node": "Sequence", + "count": 1009 + }, + { + "node": "Feature", + "count": 1102 + }, + { + "node": "Submitter", + "count": 1 + } + ], + "has_ipd_allele_release_counts": [ + { + "release_version": 3540, + "count": 1000 + }, + { + "release_version": 3550, + "count": 1000 + } + ], + "ipd_accession_release_counts": [ + { + "release_version": "3.54.0", + "count": 1000 + }, + { + "release_version": "3.55.0", + "count": 9 + } + ] + }, + "post": { + "node_counts": [ + { + "node": "GFE", + "count": 1009 + }, + { + "node": "IPD_Accession", + "count": 1009 + }, + { + "node": "IPD_Allele", + "count": 1009 + }, + { + "node": "Sequence", + "count": 1009 + }, + { + "node": "Feature", + "count": 1102 + }, + { + "node": "Submitter", + "count": 1 + } + ], + "has_ipd_allele_release_counts": [ + { + "release_version": 3540, + "count": 1000 + }, + { + "release_version": 3550, + "count": 1000 + } + ], + "ipd_accession_release_counts": [ + { + "release_version": "3.54.0", + "count": 1000 + }, + { + "release_version": "3.55.0", + "count": 9 + } + ] + } + }, + "check_existing_build": { + "Contents": [ + { + "ETag": "\"007c5e4db9431e79c4c0db0133266c78\"", + "Key": 
"data/3540/csv/all_cds.3540.csv", + "LastModified": "2024-01-19T21:16:16Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 1396846, + "StorageClass": "STANDARD" + }, + { + "ETag": "\"2f3d5d86377b8cd456bae94cacb333c8\"", + "Key": "data/3540/csv/all_features.3540.csv", + "LastModified": "2024-01-19T21:16:16Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 3796840, + "StorageClass": "STANDARD" + }, + { + "ETag": "\"19939e8c73832e069626da9bbbdc200f\"", + "Key": "data/3540/csv/all_groups.3540.csv", + "LastModified": "2024-01-19T21:16:16Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 293153, + "StorageClass": "STANDARD" + }, + { + "ETag": "\"a2cc1bbe2838160ad417c0ba34557007\"", + "Key": "data/3540/csv/gfe_sequences.3540.csv", + "LastModified": "2024-01-19T21:16:16Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 2574586, + "StorageClass": "STANDARD" + } + ], + "IsTruncated": false, + "Marker": "", + "MaxKeys": 1000, + "Name": "nmdpf-gfe-db-810526023897-us-east-1", + "Prefix": "data/3540/csv/" + }, + "build_outputs": { + "release_version": 3540, + "details": [ + { + "schema": "all_cds", + "release": 3540, + "file_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3540/csv/all_cds.3540.csv", + "cols": [ + "gfe_name", + "bp_seq_id", + "bp_sequence", + "aa_seq_id", + "aa_sequence" + ], + "num_rows": 1000, + "created_utc": "2024-01-20T01:42:12.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true + }, + { + "schema": "all_features", + "release": 3540, + "file_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3540/csv/all_features.3540.csv", + "cols": [ + "accession", + "hash_code", + "locus", + "rank", + "sequence", + "term", + "gfe_name", + "allele_id", + "hla_name", + "imgt_release" + ], + "num_rows": 12924, + "created_utc": "2024-01-20T01:42:12.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true + }, + { + "schema": "all_groups", + "release": 3540, + "file_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3540/csv/all_groups.3540.csv", + "cols": [ + "gfe_name", + "allele_id", + "hla_name", + "ard_id", + "ard_name", + "locus", + "imgt_release" + ], + "num_rows": 3000, + "created_utc": "2024-01-20T01:42:12.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true + }, + { + "schema": "gfe_sequences", + "release": 3540, + "file_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3540/csv/gfe_sequences.3540.csv", + "cols": [ + "gfe_name", + "acc_name", + "locus", + "hla_name", + "seq_id", + "sequence", + "length", + "imgt_release" + ], + "num_rows": 1000, + "created_utc": "2024-01-20T01:42:12.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true + } + ], + "errors": [], + "expected_artifacts": [ + "all_cds.3540.csv", + "all_features.3540.csv", + "all_groups.3540.csv", + 
"gfe_sequences.3540.csv" + ], + "is_valid_build": true + } + }, + "backups": { + "pre": { + "document_name": "nmdpf-gfe-db-database-Neo4jBackupDocument-7MYPVO5AczNY", + "command_id": "eca3d8d7-bf5e-4fd9-8980-fa29480e87ab" + } + }, + "state": { + "created_utc": "2024-01-20T01:19:04.822Z", + "updated_utc": "2024-01-20T01:34:35.436Z", + "repository": { + "owner": "ANHIG", + "name": "IMGTHLA", + "url": "https://github.com/ANHIG/IMGTHLA" + }, + "commit": { + "sha": "522c1fdd2b79fa7caa628ec48f6654342cb77045", + "date_utc": "2024-01-11T14:54:13.000Z", + "message": "Merge pull request #355 from ANHIG/3540\n\nRelease of new hla.xml format as hla_new.xml and hla_new.xsd for 3.54.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/522c1fdd2b79fa7caa628ec48f6654342cb77045" + }, + "execution": { + "id": "3540_522c1fdd2b79fa7caa628ec48f6654342cb77045_20240120_013435", + "version": 3540, + "status": "LOAD_IN_PROGRESS", + "input_parameters": { + "align": false, + "kir": false, + "mem_profile": false, + "limit": 1000, + "use_existing_build": false, + "skip_load": false + } + } + } + } \ No newline at end of file diff --git a/gfe-db/pipeline/functions/evaluate_query_results/requirements.txt b/gfe-db/pipeline/functions/evaluate_query_results/requirements.txt new file mode 100644 index 00000000..e69de29b diff --git a/gfe-db/pipeline/functions/execute_validation_queries/app.py b/gfe-db/pipeline/functions/execute_validation_queries/app.py index 7b3ce7ad..7c9b3113 100644 --- a/gfe-db/pipeline/functions/execute_validation_queries/app.py +++ b/gfe-db/pipeline/functions/execute_validation_queries/app.py @@ -1,7 +1,5 @@ """ -This function executes validation queries against the Neo4j database and returns the results. -If USE_PRIVATE_SUBNET is true, this function will run inside a VPC and private subnet. -If USE_PRIVATE_SUBNET is false, this function will run outside a VPC and in a public subnet. +This function executes pre-load and post-load validation queries against the Neo4j database and returns the results. 
""" import os import logging @@ -37,6 +35,9 @@ def lambda_handler(event, context): logger.info(json.dumps(event)) + # # TODO TESTING STATE MACHINE ERROR HANDLING + # raise Exception(f"Test Error from {context.function_name}") + with graphdb as driver: # node counts @@ -50,6 +51,7 @@ def lambda_handler(event, context): # HAS_IPD_ALLELE relationship releases property release counts has_ipd_allele_release_counts = execute_query(driver, has_ipd_allele_release_counts_cql) + has_ipd_allele_release_counts = sorted(has_ipd_allele_release_counts, key=lambda k: k['release_version']) # IPD_Accession node release counts ipd_accession_release_counts = execute_query(driver, ipd_accession_release_counts_cql) @@ -61,6 +63,19 @@ def lambda_handler(event, context): } return payload + # # TODO if event contains "$.validations.queries.pre", confirm that the pre and + # # post query results indicate the load was successful + # # `is_load_successful = True/False ==> return {"is_load_successful": is_load_successful}` + # # TODO calculate expected counts based on CSV files (validate build output) and compare + # if "validations" in event: + # if "queries" in event["validations"]: + # if "pre" in event["validations"]["queries"]: + + # # TODO temporary return value, still need to compare pre and post query results + # payload["is_load_successful"] = True + + # return payload + nodes = [ "GFE", "IPD_Accession", @@ -100,7 +115,8 @@ def execute_query(driver, query, database="gfedb"): if __name__ == "__main__": from pathlib import Path - event_path = Path(__file__).parent / "post-execution-event.json" + event_path = Path(__file__).parent / "pre-execution-event.json" + # event_path = Path(__file__).parent / "post-execution-event.json" with open(event_path, "r") as file: event = json.load(file) diff --git a/gfe-db/pipeline/functions/execute_validation_queries/post-execution-event.json b/gfe-db/pipeline/functions/execute_validation_queries/post-execution-event.json index a87ac757..9adc897b 100644 --- a/gfe-db/pipeline/functions/execute_validation_queries/post-execution-event.json +++ b/gfe-db/pipeline/functions/execute_validation_queries/post-execution-event.json @@ -1,36 +1,41 @@ { - "input": [ - { - "ALIGN": "False", - "KIR": "False", - "LIMIT": "1000", - "MEM_PROFILE": "False", - "RELEASES": "3450" - } - ], + "input": { + "version": 3500, + "commit_sha": "50b790037030d958b662085c3f4cf34ba72a32ec", + "input_parameters": { + "align": false, + "kir": false, + "mem_profile": false, + "limit": 1000, + "use_existing_build": true, + "skip_load": false + }, + "s3_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3500", + "receipt_handle": "AQEBEtdJKEOIhBCO24mxSnC8Q7RP8bFbXbsUuwd9Ix3vgj2QwRnLOPtvIaBlKaZQFaUU9qj9VitZBZXv0wTHkleavmWo7WnhtfsKFyCmB8I/1wAise2Lzo0y6GisRyKSDMJd+tmtZUgX2tfPEQ8cx4N6hHvlWXW2XhoEH0h5fWtJj2pCKUcnqdJQcinSVoLrfGZpZpYnZ+fhiDqcnBsP3lv6XcUY+2MynHK1QkadxhM2JXipAj5qb7lVS0lCSiWfTaHcph1RExmO4k/QNZBJGr41JVmqPONqO9Dp8ryyXIpavPLvP/uK6OAkppXbQVM5VqNEEx7Njqd2e9C2NoYXKCrGToigN62rEOWzL72L/rFHHXMGXhiCJnd7J67g8Ni5ejDI6t6zkbMlvdLTnegLtVt91MwW4+Hfpwr2dtZJwpsu0zPa0qYF6CaliYanA8CJVzLI" + }, "validations": { "queries": { "pre": { "node_counts": [ { "node": "GFE", - "count": 1137 + "count": 1045 }, { "node": "IPD_Accession", - "count": 1124 + "count": 1042 }, { "node": "IPD_Allele", - "count": 1126 + "count": 1042 }, { "node": "Sequence", - "count": 1137 + "count": 1045 }, { "node": "Feature", - "count": 1220 + "count": 1134 }, { "node": "Submitter", @@ -39,196 +44,235 @@ ], "has_ipd_allele_release_counts": [ { - 
"release_version": "3450", - "count": 1000 - }, - { - "release_version": "3460", - "count": 1000 - }, - { - "release_version": "3470", - "count": 1000 - }, - { - "release_version": "3480", + "release_version": "3500", "count": 1000 }, { - "release_version": "3490", + "release_version": "3510", "count": 1000 }, { - "release_version": "3510", + "release_version": "3530", "count": 1000 }, { - "release_version": "3520", + "release_version": "3540", "count": 1000 } ], "ipd_accession_release_counts": [ { - "release_version": "3.51.0", + "release_version": "3.54.0", "count": 1000 }, { - "release_version": "3.52.0", - "count": 8 + "release_version": "3.53.0", + "count": 19 }, { - "release_version": "3.49.0", - "count": 14 - }, - { - "release_version": "3.48.0", - "count": 16 - }, - { - "release_version": "3.47.0", - "count": 12 - }, - { - "release_version": "3.46.0", - "count": 52 + "release_version": "3.51.0", + "count": 20 }, { - "release_version": "3.45.0", - "count": 35 + "release_version": "3.50.0", + "count": 6 } ] } }, - "build": { - "validated": [ + "check_existing_build": { + "Contents": [ { - "ALIGN": "False", - "KIR": "False", - "LIMIT": "1000", - "MEM_PROFILE": "False", - "RELEASES": "3450" + "ETag": "\"c341cc4f442f15a110757be60fb3331d\"", + "Key": "data/3500/csv/all_cds.3500.csv", + "LastModified": "2024-01-02T22:49:02Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 1372160, + "StorageClass": "STANDARD" + }, + { + "ETag": "\"6ebdda54dc563f5ea743903c269e3a45\"", + "Key": "data/3500/csv/all_features.3500.csv", + "LastModified": "2024-01-02T22:49:02Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 3646635, + "StorageClass": "STANDARD" + }, + { + "ETag": "\"5e682da8b24df44dc4a72b062789da24\"", + "Key": "data/3500/csv/all_groups.3500.csv", + "LastModified": "2024-01-02T22:49:02Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 292877, + "StorageClass": "STANDARD" + }, + { + "ETag": "\"03ff807494ee1743c9c59141987a1e70\"", + "Key": "data/3500/csv/gfe_sequences.3500.csv", + "LastModified": "2024-01-02T22:49:02Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 2480360, + "StorageClass": "STANDARD" } ], - "build_details": [ + "IsTruncated": false, + "Marker": "", + "MaxKeys": 1000, + "Name": "nmdpf-gfe-db-810526023897-us-east-1", + "Prefix": "data/3500/csv/" + }, + "build_outputs": { + "release_version": 3500, + "details": [ + { + "schema": "all_cds", + "release": 3500, + "file_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3500/csv/all_cds.3500.csv", + "cols": [ + "gfe_name", + "bp_seq_id", + "bp_sequence", + "aa_seq_id", + "aa_sequence" + ], + "num_rows": 1000, + "created_utc": "2024-01-02T22:49:02.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true + }, + { + "schema": "all_features", + "release": 3500, + "file_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3500/csv/all_features.3500.csv", + "cols": [ + "accession", + "hash_code", + "locus", + "rank", + "sequence", + "term", + "gfe_name", + "allele_id", + "hla_name", + "imgt_release" + ], + "num_rows": 12377, + "created_utc": 
"2024-01-02T22:49:02.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true + }, { - "release": "3450", - "details": [ - { - "schema": "all_cds", - "release": "3450", - "file_path": "s3://dev-gfe-db-810526023897-us-east-1/data/3450/csv/all_cds.3450.csv", - "cols": [ - "gfe_name", - "bp_seq_id", - "bp_sequence", - "aa_seq_id", - "aa_sequence" - ], - "num_rows": 1000, - "created_utc": "2023-07-31T20:33:03.000Z", - "details": { - "is_valid_csv_timestamp": true, - "is_valid_csv_filename": true, - "is_valid_csv_headers": true, - "is_valid_csv_rows": true - }, - "num_errors": 0, - "is_valid_csv": true - }, - { - "schema": "all_features", - "release": "3450", - "file_path": "s3://dev-gfe-db-810526023897-us-east-1/data/3450/csv/all_features.3450.csv", - "cols": [ - "accession", - "hash_code", - "locus", - "rank", - "sequence", - "term", - "gfe_name", - "allele_id", - "hla_name", - "imgt_release" - ], - "num_rows": 11616, - "created_utc": "2023-07-31T20:33:03.000Z", - "details": { - "is_valid_csv_timestamp": true, - "is_valid_csv_filename": true, - "is_valid_csv_headers": true, - "is_valid_csv_rows": true - }, - "num_errors": 0, - "is_valid_csv": true - }, - { - "schema": "all_groups", - "release": "3450", - "file_path": "s3://dev-gfe-db-810526023897-us-east-1/data/3450/csv/all_groups.3450.csv", - "cols": [ - "gfe_name", - "allele_id", - "hla_name", - "ard_id", - "ard_name", - "locus", - "imgt_release" - ], - "num_rows": 3000, - "created_utc": "2023-07-31T20:33:03.000Z", - "details": { - "is_valid_csv_timestamp": true, - "is_valid_csv_filename": true, - "is_valid_csv_headers": true, - "is_valid_csv_rows": true - }, - "num_errors": 0, - "is_valid_csv": true - }, - { - "schema": "gfe_sequences", - "release": "3450", - "file_path": "s3://dev-gfe-db-810526023897-us-east-1/data/3450/csv/gfe_sequences.3450.csv", - "cols": [ - "gfe_name", - "acc_name", - "locus", - "hla_name", - "seq_id", - "sequence", - "length", - "imgt_release" - ], - "num_rows": 1000, - "created_utc": "2023-07-31T20:33:03.000Z", - "details": { - "is_valid_csv_timestamp": true, - "is_valid_csv_filename": true, - "is_valid_csv_headers": true, - "is_valid_csv_rows": true - }, - "num_errors": 0, - "is_valid_csv": true - } + "schema": "all_groups", + "release": 3500, + "file_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3500/csv/all_groups.3500.csv", + "cols": [ + "gfe_name", + "allele_id", + "hla_name", + "ard_id", + "ard_name", + "locus", + "imgt_release" ], - "errors": [], - "expected_artifacts": [ - "all_cds.3450.csv", - "all_features.3450.csv", - "all_groups.3450.csv", - "gfe_sequences.3450.csv" + "num_rows": 3000, + "created_utc": "2024-01-02T22:49:02.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true + }, + { + "schema": "gfe_sequences", + "release": 3500, + "file_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3500/csv/gfe_sequences.3500.csv", + "cols": [ + "gfe_name", + "acc_name", + "locus", + "hla_name", + "seq_id", + "sequence", + "length", + "imgt_release" ], - "is_valid_build": true + "num_rows": 1000, + "created_utc": "2024-01-02T22:49:02.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true } ], - "has_valid_payload": true + "errors": [], + "expected_artifacts": [ + "all_cds.3500.csv", + 
"all_features.3500.csv", + "all_groups.3500.csv", + "gfe_sequences.3500.csv" + ], + "is_valid_build": true } }, "backups": { "pre": { - "DocumentName": "dev-gfe-db-database-Neo4jBackupDocument-nAqIhQzAdtbD", - "CommandId": "8769a561-1ca2-4970-8df1-cd178b6032ba" + "document_name": "nmdpf-gfe-db-database-Neo4jBackupDocument-VBAMg82ny9pW", + "command_id": "dc450816-3414-49dd-9cd7-987ce7020993" + } + }, + "state": { + "created_utc": "2024-01-02T18:19:05.170Z", + "updated_utc": "2024-01-02T23:02:14.684Z", + "repository": { + "owner": "ANHIG", + "name": "IMGTHLA", + "url": "https://github.com/ANHIG/IMGTHLA" + }, + "commit": { + "sha": "50b790037030d958b662085c3f4cf34ba72a32ec", + "date_utc": "2022-12-14T10:02:54.000Z", + "message": "Merge pull request #323 from ANHIG/3500\n\nCorrected missing date and version field in Allelelist_history.txt", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/50b790037030d958b662085c3f4cf34ba72a32ec" + }, + "execution": { + "version": 3500, + "status": "PENDING", + "date_utc": "2024-01-02T23:02:14.684Z", + "input_parameters": { + "align": false, + "kir": false, + "mem_profile": false, + "limit": 1000, + "use_existing_build": false, + "skip_load": false + } } } } \ No newline at end of file diff --git a/gfe-db/pipeline/functions/format_results/app.py b/gfe-db/pipeline/functions/format_results/app.py new file mode 100644 index 00000000..99e24e03 --- /dev/null +++ b/gfe-db/pipeline/functions/format_results/app.py @@ -0,0 +1,159 @@ +import os +import logging +import json + +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +AWS_REGION = os.environ["AWS_REGION"] +STAGE = os.environ["STAGE"] +APP_NAME = os.environ["APP_NAME"] + +# Templates for the report +success_report_template = """ +Data Load Summary: +- New Node Counts Added: {new_node_counts} +- Unique Release Version Added: {unique_release_version} + +Data Files Processed: +{data_files_info} +Data Integrity Checks: +- CSV File Validations: All passed +- Node Counts Increase Check: {node_counts_check} +- Unique Release Count Increment Check: {release_count_check} + +Backup Details: +- Pre-Execution Backup: Command ID {pre_execution_backup} +- Post-Execution Backup: Command ID {post_execution_backup} + +""" + +failure_report_template = """ +Error Details: +- Error: {error} +- Cause: {cause} +""" + +report_template = """ +{title} +------- + +Deployment: {deployment} + +Release Version: {release_version} + +Execution ID: {execution_id} +Execution Status: {execution_status} +Execution Date: {execution_date} UTC + +Commit Details: {commit_sha} - {commit_message} ({commit_url}) +{status_report} +""" + +def lambda_handler(event, context): + + logger.info(json.dumps(event)) + data = event + + # Set the title + title = f"gfe-db Update Pipeline Execution Report" + title_underline = "-" * len(title) + deployment = f"{STAGE}-{APP_NAME}" + + if "Error" in data.keys(): + + # Extract required information from JSON + release_version = data["state"]["execution"]["version"] + execution_id = data["state"]["execution"]["id"] + execution_status = f"🔴 {data['state']['execution']['status']}" + execution_date = data["state"]["updated_utc"] + commit_sha = data["state"]["commit"]["sha"] + commit_url = data["state"]["commit"]["html_url"] + commit_message = data["state"]["commit"]["message"].replace("\n", " ") + + cause = "N/A" if data['Cause'] == "" else data['Cause'] + error = "N/A" if data['Error'] == "" else data['Error'] + + status_report = failure_report_template.format( + cause=cause, + error=error + ) + + else: + + 
# Extract required information from JSON + release_version = data['input']['version'] + execution_status = f"🟢 {data['state']['execution']['status']}" if data['state']['execution']['status'] == "LOAD_SUCCESS" \ + else f"🔴 {data['state']['execution']['status']}" + execution_id = data['state']['execution']['id'] + execution_date = data['state']['updated_utc'] + commit_sha = data['state']['commit']['sha'] + commit_url = data['state']['commit']['html_url'] + commit_message = data['state']['commit']['message'].replace("\n", " ") + + # These fields are not present when duplicate executions are run (same release, commit sha and/or limit) + if 'have_node_counts_increased' in data['validations']['load_results']: + new_node_counts = data['validations']['load_results']['have_node_counts_increased']['details']['node_counts_post_load'] + node_counts_check = "Passed" if data['validations']['load_results']['have_node_counts_increased']['value'] else "Failed" + else: + new_node_counts = "N/A" + node_counts_check = "N/A" + + if 'has_unique_release_count_increased_by_1' in data['validations']['load_results']: + unique_release_version = data['validations']['load_results']['has_unique_release_count_increased_by_1']['details']['num_unique_releases_in_db_post_load'] + release_count_check = "Passed" if data['validations']['load_results']['has_unique_release_count_increased_by_1']['value'] else "Failed" + else: + unique_release_version = "N/A" + release_count_check = "N/A" + + data_files_info = format_data_files(data['validations']['build_outputs']['details']) + + if "backups" in data.keys(): + pre_execution_backup = data['backups']['pre']['command_id'] + post_execution_backup = data['backups']['post']['command_id'] + else: + pre_execution_backup = "N/A" + post_execution_backup = "N/A" + + status_report = success_report_template.format( + new_node_counts=new_node_counts, + unique_release_version=unique_release_version, + data_files_info=data_files_info, + node_counts_check=node_counts_check, + release_count_check=release_count_check, + pre_execution_backup=pre_execution_backup, + post_execution_backup=post_execution_backup + ) + + report = report_template.format( + title=title, + title_underline=title_underline, + deployment=deployment, + release_version=release_version, + execution_id=execution_id, + execution_status=execution_status, + execution_date=execution_date, + commit_sha=commit_sha, + commit_url=commit_url, + commit_message=commit_message, + status_report=status_report + ) + + return report + + +# Function to process data files information +def format_data_files(data_files): + file_info = "" + for file in data_files: + file_info += f"- `{file['schema']}` - {file['num_rows']} rows\n" + return file_info + +if __name__ == "__main__": + from pathlib import Path + + # event = json.loads((Path(__file__).parent / "failure-event.json").read_text()) + # event = json.loads((Path(__file__).parent / "success-event.json").read_text()) + event = json.loads((Path(__file__).parent / "error.json").read_text()) + + lambda_handler(event,"") diff --git a/gfe-db/pipeline/functions/format_results/error.json b/gfe-db/pipeline/functions/format_results/error.json new file mode 100644 index 00000000..792c2875 --- /dev/null +++ b/gfe-db/pipeline/functions/format_results/error.json @@ -0,0 +1,293 @@ +{ + "input": { + "id": "3350_4052371f8d68dc662d8c5b07377d9aebcfccc0b9_20240201_014419", + "version": 3350, + "invocation_id": "e5312521-4cfa-4187-9320-b2664c8a372b", + "commit_sha": "4052371f8d68dc662d8c5b07377d9aebcfccc0b9", + 
"input_parameters": { + "align": false, + "kir": false, + "mem_profile": false, + "limit": 1000, + "use_existing_build": true, + "skip_load": false + }, + "s3_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3350", + "receipt_handle": "AQEBJa6xECSrRxGxefxzEHORcKnNJPqgVJUhSWLjmdKWgDUpZ1PFsDKgG0s7sb4HtP7S1D1trzf9SaW1Y39s6r2xs/1pCABnG9jhUeTyH19bXrUae+NeDcfwd+A9Pd9tigC43sTXmLL77FQ9LwPLBRjlqzGLqivznP5AunzJa5uY98lrfGpyX9dISbJrLLhXuHQmYzOzZigv+On/0zDdt1/mfY0RpRVmZi7YKFgBNrA1G7zMMeN3T4MH3CNUnZgG89ZSzvT8+J1LOlpaS/diXiq+Eh3+Aico1pjwUy4kCmbX7ek5RSCSlGHf1zDH5vEoiMPdye0ddgJJKD7RDjw62121OmYFMn6XJiImOGjcU5JggXzxfd+Mua4uWaD7k9KvcJG/0mcwC5zalyDPZ7rsBoWN70W7PpFnMjUv4M47/1bQJ+zcNf0nZjTj5c4XiEFkkcMD" + }, + "validations": { + "queries": { + "pre": { + "node_counts": [ + { + "node": "GFE", + "count": 1000 + }, + { + "node": "IPD_Accession", + "count": 1000 + }, + { + "node": "IPD_Allele", + "count": 1000 + }, + { + "node": "Sequence", + "count": 1000 + }, + { + "node": "Feature", + "count": 1087 + }, + { + "node": "Submitter", + "count": 1 + } + ], + "has_ipd_allele_release_counts": [ + { + "release_version": 3350, + "count": 1000 + } + ], + "ipd_accession_release_counts": [ + { + "release_version": "3.35.0", + "count": 1000 + } + ] + }, + "post": { + "node_counts": [ + { + "node": "GFE", + "count": 1000 + }, + { + "node": "IPD_Accession", + "count": 1000 + }, + { + "node": "IPD_Allele", + "count": 1000 + }, + { + "node": "Sequence", + "count": 1000 + }, + { + "node": "Feature", + "count": 1087 + }, + { + "node": "Submitter", + "count": 1 + } + ], + "has_ipd_allele_release_counts": [ + { + "release_version": 3350, + "count": 1000 + } + ], + "ipd_accession_release_counts": [ + { + "release_version": "3.35.0", + "count": 1000 + } + ] + } + }, + "check_existing_build": { + "Contents": [ + { + "ETag": "\"bea92fb773c3300a5bed48d52f654fd2\"", + "Key": "data/3350/csv/all_cds.3350.csv", + "LastModified": "2024-01-28T20:10:04Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 1182753, + "StorageClass": "STANDARD" + }, + { + "ETag": "\"11080ce94680a48657e790c1f1496d6e\"", + "Key": "data/3350/csv/all_features.3350.csv", + "LastModified": "2024-01-28T20:10:04Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 2394539, + "StorageClass": "STANDARD" + }, + { + "ETag": "\"a55b30240ee5107cd40378acd1bd3d8d\"", + "Key": "data/3350/csv/all_groups.3350.csv", + "LastModified": "2024-01-28T20:10:04Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 284170, + "StorageClass": "STANDARD" + }, + { + "ETag": "\"7a84b882607600972a18d6a7954fe757\"", + "Key": "data/3350/csv/gfe_sequences.3350.csv", + "LastModified": "2024-01-28T20:10:04Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 1710669, + "StorageClass": "STANDARD" + } + ], + "IsTruncated": false, + "Marker": "", + "MaxKeys": 1000, + "Name": "nmdpf-gfe-db-810526023897-us-east-1", + "Prefix": "data/3350/csv/" + }, + "build_outputs": { + "release_version": 3350, + "details": [ + { + "schema": "all_cds", + "release": 3350, + "file_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3350/csv/all_cds.3350.csv", + "cols": [ + "gfe_name", + "bp_seq_id", + "bp_sequence", + "aa_seq_id", + "aa_sequence" + ], 
+ "num_rows": 1000, + "created_utc": "2024-01-28T20:10:04.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true + }, + { + "schema": "all_features", + "release": 3350, + "file_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3350/csv/all_features.3350.csv", + "cols": [ + "accession", + "hash_code", + "locus", + "rank", + "sequence", + "term", + "gfe_name", + "allele_id", + "hla_name", + "imgt_release" + ], + "num_rows": 7728, + "created_utc": "2024-01-28T20:10:04.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true + }, + { + "schema": "all_groups", + "release": 3350, + "file_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3350/csv/all_groups.3350.csv", + "cols": [ + "gfe_name", + "allele_id", + "hla_name", + "ard_id", + "ard_name", + "locus", + "imgt_release" + ], + "num_rows": 3000, + "created_utc": "2024-01-28T20:10:04.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true + }, + { + "schema": "gfe_sequences", + "release": 3350, + "file_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3350/csv/gfe_sequences.3350.csv", + "cols": [ + "gfe_name", + "acc_name", + "locus", + "hla_name", + "seq_id", + "sequence", + "length", + "imgt_release" + ], + "num_rows": 1000, + "created_utc": "2024-01-28T20:10:04.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true + } + ], + "errors": [], + "expected_artifacts": [ + "all_cds.3350.csv", + "all_features.3350.csv", + "all_groups.3350.csv", + "gfe_sequences.3350.csv" + ], + "is_valid_build": true + }, + "load_results": { + "is_load_successful": { + "value": true, + "details": { + "unique_releases_in_db_pre_load": [ + 3350 + ], + "unique_releases_in_db_post_load": [ + 3350 + ] + } + } + } + }, + "state": { + "created_utc": "2024-01-26T19:18:26.296Z", + "updated_utc": "2024-02-01T01:44:19.498Z", + "repository": { + "owner": "ANHIG", + "name": "IMGTHLA", + "url": "https://github.com/ANHIG/IMGTHLA" + }, + "commit": { + "sha": "4052371f8d68dc662d8c5b07377d9aebcfccc0b9", + "date_utc": "2019-04-17T09:10:57.000Z", + "message": "Merge pull request #174 from ANHIG/3360\n\n3360", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/4052371f8d68dc662d8c5b07377d9aebcfccc0b9" + }, + "execution": { + "status": "LOAD_SUCCESS" + } + } +} \ No newline at end of file diff --git a/gfe-db/pipeline/functions/format_results/failure-event.json b/gfe-db/pipeline/functions/format_results/failure-event.json new file mode 100644 index 00000000..af3272de --- /dev/null +++ b/gfe-db/pipeline/functions/format_results/failure-event.json @@ -0,0 +1,34 @@ +{ + "Error": "States.Timeout", + "Cause": "", + "state": { + "created_utc": "2024-01-25T17:24:09.819Z", + "updated_utc": "2024-01-26T03:45:29.853Z", + "repository": { + "owner": "ANHIG", + "name": "IMGTHLA", + "url": "https://github.com/ANHIG/IMGTHLA" + }, + "commit": { + "sha": "522c1fdd2b79fa7caa628ec48f6654342cb77045", + "date_utc": "2024-01-11T14:54:13.000Z", + "message": "Merge pull request #355 from ANHIG/3540\n\nRelease of new hla.xml format as hla_new.xml and hla_new.xsd for 3.54.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/522c1fdd2b79fa7caa628ec48f6654342cb77045" + }, + 
"execution": { + "id": "3540_522c1fdd2b79fa7caa628ec48f6654342cb77045_20240126_034427", + "invocation_id": "88561981-52bb-4cdd-a4a8-1a4863571dfa", + "version": 3540, + "status": "FAILED", + "date_utc": "2024-01-26T03:44:28.959Z", + "input_parameters": { + "align": false, + "kir": false, + "mem_profile": false, + "limit": 1000, + "use_existing_build": false, + "skip_load": false + } + } + } +} \ No newline at end of file diff --git a/gfe-db/pipeline/functions/format_results/requirements.txt b/gfe-db/pipeline/functions/format_results/requirements.txt new file mode 100644 index 00000000..e69de29b diff --git a/gfe-db/pipeline/functions/format_results/success-event.json b/gfe-db/pipeline/functions/format_results/success-event.json new file mode 100644 index 00000000..7db1b2db --- /dev/null +++ b/gfe-db/pipeline/functions/format_results/success-event.json @@ -0,0 +1,316 @@ +{ + "input": { + "id": "3550_df6ba6f80a2c5f999590f06fced6c4c4ff56b89d_20240125_004900", + "version": 3550, + "invocation_id": "f8f9f11f-0cd9-4534-aae0-1f8dfb5e3a93", + "commit_sha": "df6ba6f80a2c5f999590f06fced6c4c4ff56b89d", + "input_parameters": { + "align": false, + "kir": false, + "mem_profile": false, + "limit": 1000, + "use_existing_build": true, + "skip_load": false + }, + "s3_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3550", + "receipt_handle": "AQEBryg9b59RKrfiGDnX2ModzQ8bXRd1Br05EBWUYmZ5jWh8jVI2ZjBkTeJtkUZwrqdQsgvl2M/hQqXQBP2Zw7FEVpRHvHvE7g1NbJH78lzgAuW01Nee+3pBAkqKVP9HZWScMPZuKqaO49Ow8LEkD84mv/qoR+TpbneWnT8y2QrYOjAzWha81ciNk2eDOo55lyCPMeRGRFolZTgPAZw5UHJ3hq5Jvwto0QMgDesD5aHYXugZaaG5+prwmtWDRLZRnFpW1PsZOkuEu9c28g6NG7+7s8NwLZ9Fp2imvjEGfDz4dnn5y2ee+qMmoCjfCOFGrSUjotDaAcJQCybEvPLHbsUmvEfOubcJJBUr/MURLazWddVHuCtvnKATJX7e0uZiDocXrjopNep36tdn/0Uhike9uKVqbW7qHE/M1nlqHfZeCmSoK4jw44YUVDqnSBy2wDvl" + }, + "validations": { + "queries": { + "pre": { + "node_counts": [ + { + "node": "GFE", + "count": 0 + }, + { + "node": "IPD_Accession", + "count": 0 + }, + { + "node": "IPD_Allele", + "count": 0 + }, + { + "node": "Sequence", + "count": 0 + }, + { + "node": "Feature", + "count": 0 + }, + { + "node": "Submitter", + "count": 0 + } + ], + "has_ipd_allele_release_counts": [], + "ipd_accession_release_counts": [] + }, + "post": { + "node_counts": [ + { + "node": "GFE", + "count": 1000 + }, + { + "node": "IPD_Accession", + "count": 1000 + }, + { + "node": "IPD_Allele", + "count": 1000 + }, + { + "node": "Sequence", + "count": 1000 + }, + { + "node": "Feature", + "count": 1059 + }, + { + "node": "Submitter", + "count": 1 + } + ], + "has_ipd_allele_release_counts": [ + { + "release_version": 3550, + "count": 1000 + } + ], + "ipd_accession_release_counts": [ + { + "release_version": "3.55.0", + "count": 1000 + } + ] + } + }, + "check_existing_build": { + "Contents": [ + { + "ETag": "\"02a4756ba462e482c4db31c0fdb9170c\"", + "Key": "data/3550/csv/all_cds.3550.csv", + "LastModified": "2024-01-24T21:22:47Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 1399196, + "StorageClass": "STANDARD" + }, + { + "ETag": "\"e7e7a74b3dfc3504e61cac5c1489b4d8\"", + "Key": "data/3550/csv/all_features.3550.csv", + "LastModified": "2024-01-24T21:22:47Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 3810345, + "StorageClass": "STANDARD" + }, + { + "ETag": "\"7c2d7c40a3a06cb7cd790f7d0db7983b\"", + "Key": "data/3550/csv/all_groups.3550.csv", + 
"LastModified": "2024-01-24T21:22:47Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 293142, + "StorageClass": "STANDARD" + }, + { + "ETag": "\"ebf679c17761b3e5bc73734a68e0952d\"", + "Key": "data/3550/csv/gfe_sequences.3550.csv", + "LastModified": "2024-01-24T21:22:47Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 2582950, + "StorageClass": "STANDARD" + } + ], + "IsTruncated": false, + "Marker": "", + "MaxKeys": 1000, + "Name": "nmdpf-gfe-db-810526023897-us-east-1", + "Prefix": "data/3550/csv/" + }, + "build_outputs": { + "release_version": 3550, + "details": [ + { + "schema": "all_cds", + "release": 3550, + "file_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3550/csv/all_cds.3550.csv", + "cols": [ + "gfe_name", + "bp_seq_id", + "bp_sequence", + "aa_seq_id", + "aa_sequence" + ], + "num_rows": 1000, + "created_utc": "2024-01-24T21:22:47.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true + }, + { + "schema": "all_features", + "release": 3550, + "file_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3550/csv/all_features.3550.csv", + "cols": [ + "accession", + "hash_code", + "locus", + "rank", + "sequence", + "term", + "gfe_name", + "allele_id", + "hla_name", + "imgt_release" + ], + "num_rows": 12979, + "created_utc": "2024-01-24T21:22:47.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true + }, + { + "schema": "all_groups", + "release": 3550, + "file_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3550/csv/all_groups.3550.csv", + "cols": [ + "gfe_name", + "allele_id", + "hla_name", + "ard_id", + "ard_name", + "locus", + "imgt_release" + ], + "num_rows": 3000, + "created_utc": "2024-01-24T21:22:47.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true + }, + { + "schema": "gfe_sequences", + "release": 3550, + "file_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3550/csv/gfe_sequences.3550.csv", + "cols": [ + "gfe_name", + "acc_name", + "locus", + "hla_name", + "seq_id", + "sequence", + "length", + "imgt_release" + ], + "num_rows": 1000, + "created_utc": "2024-01-24T21:22:47.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true + } + ], + "errors": [], + "expected_artifacts": [ + "all_cds.3550.csv", + "all_features.3550.csv", + "all_groups.3550.csv", + "gfe_sequences.3550.csv" + ], + "is_valid_build": true + }, + "load_results": { + "is_load_successful": { + "value": true, + "details": { + "unique_releases_in_db_pre_load": [], + "unique_releases_in_db_post_load": [ + 3550 + ] + } + }, + "have_node_counts_increased": { + "value": true, + "details": { + "node_counts_pre_load": 0, + "node_counts_post_load": 5060 + } + }, + "has_unique_release_count_increased_by_1": { + "value": true, + "details": { + "num_unique_releases_in_db_pre_load": 0, + "num_unique_releases_in_db_post_load": 1 + } + } + } + }, + "backups": { + "pre": { + "document_name": "nmdpf-gfe-db-database-Neo4jBackupDocument-88XzqAh3CCKI", + "command_id": 
"30d2248e-67f4-4afd-93ff-42808241cdfa" + }, + "post": { + "document_name": "nmdpf-gfe-db-database-Neo4jBackupDocument-88XzqAh3CCKI", + "command_id": "c24e2a02-11a1-403c-897a-d868b206c751" + } + }, + "state": { + "created_utc": "2024-01-24T21:18:03.171Z", + "updated_utc": "2024-01-25T00:49:00.135Z", + "repository": { + "owner": "ANHIG", + "name": "IMGTHLA", + "url": "https://github.com/ANHIG/IMGTHLA" + }, + "commit": { + "sha": "df6ba6f80a2c5f999590f06fced6c4c4ff56b89d", + "date_utc": "2024-01-18T11:06:05.000Z", + "message": "Merge pull request #359 from ANHIG/3550\n\nRemoval of empty sequence block in A_prot.txt and MICA_prot.txt", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/df6ba6f80a2c5f999590f06fced6c4c4ff56b89d" + }, + "execution": { + "id": "3550_df6ba6f80a2c5f999590f06fced6c4c4ff56b89d_20240125_004900", + "invocation_id": "f8f9f11f-0cd9-4534-aae0-1f8dfb5e3a93", + "version": 3550, + "status": "LOAD_SUCCESS", + "input_parameters": { + "align": false, + "kir": false, + "mem_profile": false, + "limit": 1000, + "use_existing_build": false, + "skip_load": false + } + } + } +} \ No newline at end of file diff --git a/gfe-db/pipeline/functions/get_execution_state/__init__.py b/gfe-db/pipeline/functions/get_execution_state/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/gfe-db/pipeline/functions/get_execution_state/app.py b/gfe-db/pipeline/functions/get_execution_state/app.py new file mode 100644 index 00000000..321f1829 --- /dev/null +++ b/gfe-db/pipeline/functions/get_execution_state/app.py @@ -0,0 +1,53 @@ +"""In progress""" +import os + +if __name__ != "app": + import sys + + # for dev, local path to gfe-db modules + # ./gfe-db/pipeline/lambda_layers/gfe_db_models (use absolute path) + sys.path.append(os.environ["GFEDBMODELS_PATH"]) +import logging +import json +from gfedbmodels.types import ( + ExecutionPayloadItem, + ExecutionStateItem +) +from gfedbmodels.constants import ( + session, + pipeline +) + +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +dynamodb = session.resource("dynamodb") +table = dynamodb.Table(pipeline.params.GfeDbExecutionStateTableName) + +def lambda_handler(event, context): + logger.info(json.dumps(event)) + + # validate input + execution_payload_item = ExecutionPayloadItem(**event['input']) + + commit_state = table.get_item( + Key={ + "commit__sha": execution_payload_item.commit_sha, + "execution__version": execution_payload_item.version + } + )['Item'] + + # Validate record with pydantic model + execution_state_item = ExecutionStateItem.from_execution_state_item_json(commit_state) + + # TODO validate that table state matches the state machine input + + return execution_state_item.model_dump(exclude_none=True) + + +if __name__ == "__main__": + from pathlib import Path + + event = json.loads((Path(__file__).parent / "error.json").read_text()) + + lambda_handler(event, None) diff --git a/gfe-db/pipeline/functions/get_execution_state/error.json b/gfe-db/pipeline/functions/get_execution_state/error.json new file mode 100644 index 00000000..5381aa0f --- /dev/null +++ b/gfe-db/pipeline/functions/get_execution_state/error.json @@ -0,0 +1,66 @@ +{ + "input": { + "version": 3530, + "commit_sha": "a549c5b255da7d988a70cf7f7a2bdb5b1ae5b00a", + "input_parameters": { + "align": false, + "kir": false, + "mem_profile": false, + "limit": 1000, + "use_existing_build": true, + "skip_load": false + }, + "s3_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3530", + "receipt_handle": 
"AQEBze8gzWpZS33AFry1ZxWNhnwuN9jmk4jx/ifNTGVzUqwZJ1VbPpzGfuPjmGKmF4L2aONURoEGf4RbDiH6MaqkOpND+JfRcCw+FUyGPRlIaB8GqhMt+Sp/frQu51JhPQIDDwkNYztl3gdm3e59HoUEK6KweBthVnW0pog2Xw3rLy6qsF19zAMU0b1NyQlSzwa48K7OsllTbGEErUQBeGBIBLNYDffD9+TCgK6ihKhN5sKhn584OpVyc8/v1WWSccBLNujmRMM36k3LnEhMkuAlK/YddWhjQCxOr7v1ttKDX2qRVGJgcgxfjJYIuAgHdPhKKWK5Omq6cjdbYdIVvAtTGLL9WpRBgYB6N7tbANDme9UcYCCbZXd19ylE3xWP6NoY2JcLC2BHitp3GUl6mHyy3CnYuU5Fc3x2QPWmUv6RHuowdw778xGhGQRcE2gnPYWi" + }, + "validations": { + "queries": { + "pre": { + "node_counts": [ + { + "node": "GFE", + "count": 1000 + }, + { + "node": "IPD_Accession", + "count": 1000 + }, + { + "node": "IPD_Allele", + "count": 1000 + }, + { + "node": "Sequence", + "count": 1000 + }, + { + "node": "Feature", + "count": 1093 + }, + { + "node": "Submitter", + "count": 1 + } + ], + "has_ipd_allele_release_counts": [ + { + "release_version": "3540", + "count": 1000 + } + ], + "ipd_accession_release_counts": [ + { + "release_version": "3.54.0", + "count": 1000 + } + ] + } + } + }, + "backups": { + "pre": { + "document_name": "nmdpf-gfe-db-database-Neo4jBackupDocument-dlK48Aa4AoxM", + "command_id": "741125dd-7a54-46a7-ab95-28b6bdbc4489" + } + } +} \ No newline at end of file diff --git a/gfe-db/pipeline/functions/get_execution_state/event.json b/gfe-db/pipeline/functions/get_execution_state/event.json new file mode 100644 index 00000000..c8400211 --- /dev/null +++ b/gfe-db/pipeline/functions/get_execution_state/event.json @@ -0,0 +1,12 @@ +{ + "version": 3530, + "commit_sha": "257023f6884c7be64f80212d5b7e5b5090074675", + "input_parameters": { + "align": false, + "kir": false, + "mem_profile": false, + "limit": -1 + }, + "s3_path": "s3://dev-gfe-db-531868584498-us-east-1/data/3530", + "receipt_handle": "AQEBkRdT2cx82fj2O8radepmhsBw2i4NuF3Ip9sMBOHjAC4IUs+Hg1d0uElM/mznzyP97Zb2m60bYlk0ggt5IogKrxSnvLgT3ZbOZAXnF5Zqyy5t7vVOavXD1A3OC89KtcZO+3vCtpoQ1NgQgxGvOod6ruMf68JP+xMjLMFvikSIG42f7eVLy6C3yswenjd4KrDrCWNfSmEBk5iNf1CxfVbVdbonGKVZmmVU6AsZxJiGrMBWoSVOyQveMKcPJYuUm1wN1eWAcPe5rgXujqU2k+1fCeonK1h/wgm0HDX62Y5g8PShzt5rxGBOAphDresUr3k0JrN/0u1nw4MR1WTy3y3SH+wPtCMKsR4x70QXspB9Ic2WZ09ATXIfsC8OqpFqX4IGu2IcM2pC7IdMn5YgFuiDig95fTjBTjeiLqG/fvU6tXF/nmK2TVnzVE9NlaFhXtnJ" +} \ No newline at end of file diff --git a/gfe-db/pipeline/functions/get_execution_state/requirements.txt b/gfe-db/pipeline/functions/get_execution_state/requirements.txt new file mode 100644 index 00000000..e69de29b diff --git a/gfe-db/pipeline/functions/invoke_backup_script/app.py b/gfe-db/pipeline/functions/invoke_backup_script/app.py index 9b10741e..be91868d 100644 --- a/gfe-db/pipeline/functions/invoke_backup_script/app.py +++ b/gfe-db/pipeline/functions/invoke_backup_script/app.py @@ -67,6 +67,7 @@ def lambda_handler(event, context): logger.error(err) raise err + # TODO add field for the backup asset's S3 path return { "document_name": response['Command']['DocumentName'], "command_id": response['Command']['CommandId'] diff --git a/gfe-db/pipeline/functions/invoke_lcm/__init__.py b/gfe-db/pipeline/functions/invoke_lcm/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/gfe-db/pipeline/functions/invoke_lcm/app.py b/gfe-db/pipeline/functions/invoke_lcm/app.py new file mode 100644 index 00000000..d04eb125 --- /dev/null +++ b/gfe-db/pipeline/functions/invoke_lcm/app.py @@ -0,0 +1,77 @@ +"""This function is invoked through SNS when GfeDbLoadQueueHasMessagesAlarm is triggered from messages present in the GfeDbLoadQueue. 
+Its only responsibility is to invoke the LoadConcurrencyManager state machine which maintains a concurrency of 1 for loading Neo4j +to avoid clashes with concurrent executions in GfeDbUpdatePipeline. This allows GfeDbUpdatePipeline to run data builds at concurrency > 1 +and keeps data loads at concurrency = 1. The LoadConcurrencyManager will end the execution when GfeDbLoadQueueHasMessagesAlarm enters +the OK state, meaning there are no more messages in the queue. All requests for loading Neo4j are handled by the LoadConcurrencyManager +state machine. +""" + +import os +if __name__ != "app": + import sys + + # for dev, local path to gfe-db modules + # ./gfe-db/pipeline/lambda_layers/gfe_db_models (use absolute path) + sys.path.append(os.environ["GFEDBMODELS_PATH"]) + +import logging +from datetime import datetime +import json +from gfedbmodels.constants import ( + session, + pipeline) + +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +APP_NAME = os.environ["APP_NAME"] +STAGE = os.environ["STAGE"] +AWS_REGION = os.environ["AWS_REGION"] + +states = session.client("stepfunctions", region_name=AWS_REGION) + +update_pipeline_state_machine_arn = pipeline.params.UpdatePipelineStateMachineArn +lcm_state_machine_arn = pipeline.params.LoadConcurrencyManagerStateMachineArn + +def lambda_handler(event, context): + + logger.info(json.dumps(event)) + + alarm_message = json.loads(event["Records"][0]["Sns"]["Message"]) + + # Proceed only when the alarm state has changed to ALARM + state_has_changed = "NewStateValue" in alarm_message + is_in_alarm = alarm_message.get("NewStateValue") == "ALARM" + + if state_has_changed and is_in_alarm: + + # TODO query the state table for commits with PENDING status to get the invocation_id for the LCM's execution_id + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + execution_id = update_pipeline_state_machine_arn.split(":")[-1] + "_" + timestamp + + if not executions_in_progress(lcm_state_machine_arn): + response = states.start_execution( + stateMachineArn=lcm_state_machine_arn, + name=execution_id + ) + + return { + "statusCode": 200 + } + + +def executions_in_progress(state_machine_arn): + # List executions for the state machine + response = states.list_executions( + stateMachineArn=state_machine_arn, + statusFilter="RUNNING" + ) + + return bool(response['executions']) + + +if __name__ == "__main__": + from pathlib import Path + + event = json.loads((Path(__file__).parent / "gfedbloadqueue-sns-event.json").read_text()) + lambda_handler(event, "") diff --git a/gfe-db/pipeline/functions/invoke_lcm/gfedbloadqueue-sns-event.json b/gfe-db/pipeline/functions/invoke_lcm/gfedbloadqueue-sns-event.json new file mode 100644 index 00000000..39ed4dc7 --- /dev/null +++ b/gfe-db/pipeline/functions/invoke_lcm/gfedbloadqueue-sns-event.json @@ -0,0 +1,22 @@ +{ + "Records": [ + { + "EventSource": "aws:sns", + "EventVersion": "1.0", + "EventSubscriptionArn": "arn:aws:sns:us-east-1:810526023897:nmdpf-gfe-db-infrastructure-DataPipelineExecutionTopic-IvepeIFXVeQp:439538cf-dab9-4011-b15e-a2139f1f44df", + "Sns": { + "Type": "Notification", + "MessageId": "e88cc407-8efc-55b4-b21a-9a3264a0dd73", + "TopicArn": "arn:aws:sns:us-east-1:810526023897:nmdpf-gfe-db-infrastructure-DataPipelineExecutionTopic-IvepeIFXVeQp", + "Subject": "ALARM: \"nmdpf-gfe-db-pipeline-UpdatePipelineStateMachineExecutionAlarm-...\" in US East (N.
Virginia)", + "Message": "{\"AlarmName\":\"nmdpf-gfe-db-pipeline-UpdatePipelineStateMachineExecutionAlarm-uh4zKvGEDOic\",\"AlarmDescription\":\"arn:aws:states:us-east-1:810526023897:stateMachine:UpdatePipelineStateMachine-juju8JKTlNbt state machine execution in progress\",\"AWSAccountId\":\"810526023897\",\"AlarmConfigurationUpdatedTimestamp\":\"2024-01-28T04:06:48.361+0000\",\"NewStateValue\":\"ALARM\",\"NewStateReason\":\"Threshold Crossed: 1 datapoint [1.0 (31/01/24 18:51:00)] was greater than the threshold (0.0).\",\"StateChangeTime\":\"2024-02-01T18:51:08.387+0000\",\"Region\":\"US East (N. Virginia)\",\"AlarmArn\":\"arn:aws:cloudwatch:us-east-1:810526023897:alarm:nmdpf-gfe-db-pipeline-UpdatePipelineStateMachineExecutionAlarm-uh4zKvGEDOic\",\"OldStateValue\":\"OK\",\"OKActions\":[\"arn:aws:sns:us-east-1:810526023897:nmdpf-gfe-db-infrastructure-DataPipelineExecutionTopic-IvepeIFXVeQp\"],\"AlarmActions\":[\"arn:aws:sns:us-east-1:810526023897:nmdpf-gfe-db-infrastructure-DataPipelineExecutionTopic-IvepeIFXVeQp\"],\"InsufficientDataActions\":[],\"Trigger\":{\"Period\":86400,\"EvaluationPeriods\":1,\"ComparisonOperator\":\"GreaterThanThreshold\",\"Threshold\":0.0,\"TreatMissingData\":\"\",\"EvaluateLowSampleCountPercentile\":\"\",\"Metrics\":[{\"Id\":\"m1\",\"MetricStat\":{\"Metric\":{\"Dimensions\":[{\"value\":\"arn:aws:states:us-east-1:810526023897:stateMachine:UpdatePipelineStateMachine-juju8JKTlNbt\",\"name\":\"StateMachineArn\"}],\"MetricName\":\"ExecutionsStarted\",\"Namespace\":\"AWS/States\"},\"Period\":86400,\"Stat\":\"Sum\",\"Unit\":\"Count\"},\"ReturnData\":false},{\"Id\":\"m2\",\"MetricStat\":{\"Metric\":{\"Dimensions\":[{\"value\":\"arn:aws:states:us-east-1:810526023897:stateMachine:UpdatePipelineStateMachine-juju8JKTlNbt\",\"name\":\"StateMachineArn\"}],\"MetricName\":\"ExecutionsSucceeded\",\"Namespace\":\"AWS/States\"},\"Period\":86400,\"Stat\":\"Sum\",\"Unit\":\"Count\"},\"ReturnData\":false},{\"Id\":\"m3\",\"MetricStat\":{\"Metric\":{\"Dimensions\":[{\"value\":\"arn:aws:states:us-east-1:810526023897:stateMachine:UpdatePipelineStateMachine-juju8JKTlNbt\",\"name\":\"StateMachineArn\"}],\"MetricName\":\"ExecutionsFailed\",\"Namespace\":\"AWS/States\"},\"Period\":86400,\"Stat\":\"Sum\",\"Unit\":\"Count\"},\"ReturnData\":false},{\"Id\":\"m4\",\"MetricStat\":{\"Metric\":{\"Dimensions\":[{\"value\":\"arn:aws:states:us-east-1:810526023897:stateMachine:UpdatePipelineStateMachine-juju8JKTlNbt\",\"name\":\"StateMachineArn\"}],\"MetricName\":\"ExecutionsAborted\",\"Namespace\":\"AWS/States\"},\"Period\":86400,\"Stat\":\"Sum\",\"Unit\":\"Count\"},\"ReturnData\":false},{\"Expression\":\"m1 - m2 - m3 - m4\",\"Id\":\"e1\",\"Label\":\"ExecutionsInProgress\",\"ReturnData\":true}]}}", + "Timestamp": "2024-02-01T18:51:08.430Z", + "SignatureVersion": "1", + "Signature": "T/w1PJC6cEbCI+C7Nh7n2oq+JstGEAi3jopjXsxvLwogBdxCRVaBmC8juD8Igwi9F6Y9lAUrrAwvADk9Wp+BwX19TdKCxmTm2fzG/HfHtoA6e7XOud7kN0sB0KVCOEnHzMoebDPRPBU18CuwnkDKF8VhffNQb9bdfMJBK3wZG5V2ol/y+ZDfDBHKiNrLFawIUwhCTDzTriZiSgfatLFLK1VDkUtB+YCummdS57sYANjqRThbsr/Qe63qNKn45sMA51zwKD4ThQfuoL3n/nYfKVflKuiq1sty9LsHbVSfkvtqfROmVRDFygv7O1VpttwDcdEzCXNR2mR7dc7xBVL5AA==", + "SigningCertUrl": "https://sns.us-east-1.amazonaws.com/SimpleNotificationService-60eadc530605d63b8e62a523676ef735.pem", + "UnsubscribeUrl": "https://sns.us-east-1.amazonaws.com/?Action=Unsubscribe&SubscriptionArn=arn:aws:sns:us-east-1:810526023897:nmdpf-gfe-db-infrastructure-DataPipelineExecutionTopic-IvepeIFXVeQp:439538cf-dab9-4011-b15e-a2139f1f44df", + "MessageAttributes": {} + } + 
} + ] +} \ No newline at end of file diff --git a/gfe-db/pipeline/functions/invoke_lcm/requirements.txt b/gfe-db/pipeline/functions/invoke_lcm/requirements.txt new file mode 100644 index 00000000..e69de29b diff --git a/gfe-db/pipeline/functions/invoke_lcm/updatepipelineexecutions-sns-event.json b/gfe-db/pipeline/functions/invoke_lcm/updatepipelineexecutions-sns-event.json new file mode 100644 index 00000000..069b2bdd --- /dev/null +++ b/gfe-db/pipeline/functions/invoke_lcm/updatepipelineexecutions-sns-event.json @@ -0,0 +1,22 @@ +{ + "Records": [ + { + "EventSource": "aws:sns", + "EventVersion": "1.0", + "EventSubscriptionArn": "arn:aws:sns:us-east-1:810526023897:nmdpf-gfe-db-database-GfeDbLoadQueueHasMessagesAlarmTopic-aUbhIif9d3xp:38bef3de-863d-4bcb-9fa7-98c3a14009e0", + "Sns": { + "Type": "Notification", + "MessageId": "09bdffd0-3b3f-5a17-9d3f-a1ff59b61b09", + "TopicArn": "arn:aws:sns:us-east-1:810526023897:nmdpf-gfe-db-database-GfeDbLoadQueueHasMessagesAlarmTopic-aUbhIif9d3xp", + "Subject": "ALARM: \"GfeDbLoadQueueHasMessages\" in US East (N. Virginia)", + "Message": "{\"AlarmName\":\"GfeDbLoadQueueHasMessages\",\"AlarmDescription\":\"Alarm when the GfeDbLoadQueue has messages\",\"AWSAccountId\":\"810526023897\",\"AlarmConfigurationUpdatedTimestamp\":\"2024-01-27T01:19:09.671+0000\",\"NewStateValue\":\"ALARM\",\"NewStateReason\":\"Threshold Crossed: 1 datapoint [1.0 (27/01/24 19:57:00)] was greater than the threshold (0.0).\",\"StateChangeTime\":\"2024-01-27T19:58:01.417+0000\",\"Region\":\"US East (N. Virginia)\",\"AlarmArn\":\"arn:aws:cloudwatch:us-east-1:810526023897:alarm:GfeDbLoadQueueHasMessages\",\"OldStateValue\":\"INSUFFICIENT_DATA\",\"OKActions\":[],\"AlarmActions\":[\"arn:aws:sns:us-east-1:810526023897:nmdpf-gfe-db-database-GfeDbLoadQueueHasMessagesAlarmTopic-aUbhIif9d3xp\"],\"InsufficientDataActions\":[],\"Trigger\":{\"Period\":30,\"EvaluationPeriods\":1,\"ComparisonOperator\":\"GreaterThanThreshold\",\"Threshold\":0.0,\"TreatMissingData\":\"\",\"EvaluateLowSampleCountPercentile\":\"\",\"Metrics\":[{\"Id\":\"visible\",\"MetricStat\":{\"Metric\":{\"Dimensions\":[{\"value\":\"nmdpf-gfe-db-database-GfeDbLoadQueue-RLiPEqMuza6m.fifo\",\"name\":\"QueueName\"}],\"MetricName\":\"ApproximateNumberOfMessagesVisible\",\"Namespace\":\"AWS/SQS\"},\"Period\":30,\"Stat\":\"Average\",\"Unit\":\"Count\"},\"ReturnData\":false},{\"Id\":\"notVisible\",\"MetricStat\":{\"Metric\":{\"Dimensions\":[{\"value\":\"nmdpf-gfe-db-database-GfeDbLoadQueue-RLiPEqMuza6m.fifo\",\"name\":\"QueueName\"}],\"MetricName\":\"ApproximateNumberOfMessagesNotVisible\",\"Namespace\":\"AWS/SQS\"},\"Period\":30,\"Stat\":\"Average\",\"Unit\":\"Count\"},\"ReturnData\":false},{\"Expression\":\"visible + notVisible\",\"Id\":\"e1\",\"Label\":\"QueueHasMessages\",\"ReturnData\":true}]}}", + "Timestamp": "2024-01-27T19:58:01.469Z", + "SignatureVersion": "1", + "Signature": "mb3aeSln+mdH4iv+g3CG0d5gRocC3ICjulZXNmZs3J90Sf/E21OvH8/B3wS54wWuH/pVbjxa//Jo/AUXaKGS2rTK8ZgFy+tO579mooYFhmOurKU3wkFHyCRtLLjwvtSvBlf50cYYqSvDN8NwwwJuAH8tXIC4+GSvdAr6BGQxL2S2hfpcEGNtLR02aiy2npWpPjmTVKAAwpwM/jpIWkUdQcG9AMGgTCtHg7iciWlgXhou1821WmT9Ti/HviR8D76im4cQ7Seaj4MoM78J8cI2rkFbNmfdvTpn3pvd64n1katKyBuOH3WWCbIpaSGwWICT92Ey/715UD89NHm+/uLkPQ==", + "SigningCertUrl": "https://sns.us-east-1.amazonaws.com/SimpleNotificationService-60eadc530605d63b8e62a523676ef735.pem", + "UnsubscribeUrl": 
"https://sns.us-east-1.amazonaws.com/?Action=Unsubscribe&SubscriptionArn=arn:aws:sns:us-east-1:810526023897:nmdpf-gfe-db-database-GfeDbLoadQueueHasMessagesAlarmTopic-aUbhIif9d3xp:38bef3de-863d-4bcb-9fa7-98c3a14009e0", + "MessageAttributes": {} + } + } + ] +} \ No newline at end of file diff --git a/gfe-db/pipeline/functions/invoke_load_script/__init__.py b/gfe-db/pipeline/functions/invoke_load_script/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/gfe-db/pipeline/functions/invoke_load_script/app.py b/gfe-db/pipeline/functions/invoke_load_script/app.py index 832a45d0..a4b0f84e 100644 --- a/gfe-db/pipeline/functions/invoke_load_script/app.py +++ b/gfe-db/pipeline/functions/invoke_load_script/app.py @@ -1,30 +1,35 @@ import os +if __name__ != "app": + import sys + + # for dev, local path to gfe-db modules + # ./gfe-db/pipeline/lambda_layers/gfe_db_models (use absolute path) + sys.path.append(os.environ["GFEDBMODELS_PATH"]) + import logging import json -import boto3 +from gfedbmodels.constants import ( + session, + pipeline, + database +) logger = logging.getLogger() logger.setLevel(logging.INFO) -neo4j_load_query_document_name_param = os.environ["NEO4J_LOAD_QUERY_DOCUMENT_NAME_SSM_PARAM"] -neo4j_database_instance_id_param = os.environ["NEO4J_DATABASE_INSTANCE_ID_SSM_PARAM"] -load_neo4j_activity = os.environ["LOAD_NEO4J_ACTIVITY"] -app_name = os.environ["APP_NAME"] +neo4j_load_query_document_name = pipeline.params.Neo4jLoadQueryDocumentName +neo4j_database_instance_id = database.params.Neo4jDatabaseInstanceId # Get SSM Document Neo4jLoadQuery -ssm = boto3.client('ssm', region_name=os.environ["AWS_REGION"]) -neo4j_load_query_document_name = ssm.get_parameter(Name=neo4j_load_query_document_name_param)["Parameter"]["Value"] +ssm = session.clients["ssm"] response = ssm.get_document(Name=neo4j_load_query_document_name) neo4j_load_query_document_content = json.loads(response["Content"]) -# Get Instance ID -neo4j_database_instance_id = ssm.get_parameter(Name=neo4j_database_instance_id_param)["Parameter"]["Value"] - # Extract document parameters neo4j_load_query_document_parameters = neo4j_load_query_document_content["parameters"] -command_line_default = neo4j_load_query_document_parameters["commandLine"]["default"] source_info_default = neo4j_load_query_document_parameters["sourceInfo"]["default"] + def lambda_handler(event, context): """Invoke SSM Run Command for server side loading on Neo4j @@ -34,7 +39,7 @@ def lambda_handler(event, context): --document-name "dev-gfe-db-database-Neo4jLoadQueryDocument-UgYcOg48yiQB" \ --document-version "1" \ --targets '[{"Key":"InstanceIds","Values":["i-0f8ec07e314226283"]}]' \ - --parameters '{"executionTimeout":["3600"],"sourceInfo":["{\"path\":\"https://.s3.amazonaws.com/config/scripts/load_db.sh\"}"],"sourceType":["S3"],"workingDirectory":["/home/ec2-user"],"commandLine":["bash load_db.sh"]}' \ + --parameters '{"executionTimeout":["3600"],"sourceInfo":["{\"path\":\"https://.s3.amazonaws.com/config/database/scripts/load_db.sh\"}"],"sourceType":["S3"],"workingDirectory":["/home/ec2-user"],"LoadEvent":["{\"key\":\"value\"}"]}' \ --timeout-seconds 600 \ --max-concurrency "50" \ --max-errors "0" \ @@ -45,16 +50,8 @@ def lambda_handler(event, context): logger.info(json.dumps(event)) - # Update params for this execution - params = { - "params": { - "app_name": app_name, - "activity_arn": load_neo4j_activity, - } - } - - # Include params JSON as command line argument - cmd = f"{command_line_default} \'{json.dumps(params)}\'" + # TODO BOOKMARK 
5/31/23: Check if Neo4jLoadQueryDocument is already running, if it is exit 0 (makes service idempotent) + # Note: Neo4jLoadQueryDocument only needs to be triggered once and it will fetch the next release until there are no more left try: response = ssm.send_command( @@ -63,38 +60,46 @@ def lambda_handler(event, context): ], DocumentName=neo4j_load_query_document_name, Parameters={ - "commandLine":[cmd], - "sourceInfo":[json.dumps(source_info_default)] + "sourceType": ["S3"], + "sourceInfo": [json.dumps(source_info_default)], + "workingDirectory": ["/home/ec2-user"], + "executionTimeout": ["28800"], + "LoadEvent": [json.dumps(event)], }, - MaxConcurrency='1', - CloudWatchOutputConfig={ - 'CloudWatchOutputEnabled': True - }) + MaxConcurrency="1", + CloudWatchOutputConfig={"CloudWatchOutputEnabled": True}, + ) - if response['ResponseMetadata']['HTTPStatusCode'] != 200: + if response["ResponseMetadata"]["HTTPStatusCode"] != 200: logger.error(json.dumps(response, cls=DatetimeEncoder)) - message = f"Failed to send command `{cmd}` to instance {neo4j_database_instance_id}" + message = f"Failed to send command to instance {neo4j_database_instance_id}" raise Exception("Failed to send command") else: - message = f"Command `{cmd}` invoked on instance {neo4j_database_instance_id}" + message = f"Command invoked on instance {neo4j_database_instance_id}" logger.info(message) + return { + "message": message, + "sqs": event["sqs"], + "ssm": { + "CommandId": response["Command"]["CommandId"], + "InstanceId": neo4j_database_instance_id, + } + } + except Exception as err: logger.error(err) raise err - return { - "message": message - } -# Needed to serialize datetime objects in JSON responses +# Serializes datetime objects in JSON responses class DatetimeEncoder(json.JSONEncoder): """ Helps convert datetime objects to pure strings in AWS service API responses. Does not convert timezone information. - Extend `json.JSONEncoder`. + Extend `json.JSONEncoder`. 
""" def default(self, obj): @@ -107,9 +112,9 @@ def default(self, obj): if __name__ == "__main__": from pathlib import Path - event_path = Path(__file__).parent / "error-event.json" + event_path = Path(__file__).parent / "event.json" with open(event_path, "r") as file: event = json.load(file) - lambda_handler(event,"") + lambda_handler(event, "") diff --git a/gfe-db/pipeline/functions/invoke_load_script/event.json b/gfe-db/pipeline/functions/invoke_load_script/event.json index 7e6c83a5..b4d7733d 100644 --- a/gfe-db/pipeline/functions/invoke_load_script/event.json +++ b/gfe-db/pipeline/functions/invoke_load_script/event.json @@ -1,7 +1,280 @@ { - "ALIGN": "False", - "KIR": "False", - "MEM_PROFILE": "False", - "LIMIT": "100", - "RELEASES": "3470" -} \ No newline at end of file + "backups": { + "pre": { + "document_name": "nmdpp-gfe-db-database-Neo4jBackupDocument-9X7UOkmPUPW1", + "command_id": "127870fc-cb84-4cd3-aa36-747d5cc5fbba" + } + }, + "sqs": { + "MessageId": "a878b74b-c6bb-4057-9b1e-05292f3cf10d", + "ReceiptHandle": "AQEBUJ0zJFmtPVSNU9/AqjROuAbeqeL6Mbg4iU3SUu/VmFbrWO7GpYbl9bKfPxzVrtW8PDlC/Dt0DQSfzg4T6kHhsFtRD7rj5t6TY3/5Ugb9AnCIgSBgm0F7XPp1Hw/GiCoOEdKlQ/zCfbJ1nR1lZgfz1q0GNvDZnYAY8haeQw12cxqbOvn0n+giqpSXwqZAnV5sr3r09c81la+vxsfGTXpY0+98/osgU330FtZfTHNQJG/cBr5hC2c1fb4j949xNpcxyquS7omSQCH1ptHBfK/cV7PurDHCCRBWhfENykCsFTgnJ+DkLQOBHrqZONJt05u77QRWsjEHfV5RFUW0SnyqWw==", + "MD5OfBody": "d608814dd540987afc6e3cd58c6c5af2", + "Body": { + "input": { + "id": "3560_10fd73e97ca3ba28e84b58f6aae586440ec8bc6f_20240915_050557", + "version": 3560, + "invocation_id": "0f122c24-b4e1-4f3e-82a4-6a5fb0a595d3", + "commit_sha": "10fd73e97ca3ba28e84b58f6aae586440ec8bc6f", + "input_parameters": { + "align": false, + "kir": false, + "mem_profile": false, + "limit": -1, + "use_existing_build": true, + "skip_load": false + }, + "s3_path": "s3://nmdpp-gfe-db-810526023897-us-east-1/data/3560", + "receipt_handle": "AQEBcDIVQq6xPO03mTzYaVmnkY7Qg3njkQD1S0zOYm8ukapfuMjFNeJtdWuMKu/igSHRvEx+GLDi1VrkD8Gx+H1Sr8pJMG9sX3PZ4cEnQ5VEIy8gpt+m8QdsEYc7iHX9NURkOD8chXt21iwVGFJ45nPwC/ALNcmmpy5CxuoJzQrRYG3UjhLahfT4GWA8OQFKv5+W6VYgqathtSQJUhc7D8mQY8lU1j+i/ll2Ll4bhbu1Th5pIKyiM//y0WYbTWK9+uP5oJID7cQhsM+OhoCvadx7c+Zdmt9n0z+41ZHusfROMxwi+ezcVUDF8DFLIu4/V/rLGX0ftb5eZ9mSWVkYuGbiwg==" + }, + "validations": { + "queries": { + "pre": { + "node_counts": [ + { + "node": "GFE", + "count": 38640 + }, + { + "node": "IPD_Accession", + "count": 38640 + }, + { + "node": "IPD_Allele", + "count": 38360 + }, + { + "node": "Sequence", + "count": 38640 + }, + { + "node": "Feature", + "count": 39174 + }, + { + "node": "Submitter", + "count": 1 + } + ], + "has_ipd_allele_release_counts": [ + { + "release_version": 3550, + "count": 36762 + }, + { + "release_version": 3560, + "count": 37720 + }, + { + "release_version": 3570, + "count": 38316 + } + ], + "ipd_accession_release_counts": [ + { + "release_version": "3.55.0", + "count": 36762 + }, + { + "release_version": "3.56.0", + "count": 1104 + }, + { + "release_version": "3.57.0", + "count": 774 + } + ] + } + }, + "check_existing_build": { + "Contents": [ + { + "ETag": "\"898a10e4982984927c7292570861a952-6\"", + "Key": "data/3560/csv/all_cds.3560.csv", + "LastModified": "2024-09-13T23:20:52Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 45638720, + "StorageClass": "STANDARD" + }, + { + "ETag": "\"93e0460fe3ae1ca026d8c140551e5583-17\"", + "Key": "data/3560/csv/all_features.3560.csv", + "LastModified": 
"2024-09-13T23:20:52Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 135303168, + "StorageClass": "STANDARD" + }, + { + "ETag": "\"bda5142e1e5634541e1d13993382fa9f-2\"", + "Key": "data/3560/csv/all_groups.3560.csv", + "LastModified": "2024-09-13T23:20:52Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 11185560, + "StorageClass": "STANDARD" + }, + { + "ETag": "\"227172e93a81800a1fdce912abeb5038-13\"", + "Key": "data/3560/csv/gfe_sequences.3560.csv", + "LastModified": "2024-09-13T23:20:52Z", + "Owner": { + "DisplayName": "gclindsey+awstesting", + "Id": "c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611" + }, + "Size": 101976098, + "StorageClass": "STANDARD" + } + ], + "IsTruncated": false, + "Marker": "", + "MaxKeys": 1000, + "Name": "nmdpp-gfe-db-810526023897-us-east-1", + "Prefix": "data/3560/csv/" + }, + "build_outputs": { + "release_version": 3560, + "details": [ + { + "schema": "all_cds", + "release": 3560, + "file_path": "s3://nmdpp-gfe-db-810526023897-us-east-1/data/3560/csv/all_cds.3560.csv", + "cols": [ + "gfe_name", + "bp_seq_id", + "bp_sequence", + "aa_seq_id", + "aa_sequence" + ], + "num_rows": 37713, + "created_utc": "2024-09-13T23:20:52.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true + }, + { + "schema": "all_features", + "release": 3560, + "file_path": "s3://nmdpp-gfe-db-810526023897-us-east-1/data/3560/csv/all_features.3560.csv", + "cols": [ + "accession", + "hash_code", + "locus", + "rank", + "sequence", + "term", + "gfe_name", + "allele_id", + "hla_name", + "imgt_release" + ], + "num_rows": 349965, + "created_utc": "2024-09-13T23:20:52.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true + }, + { + "schema": "all_groups", + "release": 3560, + "file_path": "s3://nmdpp-gfe-db-810526023897-us-east-1/data/3560/csv/all_groups.3560.csv", + "cols": [ + "gfe_name", + "allele_id", + "hla_name", + "ard_id", + "ard_name", + "locus", + "imgt_release" + ], + "num_rows": 113160, + "created_utc": "2024-09-13T23:20:52.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true + }, + { + "schema": "gfe_sequences", + "release": 3560, + "file_path": "s3://nmdpp-gfe-db-810526023897-us-east-1/data/3560/csv/gfe_sequences.3560.csv", + "cols": [ + "gfe_name", + "acc_name", + "locus", + "hla_name", + "seq_id", + "sequence", + "length", + "imgt_release" + ], + "num_rows": 37720, + "created_utc": "2024-09-13T23:20:52.000Z", + "details": { + "is_valid_csv_filename": true, + "is_valid_csv_header": true, + "is_valid_csv_rows": true + }, + "num_errors": 0, + "is_valid_csv": true + } + ], + "errors": [], + "expected_artifacts": [ + "all_cds.3560.csv", + "all_features.3560.csv", + "all_groups.3560.csv", + "gfe_sequences.3560.csv" + ], + "is_valid_build": true + } + }, + "state": { + "created_utc": "2024-09-15T05:05:57.960Z", + "updated_utc": "2024-09-15T05:05:57.806Z", + "repository": { + "owner": "ANHIG", + "name": "IMGTHLA", + "url": "https://github.com/ANHIG/IMGTHLA" + }, + "commit": { + "sha": "10fd73e97ca3ba28e84b58f6aae586440ec8bc6f", + "date_utc": 
"2024-06-14T08:52:52.000Z", + "message": "Update README.md", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/10fd73e97ca3ba28e84b58f6aae586440ec8bc6f" + }, + "execution": { + "id": "3560_10fd73e97ca3ba28e84b58f6aae586440ec8bc6f_20240915_050557", + "invocation_id": "0f122c24-b4e1-4f3e-82a4-6a5fb0a595d3", + "version": 3560, + "status": "LOAD_IN_PROGRESS", + "input_parameters": { + "align": false, + "kir": false, + "mem_profile": false, + "limit": -1, + "use_existing_build": false, + "skip_load": false + } + } + } + } + } + } \ No newline at end of file diff --git a/gfe-db/pipeline/functions/invoke_pipeline/app.py b/gfe-db/pipeline/functions/invoke_pipeline/app.py deleted file mode 100644 index 1b7d1379..00000000 --- a/gfe-db/pipeline/functions/invoke_pipeline/app.py +++ /dev/null @@ -1,350 +0,0 @@ -import os -import logging -import datetime -import copy -import json -import re -import requests -import numpy as np -import boto3 - -logger = logging.getLogger() -logger.setLevel(logging.INFO) - -AWS_REGION = os.environ["AWS_REGION"] -GITHUB_PERSONAL_ACCESS_TOKEN = os.environ["GITHUB_PERSONAL_ACCESS_TOKEN"] -GITHUB_REPOSITORY_OWNER = os.environ["GITHUB_REPOSITORY_OWNER"] -GITHUB_REPOSITORY_NAME = os.environ["GITHUB_REPOSITORY_NAME"] -DATA_BUCKET_NAME = os.environ["DATA_BUCKET_NAME"] - -# TODO: add to Makefile -PIPELINE_STATE_PATH = os.environ["PIPELINE_STATE_PATH"] -PIPELINE_PARAMS_PATH = os.environ["PIPELINE_PARAMS_PATH"] -UPDATE_PIPELINE_STATE_MACHINE_ARN = os.environ["UPDATE_PIPELINE_STATE_MACHINE_ARN"] - -branches_state_path = f"s3://{DATA_BUCKET_NAME}/{PIPELINE_STATE_PATH}" -pipeline_params_path = f"s3://{DATA_BUCKET_NAME}/{PIPELINE_PARAMS_PATH}" - -session = boto3.session.Session(region_name=AWS_REGION) -s3 = session.client("s3") -sfn = session.client("stepfunctions") - -release_pattern = r"^\d{2,3}0$" - - -def lambda_handler(event, context): - """Checks for new IMGT/HLA releases and triggers the update - pipeline if any are found""" - - logger.info(json.dumps(event)) - - try: - if "releases" in event: - # align, kir, mem_profile are booleans - execution_input_bool_keys = [ - "align", - "kir", - "mem_profile", - "use_existing_build", - "skip_load", - ] - if not all( - [ - isinstance(event[arg], bool) - for arg in execution_input_bool_keys - if arg in event - ] - ): - raise ValueError( - f'{", ".join(execution_input_bool_keys)} must be boolean values' - ) - - # conform booleans to the current argument format - event = {arg: str(val) for arg, val in event.items()} - - # limit is an integer - try: - # empty limit implies no limit - if event["limit"]: - if not isinstance(int(event["limit"]), int): - raise ValueError("limit must be an integer") - except ValueError: - raise ValueError("limit must be an integer") - - # release is a string that matches regex - if not all( - [ - is_valid_release(release, release_pattern) - for release in event["releases"].split(",") - ] - ): - raise ValueError( - f"releases must contains strings that match {release_pattern}" - ) - - logging.info(f"Reading parameters from event") - new_releases, params = parse_event(event) - - else: - # Load the previous repository state - logging.info(f"Reading parameters from file") - new_releases, params = parse_state( - branches_state_path, pipeline_params_path - ) - - if new_releases: - execution_input = [] - - for release in new_releases: - params_input = copy.deepcopy(params) - params_input["releases"] = release - params_input = {k.upper(): v for k, v in params_input.items()} - logger.info( - f"Running pipeline with 
these parameters:\n{json.dumps(params_input)}" - ) - execution_input.append(params_input) - - payload = {"input": execution_input} - - # TODO: include release number in execution identifier - response = sfn.start_execution( - stateMachineArn=UPDATE_PIPELINE_STATE_MACHINE_ARN, - input=json.dumps(payload), - ) - - # Update the config file - write_config(branches_state_path) - - return { - # TODO: add timestamp - "status": response["ResponseMetadata"]["HTTPStatusCode"], - "message": "Pipeline triggered", - "payload": payload, - } - - else: - # Update the config file - write_config(branches_state_path) - - return {"status": 200, "message": "No new releases found"} - except Exception as e: - logger.error(e) - return {"status": 500, "message": str(e)} - - -# Needed to serialize datetime objects in JSON responses -class DatetimeEncoder(json.JSONEncoder): - """ - Helps convert datetime objects to pure strings in AWS service API responses. Does not - convert timezone information. - - Extend `json.JSONEncoder`. - """ - - def default(self, obj): - try: - return super().default(obj) - except TypeError: - return str(obj) - - -def get_branches(owner, repo): - """Return a list of GitHub branches for the specified repository""" - - base_url = "https://api.github.com" - - # Endpoint - endpoint = f"/repos/{owner}/{repo}/branches?per_page=100" - - url = base_url + endpoint - - # Headers - headers = { - "Authorization": f"token {GITHUB_PERSONAL_ACCESS_TOKEN}", - "Content-Type": "application/json", - "Accept": "application/vnd.github.v3+json", - } - - response = requests.get(url) - branches = json.loads(response.content) - - return [branch["name"] for branch in branches] - - -def is_valid_release(branch, release_pattern=r"^\d{2,3}0$"): - """Returns True if the branch is a valid release, False if not""" - - # IMGT/HLA release format string - # Checks for a pattern corresponding to 3 digits followed by one zero, ie., 3460 - p = re.compile(release_pattern) - match = p.match(branch) - - if match: - return True - else: - return False - - -def get_releases(owner, repo): - """Filters repository branches for those that match the IMGT/HLA release format string""" - return list(filter(is_valid_release, get_branches(owner, repo))) - - -def write_config(path): - """Writes config file containing the current state of branches in - a GitHub repo""" - - branches_config = { - "timestamp": str(datetime.datetime.utcnow())[:-7], - "repository_url": f"https://github.com/{GITHUB_REPOSITORY_OWNER}/{GITHUB_REPOSITORY_NAME}", - "releases": get_releases(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME), - } - - try: - response = s3.put_object( - Body=json.dumps(branches_config), - Bucket=DATA_BUCKET_NAME, - Key="/".join(path.split("/")[3:]), - ) - - if response["ResponseMetadata"]["HTTPStatusCode"] == 200: - logger.info(f"Wrote config file to {path}") - return - else: - logger.error( - f'Failed to write config file to {path}. HTTPStatusCode: {response["ResponseMetadata"]["HTTPStatusCode"]}' - ) - return - - except Exception as err: - raise err - - -def read_config(path): - """Reads config file containing the current state of branches in - a GitHub repo""" - - try: - response = s3.get_object( - Bucket=DATA_BUCKET_NAME, Key="/".join(path.split("/")[3:]) - ) - - if response["ResponseMetadata"]["HTTPStatusCode"] == 200: - logger.info(f"Read config file from {path}") - return json.loads(response["Body"].read().decode()) - else: - logger.error( - f'Failed to read config file to {path}. 
HTTPStatusCode: {response["ResponseMetadata"]["HTTPStatusCode"]}' - ) - return - - except Exception as err: - raise err - - -def check_new_releases(previous_state, current_state): - """Compares the previous repository state as a list of branches, with the current - repository state. Returns new branches if they are a valid IMGT/HLA release, None - if not.""" - - # Check if any branches have been added - new_branches_count = len(current_state) - len(previous_state) - branches_added = new_branches_count > 0 - - if branches_added: - logger.info( - f"Found {new_branches_count} new branches: {json.dumps(current_state[-new_branches_count:])}" - ) - - # Get the new branches - new_releases = sorted( - [ - int(release) - for release in list(set(current_state).difference(previous_state)) - ] - ) - previous_state_last_release = [int(previous_state[-1])] - - # Check that the last release of the previous state and the newly added releases all differ by 10 - elementwise_difference = list( - set( - np.diff( - [release for release in previous_state_last_release + new_releases] - ) - ) - ) - new_branches_are_valid_releases = ( - len(elementwise_difference) == 1 and elementwise_difference[0] == 10 - ) - - if new_branches_are_valid_releases: - return [str(release) for release in new_releases] - else: - logger.info("No new branches detected") - - return - - -def check_current_executions(state_machine_arn): - response = sfn.list_executions( - stateMachineArn=state_machine_arn, statusFilter="RUNNING" - ) - - # Extract executions - executions_arns = [ - execution["executionArn"] for execution in response["executions"] - ] - - releases_processing = [] - - for executions_arn in executions_arns: - response = sfn.describe_execution(executionArn=executions_arn) - - releases_processing = releases_processing + [ - params["releases"] for params in json.loads(response["input"]) - ] - - return releases_processing - - -def parse_event(event): - """Restructures the event and returns pipeline execution parameters""" - - new_releases = str(event["releases"]).replace(" ", "").split(",") - params = {k: v for k, v in event.items() if k != "releases"} - - return new_releases, params - - -def parse_state(state_path, params_path): - """""" - previous_state = read_config(state_path)["releases"] - - # Get the current repository state - current_state = get_releases(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME) - - # Log repository states - logger.info(f"Previous repository state:\n{json.dumps(previous_state)}") - logger.info(f"Current repository state:\n{json.dumps(current_state)}") - - # Compare the previous state with the current state - new_releases = check_new_releases(previous_state, current_state) - - # TODO: Describe current executions and make sure the release is not already being built - # releases_processing = set(check_current_executions(UPDATE_PIPELINE_STATE_MACHINE_ARN)) - - # Load the current pipeline params - params = read_config(params_path) - - return new_releases, params - - -if __name__ == "__main__": - import os - from pathlib import Path - - path = Path(__file__).parent / "event-use-existing-true.json" - with open(path, "r") as f: - event = json.load(f) - - lambda_handler(event, "") diff --git a/gfe-db/pipeline/functions/invoke_pipeline/event-use-existing-false-no-key.json b/gfe-db/pipeline/functions/invoke_pipeline/event-use-existing-false-no-key.json deleted file mode 100644 index 9021f623..00000000 --- a/gfe-db/pipeline/functions/invoke_pipeline/event-use-existing-false-no-key.json +++ /dev/null @@ -1,8 +0,0 
@@ -{ - "align": false, - "kir": false, - "limit": "", - "releases": "310", - "mem_profile": false, - "skip_load": true -} \ No newline at end of file diff --git a/gfe-db/pipeline/functions/invoke_pipeline/event-use-existing-false.json b/gfe-db/pipeline/functions/invoke_pipeline/event-use-existing-false.json deleted file mode 100644 index 1d390e2f..00000000 --- a/gfe-db/pipeline/functions/invoke_pipeline/event-use-existing-false.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "align": false, - "kir": false, - "limit": "", - "releases": "310", - "mem_profile": false, - "use_existing_build": false -} \ No newline at end of file diff --git a/gfe-db/pipeline/functions/invoke_pipeline/requirements.txt b/gfe-db/pipeline/functions/invoke_pipeline/requirements.txt deleted file mode 100644 index 3cb777ae..00000000 --- a/gfe-db/pipeline/functions/invoke_pipeline/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -urllib3~=1.26.18 -requests~=2.31.0 -numpy~=1.26.4 \ No newline at end of file diff --git a/gfe-db/pipeline/functions/invoke_update_pipeline/__init__.py b/gfe-db/pipeline/functions/invoke_update_pipeline/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/gfe-db/pipeline/functions/invoke_update_pipeline/app.py b/gfe-db/pipeline/functions/invoke_update_pipeline/app.py new file mode 100644 index 00000000..cd66e6dd --- /dev/null +++ b/gfe-db/pipeline/functions/invoke_update_pipeline/app.py @@ -0,0 +1,129 @@ +import os +if __name__ != "app": + import sys + + # for dev, local path to gfe-db modules + # ./gfe-db/pipeline/lambda_layers/gfe_db_models (use absolute path) + sys.path.append(os.environ["GFEDBMODELS_PATH"]) + +import logging +from datetime import datetime +import json +from gfedbmodels.constants import ( + session, + pipeline, + database +) + +# set up logging +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +# Environment +AWS_REGION = os.environ["AWS_REGION"] + +# Boto3 Clients +ec2 = session.client("ec2", region_name=AWS_REGION) +states = session.client("stepfunctions", region_name=AWS_REGION) +sqs = session.client("sqs", region_name=AWS_REGION) + +# Get SSM Parameters +neo4j_database_instance_id = database.params.Neo4jDatabaseInstanceId +update_pipeline_state_machine_arn = pipeline.params.UpdatePipelineStateMachineArn +gfe_db_processing_queue_url = pipeline.params.GfeDbProcessingQueueUrl + +# Check that database is running, abort if not +# TODO State table should be updated to FAILED if database is not running, however this block needs to run inside the handler +response = ec2.describe_instance_status(InstanceIds=[neo4j_database_instance_id]) +if len(response["InstanceStatuses"]) > 0: + if response["InstanceStatuses"][0]["InstanceState"]["Name"] != "running": + raise Exception( + f"Instance {neo4j_database_instance_id} is not running, aborting..." 
+ ) + else: + logger.info(f"Instance {neo4j_database_instance_id} is running") +else: + raise Exception(f"Instance {neo4j_database_instance_id} not found, aborting...") + + +def lambda_handler(event, context): + errors = 0 + execution_arns = [] + for record in event["Records"]: + try: + message = json.loads(record["body"]) + + # Include receipt handle in message to return to queue if step function fails + message["receipt_handle"] = record["receiptHandle"] + + logger.info( + f"Received message for version {message['version']} and commit {message['commit_sha']}" + ) + + payload = { + "input": message + } + response = states.start_execution( + stateMachineArn=update_pipeline_state_machine_arn, + name=payload["input"]["id"], # {version}_{commit_sha}_{YYYYMMDD_HHMMSS} + input=json.dumps(payload), + ) + + execution_arns.append(response["executionArn"]) + + # try: + # response = sqs.delete_message( + # QueueUrl=gfe_db_processing_queue_url, + # ReceiptHandle=record["receiptHandle"], + # ) + # logger.info(f"Message deleted from queue") + # except Exception as e: + # logger.error(f"Error deleting message from queue: {e}") + + except Exception as e: + import traceback + message = f'Error processing commit {message["commit_sha"]}: {e}\n{traceback.format_exc()}' + logger.error(message) + errors += 1 + continue + + return_msg = f'{len(event["Records"])-errors} of {len(event["Records"])} messages processed successfully, {errors} error(s)' + if errors > 0: + message = json.dumps({"message": return_msg, "execution_arns": execution_arns}) + logger.error(message) + logger.error(json.dumps(event)) + raise Exception(return_msg) + else: + message = json.dumps({"message": return_msg, "execution_arns": execution_arns}) + return { + "statusCode": 200, + "body": json.dumps({"message": return_msg, "execution_arns": execution_arns}), + } + +# # TODO move to CheckSourceUpdate +# def generate_execution_id(message: dict) -> str: +# """Generate an execution id for the state machine execution with format: +# {version}_{commit_sha}_{YYYYMMDD_HHMMSS} + +# Args: +# message (dict): Message from SQS queue + +# Returns: +# str: Execution id +# """ +# return "_".join( +# [ +# str(message["version"]), +# message["commit_sha"], +# datetime.utcnow().strftime("%y%m%d_%H%M%S"), +# ] +# ) + + +if __name__ == "__main__": + from pathlib import Path + + with open(Path(__file__).parent / "sqs-event.json", "r") as f: + event = json.load(f) + + lambda_handler(event, "") diff --git a/gfe-db/pipeline/functions/invoke_update_pipeline/requirements.txt b/gfe-db/pipeline/functions/invoke_update_pipeline/requirements.txt new file mode 100644 index 00000000..e69de29b diff --git a/gfe-db/pipeline/functions/invoke_update_pipeline/sqs-event.json b/gfe-db/pipeline/functions/invoke_update_pipeline/sqs-event.json new file mode 100644 index 00000000..03119f37 --- /dev/null +++ b/gfe-db/pipeline/functions/invoke_update_pipeline/sqs-event.json @@ -0,0 +1,20 @@ +{ + "Records": [ + { + "messageId": "62eac214-84e3-4047-88a7-0a64a4cbed7b", + "receiptHandle": "AQEB44KYQWwCp5x0Q+OywMyKug9jAdoX5IRUa4fUKY+FMXgKtA9dMsjGzr06TndpXmrS/lra9WI40a+G/wc9zJ4dXij2BirX1xA7wiRYTg+QHGDkTY0QBOW13DecyEbifLqpBbeRLh6s2J+C2TdVjy1Gf8sGmlZUZg5TzrkQdQ/u2ezZNgDL/ALpsvPyejMzjqnASQSfJOhrjI68atyQdVZ5DZRwed/9CAcqHAvdS0Uv1j87OW9LEYt+ZINHYoYu4Sb18NBvIP2FQIcGql9q530HhjoYeUrP41glkRJqiL8HPrJhfvgdnVEzIzduvQkvjsX7V61C5VBSnQDa7vf/QS44LiqBhPkvkwuBDkI3yJ/qC20Vc02PQ5p1tpZctbNnliHKNl+jU01uinCH/SpC6SGHZipXB3hf+1vaFwG7YwgT/Pv1GVVu8uci8gCNbNiSRkV4", + "body": "{\"id\": 
\"3540_901b1b788a2ec2b4722c9672910ad880b29b368b_240107_194421\", \"version\": 3540, \"commit_sha\": \"901b1b788a2ec2b4722c9672910ad880b29b368b\", \"input_parameters\": {\"align\": false, \"kir\": false, \"mem_profile\": false, \"limit\": 1000, \"use_existing_build\": true, \"skip_load\": false}, \"s3_path\": \"s3://nmdpf-gfe-db-810526023897-us-east-1/data/3540\"}", + "attributes": { + "ApproximateReceiveCount": "1", + "SentTimestamp": "1704656711976", + "SenderId": "AROA3ZNY5UDMUNE5WAE5K:gregory", + "ApproximateFirstReceiveTimestamp": "1704656712000" + }, + "messageAttributes": {}, + "md5OfBody": "1d53a934a4f027498387994f344b78dd", + "eventSource": "aws:sqs", + "eventSourceARN": "arn:aws:sqs:us-east-1:810526023897:nmdpf-gfe-db-pipeline-GfeDbProcessingQueue-DQDFHzl2h1id", + "awsRegion": "us-east-1" + } + ] +} \ No newline at end of file diff --git a/gfe-db/pipeline/functions/lcm_receive_message/__init__.py b/gfe-db/pipeline/functions/lcm_receive_message/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/gfe-db/pipeline/functions/lcm_receive_message/app.py b/gfe-db/pipeline/functions/lcm_receive_message/app.py new file mode 100644 index 00000000..29880342 --- /dev/null +++ b/gfe-db/pipeline/functions/lcm_receive_message/app.py @@ -0,0 +1,64 @@ +"""This function is invoked through by the LoadConcurrencyManager state machine when it is triggered by the GfeDbLoadQueueHasMessagesAlarm. +It polls the GfeDbLoadQueue for messages and invokes the LoadNeo4j state machine for each message. If no messages are found the state machine will check the alarm status +and repeat the polling process until the alarm is in OK state. +""" + +import os +if __name__ != "app": + import sys + + # for dev, local path to gfe-db modules + # ./gfe-db/pipeline/lambda_layers/gfe_db_models (use absolute path) + sys.path.append(os.environ["GFEDBMODELS_PATH"]) + +import logging +import json +from gfedbmodels.constants import ( + session, + pipeline) + +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +APP_NAME = os.environ["APP_NAME"] +STAGE = os.environ["STAGE"] +AWS_REGION = os.environ["AWS_REGION"] + +sqs = session.client("sqs", region_name=AWS_REGION) + +gfe_db_load_queue_url = pipeline.params.GfeDbLoadQueueUrl + +def lambda_handler(event, context): + + logger.info(json.dumps(event)) + + res = sqs.receive_message( + QueueUrl=gfe_db_load_queue_url, + MaxNumberOfMessages=1 + ) + + if "Messages" in res: + message = res["Messages"][0] + + # Format the message body as json + message['Body'] = json.loads(message['Body']) + + # change message visibility to 8 hours + sqs.change_message_visibility( + QueueUrl=gfe_db_load_queue_url, + ReceiptHandle=message["ReceiptHandle"], + VisibilityTimeout=28800 + ) + + else: + logger.info("No messages found") + return {} + + return message + + +if __name__ == "__main__": + from pathlib import Path + + event = json.loads((Path(__file__).parent / "event.json").read_text()) + lambda_handler(event, "") diff --git a/gfe-db/pipeline/functions/lcm_receive_message/event.json b/gfe-db/pipeline/functions/lcm_receive_message/event.json new file mode 100644 index 00000000..9e26dfee --- /dev/null +++ b/gfe-db/pipeline/functions/lcm_receive_message/event.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/gfe-db/pipeline/functions/lcm_receive_message/requirements.txt b/gfe-db/pipeline/functions/lcm_receive_message/requirements.txt new file mode 100644 index 00000000..e69de29b diff --git a/gfe-db/pipeline/functions/lcm_receive_message/sample-message.json 
b/gfe-db/pipeline/functions/lcm_receive_message/sample-message.json new file mode 100644 index 00000000..1ed4d93a --- /dev/null +++ b/gfe-db/pipeline/functions/lcm_receive_message/sample-message.json @@ -0,0 +1,22 @@ +{ + "Messages": [ + { + "MessageId": "5a33744c-dd4b-4a8a-a55a-8478137b6950", + "ReceiptHandle": "AQEBKH6PwnhAzIyXtqD6yujdWq5krcTA4djNX8YjJocCfa3ru3f6OM9OaDT+XkSiwYdetb7vpY3BMYcG3r0dkBIcVErCw7c2pRc//80BNxQDg9O1BeJDuaJeXErnUGOBcd+kZWo6J2JeXpx/gCGtPZH4D+8syJw1iMjolsZAKa+G5DkotBryyNC3JjIwpQb8yDQtLxWUOJCQDiwVLDR4pJfVq7JQwmnQNCKt22IkKE0Lkw/9weRmurMSCcX02LN89qM7CM9QQ62MvdNspfPArWenu6taUpSgn7Dbp/rxcQU6DeOkpYsQyJvG9sMYMth+z5prrDkiudpx6g+Fu49WKsGjyA==", + "MD5OfBody": "7a8446e46c789ab29c7b37440c2578dd", + "Body": "{\"input\":{\"id\":\"3350_4052371f8d68dc662d8c5b07377d9aebcfccc0b9_20240131_214043\",\"version\":3350,\"invocation_id\":\"ce22a924-9222-438b-b924-ea5e4ff74a43\",\"commit_sha\":\"4052371f8d68dc662d8c5b07377d9aebcfccc0b9\",\"input_parameters\":{\"align\":false,\"kir\":false,\"mem_profile\":false,\"limit\":1000,\"use_existing_build\":true,\"skip_load\":false},\"s3_path\":\"s3://nmdpf-gfe-db-810526023897-us-east-1/data/3350\",\"receipt_handle\":\"AQEBmoT0rR4qSe8UpSq0Z2/PR8ay23vFwO+jBhmh/lvWMiN9gnwLBccRUwhA9f6kG8K268VjPBcTlzZAghWHFYReZaKjU5VIsHfT+ltZcEF71sShsPA0zMDhxHPLTY4lxV3obQgviaFylvO3o1u5gJtPNIC7e70dDnqjpbFnKlfP1JEIH6dfZ2nnxYuyaqbkcN2SOyRkezatp9HQHu+cKGsL2Asom+Mvc1+FNh56SSuVuh5pFQsPkUAHZZ77uUaFTGDfkm2Cb4sCJn2njB8a77xFAZIL9ca07G9GRaAk6xvkoIKdjGvxMrazboVlTlewNZIgKTPBM6d70Et6dlTS6XeUUHdEfa1t3/u1Ux75ssthBC9heV6MXB0kQ+EdDnWVjbuvh+KSGZUR9tgtzGnpL1g6YR/wD90AxBB7g0H2usq8r6X7fXHyF63nDnXj2/39zRf6\"},\"validations\":{\"queries\":{\"pre\":{\"node_counts\":[{\"node\":\"GFE\",\"count\":1000},{\"node\":\"IPD_Accession\",\"count\":1000},{\"node\":\"IPD_Allele\",\"count\":1000},{\"node\":\"Sequence\",\"count\":1000},{\"node\":\"Feature\",\"count\":1087},{\"node\":\"Submitter\",\"count\":1}],\"has_ipd_allele_release_counts\":[{\"release_version\":3350,\"count\":1000}],\"ipd_accession_release_counts\":[{\"release_version\":\"3.35.0\",\"count\":1000}]}},\"check_existing_build\":{\"Contents\":[{\"ETag\":\"\\\"bea92fb773c3300a5bed48d52f654fd2\\\"\",\"Key\":\"data/3350/csv/all_cds.3350.csv\",\"LastModified\":\"2024-01-28T20:10:04Z\",\"Owner\":{\"DisplayName\":\"gclindsey+awstesting\",\"Id\":\"c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611\"},\"Size\":1182753,\"StorageClass\":\"STANDARD\"},{\"ETag\":\"\\\"11080ce94680a48657e790c1f1496d6e\\\"\",\"Key\":\"data/3350/csv/all_features.3350.csv\",\"LastModified\":\"2024-01-28T20:10:04Z\",\"Owner\":{\"DisplayName\":\"gclindsey+awstesting\",\"Id\":\"c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611\"},\"Size\":2394539,\"StorageClass\":\"STANDARD\"},{\"ETag\":\"\\\"a55b30240ee5107cd40378acd1bd3d8d\\\"\",\"Key\":\"data/3350/csv/all_groups.3350.csv\",\"LastModified\":\"2024-01-28T20:10:04Z\",\"Owner\":{\"DisplayName\":\"gclindsey+awstesting\",\"Id\":\"c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611\"},\"Size\":284170,\"StorageClass\":\"STANDARD\"},{\"ETag\":\"\\\"7a84b882607600972a18d6a7954fe757\\\"\",\"Key\":\"data/3350/csv/gfe_sequences.3350.csv\",\"LastModified\":\"2024-01-28T20:10:04Z\",\"Owner\":{\"DisplayName\":\"gclindsey+awstesting\",\"Id\":\"c7bf5cd354a38116b91f478b895cf40889f01091a7d26941d8a367727278b611\"},\"Size\":1710669,\"StorageClass\":\"STANDARD\"}],\"IsTruncated\":false,\"Marker\":\"\",\"MaxKeys\":1000,\"Name\":\"nmdpf-gfe-db-810526023897-us-east-1\",\"Prefix\":\"data/3350/csv/\"},\"build_output
s\":{\"release_version\":3350,\"details\":[{\"schema\":\"all_cds\",\"release\":3350,\"file_path\":\"s3://nmdpf-gfe-db-810526023897-us-east-1/data/3350/csv/all_cds.3350.csv\",\"cols\":[\"gfe_name\",\"bp_seq_id\",\"bp_sequence\",\"aa_seq_id\",\"aa_sequence\"],\"num_rows\":1000,\"created_utc\":\"2024-01-28T20:10:04.000Z\",\"details\":{\"is_valid_csv_filename\":true,\"is_valid_csv_header\":true,\"is_valid_csv_rows\":true},\"num_errors\":0,\"is_valid_csv\":true},{\"schema\":\"all_features\",\"release\":3350,\"file_path\":\"s3://nmdpf-gfe-db-810526023897-us-east-1/data/3350/csv/all_features.3350.csv\",\"cols\":[\"accession\",\"hash_code\",\"locus\",\"rank\",\"sequence\",\"term\",\"gfe_name\",\"allele_id\",\"hla_name\",\"imgt_release\"],\"num_rows\":7728,\"created_utc\":\"2024-01-28T20:10:04.000Z\",\"details\":{\"is_valid_csv_filename\":true,\"is_valid_csv_header\":true,\"is_valid_csv_rows\":true},\"num_errors\":0,\"is_valid_csv\":true},{\"schema\":\"all_groups\",\"release\":3350,\"file_path\":\"s3://nmdpf-gfe-db-810526023897-us-east-1/data/3350/csv/all_groups.3350.csv\",\"cols\":[\"gfe_name\",\"allele_id\",\"hla_name\",\"ard_id\",\"ard_name\",\"locus\",\"imgt_release\"],\"num_rows\":3000,\"created_utc\":\"2024-01-28T20:10:04.000Z\",\"details\":{\"is_valid_csv_filename\":true,\"is_valid_csv_header\":true,\"is_valid_csv_rows\":true},\"num_errors\":0,\"is_valid_csv\":true},{\"schema\":\"gfe_sequences\",\"release\":3350,\"file_path\":\"s3://nmdpf-gfe-db-810526023897-us-east-1/data/3350/csv/gfe_sequences.3350.csv\",\"cols\":[\"gfe_name\",\"acc_name\",\"locus\",\"hla_name\",\"seq_id\",\"sequence\",\"length\",\"imgt_release\"],\"num_rows\":1000,\"created_utc\":\"2024-01-28T20:10:04.000Z\",\"details\":{\"is_valid_csv_filename\":true,\"is_valid_csv_header\":true,\"is_valid_csv_rows\":true},\"num_errors\":0,\"is_valid_csv\":true}],\"errors\":[],\"expected_artifacts\":[\"all_cds.3350.csv\",\"all_features.3350.csv\",\"all_groups.3350.csv\",\"gfe_sequences.3350.csv\"],\"is_valid_build\":true}},\"state\":{\"created_utc\":\"2024-01-26T19:18:26.296Z\",\"updated_utc\":\"2024-01-31T21:40:43.242Z\",\"repository\":{\"owner\":\"ANHIG\",\"name\":\"IMGTHLA\",\"url\":\"https://github.com/ANHIG/IMGTHLA\"},\"commit\":{\"sha\":\"4052371f8d68dc662d8c5b07377d9aebcfccc0b9\",\"date_utc\":\"2019-04-17T09:10:57.000Z\",\"message\":\"Merge pull request #174 from ANHIG/3360\\n\\n3360\",\"html_url\":\"https://github.com/ANHIG/IMGTHLA/commit/4052371f8d68dc662d8c5b07377d9aebcfccc0b9\"},\"execution\":{\"id\":\"3350_4052371f8d68dc662d8c5b07377d9aebcfccc0b9_20240131_214043\",\"invocation_id\":\"ce22a924-9222-438b-b924-ea5e4ff74a43\",\"version\":3350,\"status\":\"LOAD_IN_PROGRESS\",\"input_parameters\":{\"align\":false,\"kir\":false,\"mem_profile\":false,\"limit\":1000,\"use_existing_build\":false,\"skip_load\":false}}}}" + } + ], + "ResponseMetadata": { + "RequestId": "6c4359b4-a21e-5508-ab0a-078062b29b45", + "HTTPStatusCode": 200, + "HTTPHeaders": { + "x-amzn-requestid": "6c4359b4-a21e-5508-ab0a-078062b29b45", + "date": "Wed, 31 Jan 2024 21:41:00 GMT", + "content-type": "text/xml", + "content-length": "8459", + "connection": "keep-alive" + }, + "RetryAttempts": 0 + } +} \ No newline at end of file diff --git a/gfe-db/pipeline/functions/update_execution_state/__init__.py b/gfe-db/pipeline/functions/update_execution_state/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/gfe-db/pipeline/functions/update_execution_state/app.py b/gfe-db/pipeline/functions/update_execution_state/app.py new file mode 100644 index 
00000000..1da83ea9 --- /dev/null +++ b/gfe-db/pipeline/functions/update_execution_state/app.py @@ -0,0 +1,67 @@ +"""In progress""" +import os + +if __name__ != "app": + import sys + + # for dev, local path to gfe-db modules + # ./gfe-db/pipeline/lambda_layers/gfe_db_models (use absolute path) + sys.path.append(os.environ["GFEDBMODELS_PATH"]) +import logging +import json +from gfedbmodels.types import ExecutionPayloadItem +from gfedbmodels.constants import session, pipeline +from gfedbmodels.utils import get_utc_now + +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +dynamodb = session.resource("dynamodb") +table = dynamodb.Table(pipeline.params.GfeDbExecutionStateTableName) + + +def lambda_handler(event, context): + logger.info(json.dumps(event)) + # return + + try: + # validate input + execution_payload_item = ExecutionPayloadItem( + **json.loads(event["detail"]["input"])["input"] + ) + status = event["detail"]["status"] + + # update execution state + # composite key is commit.sha as commit__sha and execution.version as execution__version + table.update_item( + Key={ + "commit__sha": execution_payload_item.commit_sha, + "execution__version": execution_payload_item.version, + }, + UpdateExpression="SET #status = :status, #updated_utc = :updated_utc", + ExpressionAttributeNames={ + "#status": "execution__status", + "#updated_utc": "updated_utc", + }, + ExpressionAttributeValues={ + ":status": status, + ":updated_utc": get_utc_now(), + }, + ) + + return 0 + + except Exception as e: + import traceback + message = f"Error updating execution state: {e}\n{traceback.format_exc()}\n{json.dumps(event)}" + logger.error(message) + raise Exception(message) + + return 1 + + +if __name__ == "__main__": + from pathlib import Path + + event = json.loads((Path(__file__).parent / "event.json").read_text()) + lambda_handler(event, None) diff --git a/gfe-db/pipeline/functions/update_execution_state/event.json b/gfe-db/pipeline/functions/update_execution_state/event.json new file mode 100644 index 00000000..21e73b09 --- /dev/null +++ b/gfe-db/pipeline/functions/update_execution_state/event.json @@ -0,0 +1,34 @@ +{ + "version": "0", + "id": "2bf41e92-7e02-cd33-3739-48728b910cc7", + "detail-type": "Step Functions Execution Status Change", + "source": "aws.states", + "account": "810526023897", + "time": "2024-01-20T22:00:37Z", + "region": "us-east-1", + "resources": [ + "arn:aws:states:us-east-1:810526023897:execution:UpdatePipelineStateMachine-FQbLz4G0k6df:3510_5f2c562056f8ffa89aeea0631f2a52300ee0de17_20240120_220027" + ], + "detail": { + "executionArn": "arn:aws:states:us-east-1:810526023897:execution:UpdatePipelineStateMachine-FQbLz4G0k6df:3510_5f2c562056f8ffa89aeea0631f2a52300ee0de17_20240120_220027", + "stateMachineArn": "arn:aws:states:us-east-1:810526023897:stateMachine:UpdatePipelineStateMachine-FQbLz4G0k6df", + "name": "3510_5f2c562056f8ffa89aeea0631f2a52300ee0de17_20240120_220027", + "status": "ABORTED", + "startDate": 1705788030095, + "stopDate": 1705788037205, + "input": "{\"input\": {\"id\": \"3510_5f2c562056f8ffa89aeea0631f2a52300ee0de17_20240120_220027\", \"version\": 3510, \"commit_sha\": \"5f2c562056f8ffa89aeea0631f2a52300ee0de17\", \"input_parameters\": {\"align\": false, \"kir\": false, \"mem_profile\": false, \"limit\": 1000, \"use_existing_build\": false, \"skip_load\": false}, \"s3_path\": \"s3://nmdpf-gfe-db-810526023897-us-east-1/data/3510\", \"receipt_handle\": 
\"AQEBCu1015BMxXvjWE9eeTSzDq2RQHItl3gQV+zkFX86UbCgNTugZEfrMM5KGk4/H38MyKA6KKPIoHPB8oSh4p8DaXQWbo0ejGY39q/INN3dpPHO4Xyw8RlRXfIZS8dUcMqv3WY5mfXiTrGIxviCh9sMQbjK6YXD8CnrIfYIPsnDNEhJJlVlapIP+dPSZJpJG6DLmUi4lddSrRTJiDSJT3dT8FQuzhH4GzVTsdG+HdlOPdIFv+YupNJGoXif2Jx+g3cLbhWeYcppBlbKuqUG2nozi0pdA7jhP9zK0KXyWCuD2jun/tqMllXYV5RJPwpa4Xuu8+6L8UlGGugH1k8nNJ9AcAR3awGYfQE9LQGzJZCJcq0y+2I2g7v36jvrGR5e4ZsRydiIfhUVgfLQ7yhkD9MZuAmCvqGkMFGoUnv7zYEALr7hwf87i1dvV+5vsTFJEVBq\"}}", + "output": null, + "stateMachineVersionArn": null, + "stateMachineAliasArn": null, + "redriveCount": 0, + "redriveDate": null, + "redriveStatus": "REDRIVABLE", + "redriveStatusReason": null, + "inputDetails": { + "included": true + }, + "outputDetails": null, + "error": null, + "cause": null + } +} \ No newline at end of file diff --git a/gfe-db/pipeline/functions/update_execution_state/requirements.txt b/gfe-db/pipeline/functions/update_execution_state/requirements.txt new file mode 100644 index 00000000..e69de29b diff --git a/gfe-db/pipeline/functions/validate_build_output/__init__.py b/gfe-db/pipeline/functions/validate_build_output/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/gfe-db/pipeline/functions/validate_build_output/app.py b/gfe-db/pipeline/functions/validate_build_output/app.py index 30159f5d..3270a75f 100644 --- a/gfe-db/pipeline/functions/validate_build_output/app.py +++ b/gfe-db/pipeline/functions/validate_build_output/app.py @@ -30,8 +30,6 @@ def lambda_handler(event, context): """Validates the build output artifacts against the original execution input object.""" logger.info(json.dumps(event)) - - execution_start_time = datetime.strptime(event['execution_start_time'], '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=tz.tzutc()) # TODO get the expected input from execution context and validate against this # TODO Remove this and use the output of validation against the expected input @@ -40,7 +38,7 @@ def lambda_handler(event, context): # expected input is the execution input # execution_input = event['execution_context']['Execution']['Input']['input'] - release = event['input']['RELEASES'] + release = event['input']['version'] # errors for all release builds used only for logging, not used for validation logic errors = [] @@ -61,7 +59,7 @@ def lambda_handler(event, context): # Validate that the S3 prefix exists and has data try: csv_file_objs = list_s3_objects(data_bucket_name, csv_dir) - except KeyError as e: + except KeyError: error_msg = f"CSV directory does not exist: {csv_dir}" logger.error(error_msg) release_report["errors"].append(error_msg) @@ -69,7 +67,6 @@ def lambda_handler(event, context): # reports.append(release_report) errors.append(error_msg) # continue - raise e # Validate that all expected files are present if set(csv_file_objs.keys()) != set(release_report["expected_artifacts"]): @@ -88,6 +85,7 @@ def lambda_handler(event, context): obj["details"] = {} # # Note: the state machine can now use existing CSV files as input, so the timestamp validation is no longer needed + # TODO perform conditionally only if use_existing_csv is False # # Validate the file's timestamp is after the execution start time # obj["details"]["is_valid_csv_timestamp"] = obj['created_utc'] > execution_start_time # if not obj["details"]["is_valid_csv_timestamp"]: @@ -152,9 +150,6 @@ def lambda_handler(event, context): logger.error(error_msg) payload = { - "execution_id": event['execution_id'], - "execution_start_time": event['execution_start_time'], - "input": event['input'], **release_report } diff --git 
a/gfe-db/pipeline/functions/validate_build_output/error-event.json b/gfe-db/pipeline/functions/validate_build_output/error-event.json index 7dcb54e8..0fa5bf52 100644 --- a/gfe-db/pipeline/functions/validate_build_output/error-event.json +++ b/gfe-db/pipeline/functions/validate_build_output/error-event.json @@ -1,13 +1,20 @@ { - "execution_id": "arn:aws:states:us-east-1:810526023897:execution:UsePublicSubnetConditionUpdatePipelineStateMachine-vKSrXyQEhEW3:d4f6c52e-542b-410b-be8f-ac75f3c7b783", + "execution_id": "arn:aws:states:us-east-1:810526023897:execution:UpdatePipelineStateMachine-b50xUAEHaEFZ:3560_b5542c829f8666f309307fd60d0fc7c46a71b4c6_20240512_215646", "input": { - "ALIGN": "False", - "KIR": "False", - "LIMIT": "1000", - "MEM_PROFILE": "False", - "USE_EXISTING_BUILD": "False", - "SKIP_LOAD": "False", - "RELEASES": "3450" + "id": "3560_b5542c829f8666f309307fd60d0fc7c46a71b4c6_20240512_215646", + "version": 3560, + "invocation_id": "4ecf0f40-d308-4bf1-83fb-2d0e542bf345", + "commit_sha": "b5542c829f8666f309307fd60d0fc7c46a71b4c6", + "input_parameters": { + "align": false, + "kir": false, + "mem_profile": false, + "limit": 10, + "use_existing_build": false, + "skip_load": false + }, + "s3_path": "s3://nmdpff-gfe-db-810526023897-us-east-1/data/3560", + "receipt_handle": "AQEBi23GZuMPvrFqjVy0N8AV279qtmiUg1murJ5RsNDRlTwHCoXiy0TzrMcYfay8pTjprdSy++qbr8vOh5LCHrVup2H1b6w76XmqeBWucz+Uyz0pYfwRbgxVTr5nqFpuldKz85KTeW+v44Xd++Pr+9xwP7SBRDCtPKCEqepzWBkJVt8X/D6WdZGptoAQpxDeZEaWdZllLI8Zb5Dvcnall+F0eznzTrm7lbIY9zWAmQliKNzhrdJT3tOTERu0vY2kvE5i5jlqEQSpx4ajvtaiLMvN765PtKu9bCvxo85eLw/35gk6TTqCrIsDOTyaBdc+PDkWT6/wDROuppXNC/QoKiZ1hg==" }, - "execution_start_time": "2023-10-29T18:53:54.523Z" + "execution_start_time": "2024-05-12T21:57:00.815Z" } \ No newline at end of file diff --git a/gfe-db/pipeline/functions/validate_build_output/event.json b/gfe-db/pipeline/functions/validate_build_output/event.json index a1789947..7a7572d0 100644 --- a/gfe-db/pipeline/functions/validate_build_output/event.json +++ b/gfe-db/pipeline/functions/validate_build_output/event.json @@ -1,11 +1,18 @@ { + "execution_id": "arn:aws:states:us-east-1:810526023897:execution:UpdatePipelineStateMachine-N2tfp1Efjryu:3500_50b790037030d958b662085c3f4cf34ba72a32ec_240102_230217", "input": { - "ALIGN": "False", - "KIR": "False", - "LIMIT": "100", - "MEM_PROFILE": "False", - "RELEASES": "320" + "version": 3500, + "commit_sha": "50b790037030d958b662085c3f4cf34ba72a32ec", + "input_parameters": { + "align": false, + "kir": false, + "mem_profile": false, + "limit": 1000, + "use_existing_build": false, + "skip_load": false + }, + "s3_path": "s3://nmdpf-gfe-db-810526023897-us-east-1/data/3500", + "receipt_handle": "AQEBEtdJKEOIhBCO24mxSnC8Q7RP8bFbXbsUuwd9Ix3vgj2QwRnLOPtvIaBlKaZQFaUU9qj9VitZBZXv0wTHkleavmWo7WnhtfsKFyCmB8I/1wAise2Lzo0y6GisRyKSDMJd+tmtZUgX2tfPEQ8cx4N6hHvlWXW2XhoEH0h5fWtJj2pCKUcnqdJQcinSVoLrfGZpZpYnZ+fhiDqcnBsP3lv6XcUY+2MynHK1QkadxhM2JXipAj5qb7lVS0lCSiWfTaHcph1RExmO4k/QNZBJGr41JVmqPONqO9Dp8ryyXIpavPLvP/uK6OAkppXbQVM5VqNEEx7Njqd2e9C2NoYXKCrGToigN62rEOWzL72L/rFHHXMGXhiCJnd7J67g8Ni5ejDI6t6zkbMlvdLTnegLtVt91MwW4+Hfpwr2dtZJwpsu0zPa0qYF6CaliYanA8CJVzLI" }, - "execution_id": 1, - "execution_start_time": "2023-08-30T23:46:01.477Z" -} \ No newline at end of file + "execution_start_time": "2024-01-02T23:02:17.533Z" + } \ No newline at end of file diff --git a/gfe-db/pipeline/functions/validate_build_output/requirements.txt b/gfe-db/pipeline/functions/validate_build_output/requirements.txt index d7f13cf0..feb8b2c1 100644 --- 
a/gfe-db/pipeline/functions/validate_build_output/requirements.txt +++ b/gfe-db/pipeline/functions/validate_build_output/requirements.txt @@ -1,2 +1,2 @@ -polars~=0.20.19 -s3fs~=2024.3.1 \ No newline at end of file +polars~=0.20.25 +s3fs~=2024.3.1 diff --git a/gfe-db/pipeline/jobs/Makefile b/gfe-db/pipeline/jobs/Makefile index 06311201..042a4c6d 100644 --- a/gfe-db/pipeline/jobs/Makefile +++ b/gfe-db/pipeline/jobs/Makefile @@ -36,10 +36,13 @@ delete.ecr.jobs.build: [ "$$ecr_images" = "[]" ] && \ echo "No images to delete in $${ECR_BASE_URI}/$${BUILD_REPOSITORY_NAME}" && exit 0 || \ echo "Deleting images in $${ECR_BASE_URI}/$${BUILD_REPOSITORY_NAME}" && \ - aws ecr batch-delete-image \ + res=$$(aws ecr batch-delete-image \ --region "$${AWS_REGION}" \ --repository-name "$${BUILD_REPOSITORY_NAME}" \ - --image-ids "$$ecr_images" 2>&1 | tee -a $$CFN_LOG_PATH || true + --image-ids "$$ecr_images") && \ + echo $$res | jq -r && \ + echo "\033[0;34mSuccessfully deleted images in $${ECR_BASE_URI}/$${BUILD_REPOSITORY_NAME}\033[0m" || \ + echo "\033[0;31mFailed to delete images in $${ECR_BASE_URI}/$${BUILD_REPOSITORY_NAME}\033[0m" ############# # Helpers # diff --git a/gfe-db/pipeline/jobs/build/Dockerfile b/gfe-db/pipeline/jobs/build/Dockerfile index 68d345c2..1c4951c5 100644 --- a/gfe-db/pipeline/jobs/build/Dockerfile +++ b/gfe-db/pipeline/jobs/build/Dockerfile @@ -5,6 +5,7 @@ RUN apt update && \ bc \ curl \ git \ + jq \ unzip \ && pip3 install --upgrade pip \ && apt-get clean diff --git a/gfe-db/pipeline/jobs/build/run.sh b/gfe-db/pipeline/jobs/build/run.sh index 2b65ba50..bcbb35cd 100755 --- a/gfe-db/pipeline/jobs/build/run.sh +++ b/gfe-db/pipeline/jobs/build/run.sh @@ -1,71 +1,151 @@ #!/bin/bash +# # Exit immediately if a command exits with a non-zero status +# set -e + START_EXECUTION=$SECONDS -export ROOT="$(dirname "$(dirname "$0")")" +# export ROOT="$(dirname "$(dirname "$0")")" +export ROOT="$(dirname "$0")" export BIN_DIR=$ROOT/scripts export SRC_DIR=$ROOT/src -export DATA_DIR=$ROOT/../data +export DATA_DIR=$ROOT/data export LOGS_DIR=$ROOT/logs -# Check for environment variables -if [[ -z "${GFE_BUCKET}" ]]; then - echo "GFE_BUCKET not set. Please specify an S3 bucket." - exit 1 -fi +get_download_url() { + owner="$1" + repo="$2" + asset_path="$3" + commit_sha="$4" -if [[ -z "${RELEASES}" ]]; then - echo "RELEASES not set. Please specify the release versions to load." 
- exit 1 -fi + base_url="https://api.github.com" + endpoint="/repos/${owner}/${repo}/contents/${asset_path}" + url="${base_url}${endpoint}" -if [[ -z "${ALIGN}" ]]; then - echo "ALIGN not set" - ALIGN=False -fi + # # Authorization header + # auth_header="Authorization: token ${GITHUB_PERSONAL_ACCESS_TOKEN}" -if [[ -z "${KIR}" ]]; then - echo "KIR not set" - KIR=False -fi + # Content-Type header + content_type_header="Content-Type: application/json" + + # Accept header + accept_header="Accept: application/vnd.github.v3+json" + + # X-GitHub-Api-Version header + x_github_api_version_header="X-GitHub-Api-Version: 2022-11-28" + + # GET request with headers and ref parameter + # response=$(curl -s -H "${auth_header}" -H "${content_type_header}" -H "${accept_header}" -H "${x_github_api_version_header}" "${url}?ref=${commit_sha}") + # echo "${url}?ref=${commit_sha}" + response=$(curl -s -H "${content_type_header}" -H "${accept_header}" -H "${x_github_api_version_header}" "${url}?ref=${commit_sha}") + + # catch errors if .download_url is missing from response + if [ "$(echo "${response}" | jq -r '.download_url')" = "null" ]; then + echo "ERROR: \`download_url\` is null for asset: ${asset_path}, please check the asset path." + # exit 1 + elif [ "$(echo "${response}" | jq -r '.download_url')" = "" ]; then + echo "ERROR: \`download_url\` is empty for asset: ${asset_path}, please check the asset path." + # exit 1 + fi + + # Print the response + echo "${response}" | jq -r '.download_url' + +} + +# Takes the download url and downloads the asset +get_asset() { + download_url="$1" + asset_path="$2" + unzip_path="$3" -if [[ -z "${MEM_PROFILE}" ]]; then - echo "MEM_PROFILE not set" - MEM_PROFILE=False + echo "INFO: Received download url: $download_url" + echo "INFO: Downloading asset to: $asset_path" + + # Download the asset + response=$(curl -sSL -o "${asset_path}" "${download_url}") + + # check that the file was downloaded + if [ ! -f "${asset_path}" ]; then + echo "ERROR: Failed to download asset. File not found: ${asset_path}" + echo $response | jq -r + # exit 1 + fi + + echo "INFO: Successfully downloaded asset ${asset_path}" + + # unzip if unzip_path is provided + if [ -n "$unzip_path" ]; then + echo "INFO: Unzipping asset to: $unzip_path" + unzip "${asset_path}" -d "${unzip_path}" -x "__MACOSX/*" + mv -f "$DATA_DIR/$version/hla.dat" "$DATA_DIR/$version/hla.$version.dat" + rm -f "$DATA_DIR/$version/hla.$version.dat.zip" + fi + +} + +# TODO use receive-message from SQS instead of environment variables +# Using environment variables requires a job to deploy for *every* release version +if [[ -z "${EVENT}" ]]; then + echo "ERROR: No event found. Exiting..."
+ exit 1 +else + echo "INFO: Found event" + echo "INFO: $EVENT" fi -echo "Found environment variables:" -echo -e "GFE_BUCKET: $GFE_BUCKET\nRELEASES: $RELEASES\nALIGN: $ALIGN\nKIR: $KIR\nMEM_PROFILE: $MEM_PROFILE\nLIMIT: $LIMIT" +# parse event +version=$(echo "$EVENT" | jq -r '.state.execution.version') +commit_sha=$(echo "$EVENT" | jq -r '.state.commit.sha') +repository_owner=$(echo "$EVENT" | jq -r '.state.repository.owner') +repository_name=$(echo "$EVENT" | jq -r '.state.repository.name') +align=$(echo "$EVENT" | jq -r '.state.execution.input_parameters.align') +kir=$(echo "$EVENT" | jq -r '.state.execution.input_parameters.kir') +mem_profile=$(echo "$EVENT" | jq -r '.state.execution.input_parameters.mem_profile') +limit=$(echo "$EVENT" | jq -r '.state.execution.input_parameters.limit') +s3_path=$(echo "$EVENT" | jq -r '.input.s3_path') + +# Refactor the above variable validations into a for loop +for var in version commit_sha align kir mem_profile limit repository_owner repository_name s3_path; do + if [[ -z "${!var}" ]] || [[ "${!var}" == "null" ]]; then + echo "ERROR: \`$var\` not set. Please specify a value." + exit 1 + fi + echo "$var: ${!var}" +done -# Check limit -if [[ -z "${LIMIT}" ]]; then - echo "No limit set, building GFEs for all alleles" +if [[ "${limit}" == "-1" ]] || [[ -z "${limit}" ]]; then + echo "INFO: No limit set, building GFEs for all alleles" +elif [[ "${limit}" =~ ^[0-9]+$ ]] && [[ "${limit}" -gt 0 ]]; then + echo "INFO: Build is limited to $limit alleles" else - echo "Build is limited to $LIMIT alleles" + echo "ERROR: Invalid limit specified. Please specify either a positive integer or -1 for no limit." + exit 1 fi +echo "INFO: Found environment variables" + # Check if data directory exists +# TODO: get full path for each if [ ! -d "$DATA_DIR" ]; then - echo "Creating new directory in root: $DATA_DIR" + echo "INFO: Creating new directory in root: $DATA_DIR" mkdir -p "$DATA_DIR" else - # TODO: get full path - echo "Data directory: $DATA_DIR" + echo "INFO: Data directory: $DATA_DIR" fi # Check if logs directory exists if [ ! -d "$LOGS_DIR" ]; then - # TODO: get full path - echo "Creating logs directory: $LOGS_DIR" + echo "INFO: Creating logs directory: $LOGS_DIR" mkdir -p "$LOGS_DIR" else - # TODO: get full path - echo "Logs directory: $LOGS_DIR" + echo "INFO: Logs directory: $LOGS_DIR" fi +# TODO test memory profiling for build job # Memory profiling -if [ "$MEM_PROFILE" == "True" ]; then - echo "Memory profiling is set to $MEM_PROFILE." +if [ "$mem_profile" == "true" ]; then + echo "INFO: Memory profiling is set to $mem_profile." MEM_PROFILE_FLAG="-p" touch "$LOGS_DIR/mem_profile_agg.txt" touch "$LOGS_DIR/mem_profile_diff.txt" @@ -73,17 +153,17 @@ else MEM_PROFILE_FLAG="" fi -# Load KIR data -if [ "$KIR" == "True" ]; then - echo "Loading KIR = $KIR" +# Load kir data +if [ "$kir" == "true" ]; then + echo "INFO: Loading kir = $kir" KIRFLAG="-k" else KIRFLAG="" fi # Load alignments data -if [ "$ALIGN" == "True" ]; then - echo "Loading alignments..." +if [ "$align" == "true" ]; then + echo "INFO: Loading alignments..." ALIGNFLAG="-a" sh "$BIN_DIR/get_alignments.sh" else @@ -92,89 +172,86 @@ fi # Check for FEATURE_SERVICE_URL if [[ -z "${FEATURE_SERVICE_URL}" ]]; then - echo "No FEATURE_SERVICE_URL set, building GFEs with default feature service." + echo "ERROR: No FEATURE_SERVICE_URL set, building GFEs with default feature service." 
else - echo "Using Feature Service: ${FEATURE_SERVICE_URL}" + echo "INFO: Using Feature Service: ${FEATURE_SERVICE_URL}" fi # Build csv files RELEASES=$(echo "${RELEASES}" | sed s'/"//'g | sed s'/,/ /g') +# exit 1 # TODO test state machine error handling -for release in ${RELEASES}; do - - release=$(echo "$release" | sed s'/,//g') - echo "Processing release: $release" - - # Check if data directory exists - if [ ! -d "$DATA_DIR/$release/csv" ]; then - # TODO: get full path - echo "Creating new directory in root: $DATA_DIR/$release/csv..." - mkdir -p "$DATA_DIR/$release/csv" - else - # TODO: get full path - echo "CSV directory: $DATA_DIR/$release/csv" - fi - - # Check if DAT file exists - if [ -f "$DATA_DIR/$release/hla.$release.dat.zip" ]; then - echo "DAT file for release $release already exists" - else - echo "Downloading DAT file for release $release..." - # Should be environment variable - # https://github.com/ANHIG/IMGTHLA/raw/Latest/hla.dat.zip - imgt_hla_raw_url='https://github.com/ANHIG/IMGTHLA/raw' - echo "Downloading $imgt_hla_raw_url/$release/hla.dat.zip to $DATA_DIR/$release/hla.$release.dat.zip" - curl -SL "$imgt_hla_raw_url/$release/hla.dat.zip" > "$DATA_DIR/$release/hla.$release.dat.zip" - if [ $? -ne 0 ] || [ ! -s "$DATA_DIR/$release/hla.$release.dat.zip" ]; then - echo "Failed to download or empty file: $DATA_DIR/$release/hla.$release.dat.zip" - exit 1 - fi - unzip "$DATA_DIR/$release/hla.$release.dat.zip" -d "$DATA_DIR/$release" - mv "$DATA_DIR/$release/hla.dat" "$DATA_DIR/$release/hla.$release.dat" - fi - - # Builds CSV files - python3 "$SRC_DIR"/app.py \ - -o "$DATA_DIR/$release/csv" \ - -r "$release" \ - $KIRFLAG \ - $ALIGNFLAG \ - $MEM_PROFILE_FLAG \ - -v \ - -l $LIMIT \ - -u $FEATURE_SERVICE_URL - build_exit_status=$? - echo "Build exit status (0: SUCCESS, 1:CRITICAL, 2:WARNING): $build_exit_status" +echo "INFO: Processing release version: $version" + +# Check if data directory exists +# TODO: get full path for each +if [ ! -d "$DATA_DIR/$version/csv" ]; then + echo "INFO: Creating new directory in root: $DATA_DIR/$version/csv" + mkdir -p "$DATA_DIR/$version/csv" +else + echo "INFO: CSV directory: $DATA_DIR/$version/csv" +fi + +# Check if DAT file exists +if [ -f "$DATA_DIR/$version/hla.$version.dat" ]; then + echo "INFO: DAT file for release $version already exists" +else + + # download_url works for all releases including 3440 and earlier + echo "INFO: Fetching DAT file for release $version..." + download_url="$(get_download_url "$repository_owner" "$repository_name" "hla.dat.zip" "$commit_sha")" + get_asset "$download_url" "$DATA_DIR/$version/hla.$version.dat.zip" "$DATA_DIR/$version" +fi + +# Builds CSV files +# TODO booleans for kir, align, mem_profile are lower case, limit is now -1 instead of none +# TODO implement s3_path +python3 "$SRC_DIR"/app.py \ + -o "$DATA_DIR/$version/csv" \ + -r "$version" \ + $KIRFLAG \ + $ALIGNFLAG \ + $MEM_PROFILE_FLAG \ + -v \ + -l $limit \ + -u $FEATURE_SERVICE_URL - # Notify missing alleles with exit code 2 (warning for missing data but not fatal) - if [ $build_exit_status -eq 2 ]; then - echo "WARNING: Some alleles failed to build, please see logs for error messages" - fi +build_exit_status=$? 
+echo "INFO: Build exit status (0: SUCCESS, 1:CRITICAL, 2:WARNING): $build_exit_status" - # Fail for any exit code other than 0 or 2 - if [ $build_exit_status -ne 0 ] && [ $build_exit_status -ne 2 ]; then - echo "CRITICAL: Build failed, please see logs for error messages" - exit 1 - fi +# Notify missing alleles +if [ $build_exit_status -eq 2 ]; then +echo "WARNING: Some alleles failed to build, please see logs for error messages" +fi - # TODO: Use this S3 hierarchy: root/release/csv | logs - echo -e "Uploading CSVs to s3://$GFE_BUCKET/data/$release/csv/:\n$(ls $DATA_DIR/$release/csv/)" - aws s3 --recursive cp "$DATA_DIR/$release/csv/" s3://$GFE_BUCKET/data/$release/csv/ > "$LOGS_DIR/s3Copy$$LOG_FILE" - mv "$LOGS_DIR/gfeBuildLogs.txt" "$LOGS_DIR/gfeBuildLogs.$release.txt" - mv "$LOGS_DIR/s3Copy$$LOG_FILE" "$LOGS_DIR/s3CopyLog.$release.txt" +# fail for any exit code other than 0 or 2. 2 is a warning for missing data but not fatal. +if [ $build_exit_status -ne 0 ] && [ $build_exit_status -ne 2 ]; then +echo "CRITICAL: Build failed, please see logs for error messages" +exit 1 +fi - if [ "$MEM_PROFILE" == "True" ]; then - mv "$LOGS_DIR/mem_profile_agg.txt" "$LOGS_DIR/mem_profile_agg.$release.txt" - mv "$LOGS_DIR/mem_profile_diff.txt" "$LOGS_DIR/mem_profile_diff.$release.txt" - fi +# TODO: Use this S3 hierarchy: root/release/data/csv | logs +echo -e "INFO: Uploading data to s3://$GFE_BUCKET/data/$version" +res=$(aws s3 cp --recursive "$DATA_DIR/$version/" "s3://$GFE_BUCKET/data/$version/") + +echo $res + +if [ "$mem_profile" == "true" ]; then + mv "$LOGS_DIR/mem_profile_agg.txt" "$LOGS_DIR/mem_profile_agg.$version.txt" + mv "$LOGS_DIR/mem_profile_diff.txt" "$LOGS_DIR/mem_profile_diff.$version.txt" +fi + +echo -e "INFO: Uploading logs to s3://$GFE_BUCKET/logs/$version" +aws s3 --recursive cp "$LOGS_DIR/" s3://$GFE_BUCKET/logs/pipeline/build/$version/logs/ - echo -e "Uploading logs to s3://$GFE_BUCKET/logs/$release/:\n$(ls $LOGS_DIR/)" - aws s3 --recursive cp "$LOGS_DIR/" s3://$GFE_BUCKET/logs/pipeline/build/$release/ > "$LOGS_DIR/s3CopyLog.Local.txt" -done END_EXECUTION=$(( SECONDS - $START_EXECUTION )) -echo "Finished in $END_EXECUTION seconds" -exit 0 +echo "INFO: Finished in $END_EXECUTION seconds" # For debugging to keep the build server running -# sleep 1h +if [ "$DEBUG" == "true" ]; then + echo "INFO: DEBUG mode is set to $DEBUG. Sleeping..." 
+ while true; do sleep 1000; done +fi + +exit 0 \ No newline at end of file diff --git a/gfe-db/pipeline/jobs/build/src/app.py b/gfe-db/pipeline/jobs/build/src/app.py index ee94f214..b91433c9 100755 --- a/gfe-db/pipeline/jobs/build/src/app.py +++ b/gfe-db/pipeline/jobs/build/src/app.py @@ -565,6 +565,7 @@ def process_allele(allele, alignments_dict, csv_path=None): #data_dir = f'{data_dir}/{dbversion}' # data_dir = os.path.dirname(__file__) + f"/../data/{dbversion}" data_dir = os.environ["DATA_DIR"] + f"/{dbversion}" + logger.info(f"Found data directory: {data_dir}") # Load alignments data if align: diff --git a/gfe-db/pipeline/lambda_layers/gfe_db_models/gfedbmodels/__init__.py b/gfe-db/pipeline/lambda_layers/gfe_db_models/gfedbmodels/__init__.py new file mode 100644 index 00000000..0720e82c --- /dev/null +++ b/gfe-db/pipeline/lambda_layers/gfe_db_models/gfedbmodels/__init__.py @@ -0,0 +1,2 @@ +# from .utils import * +# from .ingest import * \ No newline at end of file diff --git a/gfe-db/pipeline/lambda_layers/gfe_db_models/gfedbmodels/constants.py b/gfe-db/pipeline/lambda_layers/gfe_db_models/gfedbmodels/constants.py new file mode 100644 index 00000000..aa3d6304 --- /dev/null +++ b/gfe-db/pipeline/lambda_layers/gfe_db_models/gfedbmodels/constants.py @@ -0,0 +1,26 @@ +import os +import logging +from awsparameters import AppConfig +from awsparameters.manager import SessionManager + +# Logging +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +# Environment variables +APP_NAME = os.environ["APP_NAME"] +STAGE = os.environ["STAGE"] +AWS_REGION = os.environ["AWS_REGION"] + +session = SessionManager(region_name=AWS_REGION) +session.get_client("ssm") + +# TODO parameterize paths or use a consolidated mapping for all layers +infra_config_path = f"/{APP_NAME}/{STAGE}/{AWS_REGION}/GfedbInfrastructureParamMappings" +infra = AppConfig(mappings_path=infra_config_path, boto3_session=session) + +pipeline_config_path = f"/{APP_NAME}/{STAGE}/{AWS_REGION}/GfedbPipelineParamMappings" +pipeline = AppConfig(mappings_path=pipeline_config_path, boto3_session=session) + +database_config_path = f"/{APP_NAME}/{STAGE}/{AWS_REGION}/GfedbDatabaseParamMappings" +database = AppConfig(mappings_path=database_config_path, boto3_session=session) diff --git a/gfe-db/pipeline/lambda_layers/gfe_db_models/gfedbmodels/ingest.py b/gfe-db/pipeline/lambda_layers/gfe_db_models/gfedbmodels/ingest.py new file mode 100644 index 00000000..f5d7ed71 --- /dev/null +++ b/gfe-db/pipeline/lambda_layers/gfe_db_models/gfedbmodels/ingest.py @@ -0,0 +1,292 @@ +import os +import logging +from typing import List, Dict, Union +import multiprocessing +from concurrent.futures import ThreadPoolExecutor, as_completed +from requests.exceptions import HTTPError +from time import sleep +from .types import ( + SourceConfig, + RepositoryConfig, + TargetMetadataConfig, + Commit, + ExecutionStateItem, + ExecutionDetailsConfig, +) +from .utils import ( + cache_pickle, + read_s3_json, + sort_execution_state_items, + filter_nulls, + get_repo_asset, + find_text, +) + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +# TODO use method SourceConfig().from_s3(bucket, key) instead +def read_source_config(s3_client, bucket, key): + data = read_s3_json(s3_client, bucket, key) + return SourceConfig(**data) + + +def process_execution_state_item( + timestamp: str, + commit: Dict[str, str], + repository_config: RepositoryConfig, + target_metadata_config: TargetMetadataConfig, + token: str = None, + limit: int = None, +) -> 
Dict[str, str]: + errors = 0 + sha = commit["sha"] + + for config in target_metadata_config.items: + try: + logger.info( + f"Getting release version for sha {sha} from {config.asset_path}" + ) + release_version = get_release_version_for_commit( + commit=commit, + owner=repository_config.owner, + repo=repository_config.name, + token=token, + asset_path=config.asset_path, + metadata_regex=config.metadata_regex, + ) + logger.info(f"Found release version {release_version} ({sha})") + + result = ( + None, + { + "created_utc": timestamp, + "repository": repository_config, + "commit": Commit(**commit), + "execution": ExecutionDetailsConfig( + version=release_version, + status="NOT_PROCESSED", + date_utc=None, + input_parameters=None, + ), + }, + ) + + except HTTPError as e: + # This is because Allelelist.txt for certain commits doesn't contain the release version or name + # Need to find another file that indicates the release version should be small + errors += 1 + logger.info(f"Commit {sha} failed: {e} for {config.asset_path}") + + # Throw error if all possible asset paths have been tried + if errors == len(target_metadata_config.items): + logger.error(f"Max errors reached, this sha must be skipped: {sha}") + + # return the sha if all asset paths have been tried + result = (sha, None) + return result + else: + continue + + # return error count and increment outside this function + + # TODO deserialize to ExecutionStateItem, use as method + return result + + +def parallel_process_execution_state_items( + timestamp: str, + commits: List[Dict[str, str]], + repository_config: RepositoryConfig, + target_metadata_config: TargetMetadataConfig, + token: str = None, + limit: int = None, +): + execution_state_items = [] + num_cores = multiprocessing.cpu_count() + num_threads = max(1, num_cores - 1) # Reserve one core for other processes + num_threads = min(6, num_cores) # limit threads to avoid GitHub API rate limit + + # Create a ThreadPoolExecutor with the specified number of threads + with ThreadPoolExecutor(max_workers=num_threads) as executor: + + # Submit the process_commit function for each commit to the executor + futures = [ + executor.submit( + process_execution_state_item, + timestamp, + commit, + repository_config, + target_metadata_config, + token, + ) + for commit in commits[:limit] + ] + + # Collect the results as they complete + results = [future.result() for future in as_completed(futures)] + + # Separate the results into execution state items and error shas + execution_state_items = [ + result[1] for result in results if result[1] is not None + ] + error_shas = [result[0] for result in results if result[0] is not None] + + return ( + error_shas, + [ + ExecutionStateItem(**item) + for item in sort_execution_state_items(filter_nulls(execution_state_items)) + ], + ) + + +# limit is int or None +# @cache_pickle +def process_execution_state_items( + timestamp: str, + commits: List[Dict[str, str]], + repository_config: RepositoryConfig, + target_metadata_config: TargetMetadataConfig, + token: str = None, + limit: None = None, + parallel: str = False, +) -> List[Dict[str, str]]: + if parallel == True: + if limit: + logger.warning("'limit' will not work if parallel processing is enabled") + return parallel_process_execution_state_items( + timestamp=timestamp, + commits=commits, + repository_config=repository_config, + target_metadata_config=target_metadata_config, + token=token, + limit=limit, + ) + else: + execution_state_items = [] + for commit in commits[:limit]: + 
execution_state_items.append( + process_execution_state_item( + timestamp=timestamp, + commit=commit, + repository_config=repository_config, + target_metadata_config=target_metadata_config, + token=token, + limit=limit, + ) + ) + + return [ + ExecutionStateItem(**item) + for item in sort_execution_state_items(filter_nulls(execution_state_items)) + ] + + +def get_release_version_for_commit( + commit: Union[Commit, dict], + owner: str, + repo: str, + token: str, + asset_path: str, + metadata_regex: str, +) -> int: + + try: + sha = commit["sha"] + except: + sha = commit.sha + allele_list = get_repo_asset( + owner=owner, repo=repo, token=token, path=asset_path, commit_sha=sha + ) + + release_version = find_text(metadata_regex, allele_list) + + if release_version is None: + + # TODO 2/12/24 save these shas (`fatal: reference is not a tree`) for debugging, need to get the file contents + raise Exception(f"Release version not found for commit {sha}") + + return int(release_version.replace(".", "")[:4]) + + +### debug ### +if __name__ == "__main__": + import sys + + sys.path.append(os.environ["GFEDBMODELS_PATH"]) + from pathlib import Path + import json + import boto3 + from datetime import datetime + from gfedbmodels.types import ExecutionStatus + from gfedbmodels.utils import get_utc_now + + s3 = boto3.client("s3") + + utc_now = get_utc_now() + + GITHUB_REPOSITORY_OWNER = os.environ["GITHUB_REPOSITORY_OWNER"] + GITHUB_REPOSITORY_NAME = os.environ["GITHUB_REPOSITORY_NAME"] + GITHUB_PERSONAL_ACCESS_TOKEN = os.environ["GITHUB_PERSONAL_ACCESS_TOKEN"] + + test_path = json.loads( + ( + Path(__file__).parent.parent.parent.parent + / "functions" + / "check_source_update" + / "most-recent-commits.json" + ).read_text() + ) + commits_with_releases = [] + for commit in test_path: + + # Get data source configuration + source_repo_config = read_source_config( + s3_client=s3, + bucket=os.environ["DATA_BUCKET_NAME"], + key=os.environ["PIPELINE_SOURCE_CONFIG_S3_PATH"], + ).repositories[f"{GITHUB_REPOSITORY_OWNER}/{GITHUB_REPOSITORY_NAME}"] + + # Loop through available file assets containing release version metadata + for asset_config in source_repo_config.target_metadata_config.items: + + # Get the release version for the commit by examining file asset contents + release_version = get_release_version_for_commit( + commit=commit, + owner=GITHUB_REPOSITORY_OWNER, + repo=GITHUB_REPOSITORY_NAME, + token=GITHUB_PERSONAL_ACCESS_TOKEN, + asset_path=asset_config.asset_path, + metadata_regex=asset_config.metadata_regex, + ) + logger.info( + f'Found release version {release_version} for commit {commit["sha"]}' + ) + + # Build the execution object to be stored in the state table (`execution__*` fields) + execution_detail = ExecutionDetailsConfig( + **{"version": release_version, "status": ExecutionStatus.PENDING} + ) + + # Build the repository object to be stored in the state table (`repository__*` fields) + repository_config = RepositoryConfig( + **{ + "owner": GITHUB_REPOSITORY_OWNER, + "name": GITHUB_REPOSITORY_NAME, + "url": f"https://github.com/{GITHUB_REPOSITORY_OWNER}/{GITHUB_REPOSITORY_NAME}", + } + ) + + # Assemble the execution state item for the new commit + execution_state_item = ExecutionStateItem( + created_utc=utc_now, + execution=execution_detail, + repository=repository_config, + commit=Commit.from_response_json(commit), + ) + commits_with_releases.append(execution_state_item) + + # break the loop if successful + break diff --git a/gfe-db/pipeline/lambda_layers/gfe_db_models/gfedbmodels/types.py 
b/gfe-db/pipeline/lambda_layers/gfe_db_models/gfedbmodels/types.py new file mode 100644 index 00000000..61a0d2f4 --- /dev/null +++ b/gfe-db/pipeline/lambda_layers/gfe_db_models/gfedbmodels/types.py @@ -0,0 +1,370 @@ +import re +from datetime import datetime +from typing import Optional +from enum import Enum +from pydantic import BaseModel, validator, root_validator +import jmespath +from .utils import restore_nested_json, filter_nested_nulls + + +class ExecutionStatus(str, Enum): + """ + ExecutionStatus is synced using the Step Functions DynamoDB integration: + NOT_PROCESSED: never processed (set by CheckSourceUpdate) ✅ + SKIPPED: never processed (set by CheckSourceUpdate) ✅ + PENDING: state machine execution started (set by CheckSourceUpdate) ✅ + BUILD_IN_PROGRESS: build started (set by State Machine) ✅ + BUILD_SUCCESS: build succeeded (set by State Machine) ✅ + LOAD_IN_PROGRESS: load started (set by State Machine) ✅ + LOAD_SUCCESS: load succeeded (set by State Machine) ✅ + LOAD_FAILED: load failed (set by State Machine) ✅ + LOAD_INVALID: load invalid from query results (set by State Machine) ✅ + LOAD_SKIPPED: load skipped (set by State Machine) ✅ + BUILD_FAILED: build failed (set by State Machine) ✅ + EXECUTION_FAILED: build or load failed (set by State Machine) ✅ + ABORTED: build or load aborted (set by UpdateExecutionState) ✅ + """ + + NOT_PROCESSED = "NOT_PROCESSED" + SKIPPED = "SKIPPED" + PENDING = "PENDING" + BUILD_IN_PROGRESS = "BUILD_IN_PROGRESS" + BUILD_SUCCESS = "BUILD_SUCCESS" + BUILD_FAILED = "BUILD_FAILED" + LOAD_IN_PROGRESS = "LOAD_IN_PROGRESS" + LOAD_COMPLETE = "LOAD_COMPLETE" + LOAD_SUCCESS = "LOAD_SUCCESS" + LOAD_FAILED = "LOAD_FAILED" + LOAD_INVALID = "LOAD_INVALID" + LOAD_SKIPPED = "LOAD_SKIPPED" + EXECUTION_FAILED = "EXECUTION_FAILED" + ABORTED = "ABORTED" + + @classmethod + def __contains__(cls, item): + return item in cls.__members__ + + +def str_to_datetime(v, fmt="%Y-%m-%dT%H:%M:%S.%fZ"): + return datetime.strptime(v, fmt) + + +def str_from_datetime(v, fmt="%Y-%m-%dT%H:%M:%SZ"): + return v.strftime(fmt) + + +# validate that date field is ISO 8601 format with timezone +def date_is_iso_8601_with_timezone(v): + # Check if the date is already in the desired ISO 8601 format with 3 milliseconds + if re.match(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$", v): + return v + + # Check if the date is in ISO 8601 format with fractional seconds (arbitrary number of digits) + match = re.match(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z$", v) + if match: + fractional_seconds = v.split(".")[1].split("Z")[0] + # Truncate or pad fractional seconds to 3 digits + truncated_fractional_seconds = fractional_seconds[:3].ljust(3, "0") + return v.replace(fractional_seconds, truncated_fractional_seconds) + + # Check if the date is in ISO 8601 format without fractional seconds + if re.match(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$", v): + # Add milliseconds and return + return v[:-1] + ".000Z" + + raise ValueError("Date must be in ISO 8601 format with timezone") + + +# validate that url is a valid URL +def url_is_valid(v): + if not re.match(r"^https?://", v): + raise ValueError("Url must be a valid URL") + return v + + +release_version_re = r"^[1-9][0-9]{1,2}0$" + +def version_is_valid(v, return_bool=False): + + if not return_bool: + if not re.match(release_version_re, str(v)): + raise ValueError( + f"Release version must match regex pattern `{release_version_re}`" + ) + return v + + elif return_bool: + return bool(re.match(release_version_re, str(v))) + + +# validate that commit sha is 
a 40 character hex string +def commit_sha_is_hex(v): + if not re.match(r"^[0-9a-f]{40}$", v): + raise ValueError(f"Commit sha must be a 40 character hex string matching regex pattern `^[0-9a-f]{{40}}$`: {v}") + return v + + +def s3_path_is_valid(v): + if not re.match(r"^s3://", v): + raise ValueError("S3 path must be a valid S3 path (s3://<bucket>/<key>)") + return v + + +### Source Config Models ### +class Commit(BaseModel): + sha: str + date_utc: str + message: Optional[str] = None + html_url: str + # # TODO the url field returns a JSON object with a list of files; see all-branches for more info + # url: str + + @validator("sha") + def _commit_sha_is_hex(cls, v): + return commit_sha_is_hex(v) + + # Map the GitHub API response onto the Commit model using jmespath + @classmethod + def from_response_json(cls, response_json): + return cls( + sha=jmespath.search("sha", response_json), + date_utc=jmespath.search("commit.committer.date", response_json), + message=jmespath.search("commit.message", response_json), + html_url=jmespath.search("html_url", response_json), + ) + + # validate that date is ISO 8601 format with timezone + @validator("date_utc") + def date_utc_is_iso_8601_with_timezone(cls, v): + return date_is_iso_8601_with_timezone(v) + + + +class InputParameters(BaseModel): + align: bool + kir: bool + mem_profile: bool + limit: Optional[int] = -1 + use_existing_build: Optional[bool] = False + skip_load: Optional[bool] = False + + # validate that limit is an integer equal to -1 or greater than 0 but not equal to 0 + @validator("limit") + def limit_is_valid(cls, v): + if v == -1: + return v + elif v > 0: + return v + else: + raise ValueError("Limit must be an integer equal to -1 or greater than 0") + + +class ExcludedCommitShas(BaseModel): + description: Optional[str] = None + values: list[str] + + @validator("values") + def _commit_shas_are_hex(cls, v): + for sha in v: + sha = commit_sha_is_hex(sha) + return v + + +class TrackedAssetsConfig(BaseModel): + description: Optional[str] = None + values: list[str] + + +class TargetMetadataConfigItem(BaseModel): + description: Optional[str] = None + asset_path: str # Path (in remote git repository, on GitHub) to the file to check for strings matching a release version + metadata_regex: str # Contextual metadata for the commit.
This is the regex to match the release version string that might be found in the assets being checked + + +class TargetMetadataConfig(BaseModel): + description: Optional[str] = None + items: list[TargetMetadataConfigItem] + + +class RepositoryConfig(BaseModel): + owner: str + name: str + description: Optional[str] = None + url: str + tracked_assets: Optional[TrackedAssetsConfig] = None + target_metadata_config: Optional[TargetMetadataConfig] = None + excluded_commit_shas: Optional[ExcludedCommitShas] = None + default_input_parameters: Optional[InputParameters] = None + + # validate that the url is a valid URL + @validator("url") + def url_is_valid(cls, v): + return url_is_valid(v) + + +class ExecutionDetailsConfig(BaseModel): + id: str = ( + None # Refers to execution id in Step Functions and is only set if the commit is processed + ) + invocation_id: str = ( + None # One invocation can have multiple executions depending on how many release versions are given + ) + version: int + status: str + date_utc: Optional[str] = None + input_parameters: Optional[InputParameters] = None + s3_path: Optional[str] = None + receipt_handle: Optional[str] = None + # TODO move execution errors here + + @validator("status") + def status_is_valid(cls, v): + if v not in ExecutionStatus.__members__: + raise ValueError( + f"Status must be one of {[value.value for value in ExecutionStatus.__members__.values()]}" + ) + return v + + # validate that version is a 4 digit number, position 0 is a number between 1 and 9, and position 1:2 is a number between 0 and 99 and position 3 is 0 + @validator("version") + def _version_is_valid(cls, v): + return version_is_valid(v) + + +class SourceConfig(BaseModel): + created_utc: Optional[str] = None + updated_utc: Optional[str] = None + repositories: dict[str, RepositoryConfig] + + # validate dates are ISO 8601 format with timezone for created_utc, updated_utc + @validator("created_utc", "updated_utc") + def date_utc_is_iso_8601_with_timezone(cls, v): + return date_is_iso_8601_with_timezone(v) + + +# Uses similar schema from Step Functions Fail state +class ExecutionError(BaseModel): + message: str + cause: str + + +# One item in the ExecutionState table +# The Primary Key is commit.sha, or commit__sha in the table +class ExecutionStateItem(BaseModel): + created_utc: Optional[str] = ( + None # Partial updates may not be able to include timestamps + ) + updated_utc: Optional[str] = ( + None # Partial updates may not be able to include timestamps + ) + repository: Optional[RepositoryConfig] + commit: Commit + execution: ExecutionDetailsConfig + error: Optional[ExecutionError] = None + s3_path: Optional[str] = None + + @classmethod + def from_execution_state_item_json(cls, execution_state_item: dict): + # Items from table are separated by "__" because "." 
is not allowed in DynamoDB + execution_state_item = restore_nested_json(execution_state_item, split_on="__") + return cls(**execution_state_item) + + # validate s3 path uses s3:/// format + @validator("s3_path") + def s3_path_is_valid(cls, v): + return s3_path_is_valid(v) + + +class ExecutionState(BaseModel): + created_utc: str + items: list[ExecutionStateItem] + + @root_validator(pre=True) + def set_items_created_utc(cls, values): + timestamp_utc = values.get("created_utc") + items = values.get("items", []) + try: + for item in items: + item.created_utc = timestamp_utc + except: + for item in items: + item["created_utc"] = timestamp_utc + return values + + # validate that items is sorted by commit.date_utc descending + @validator("items") + def execution_state_is_sorted(cls, v): + if not all( + v[i].commit.date_utc >= v[i + 1].commit.date_utc for i in range(len(v) - 1) + ): + raise ValueError( + "Execution history must be sorted by commit.date_utc descending" + ) + return v + + # Releases are formatted as a 3 or 4 digit integer incrementing by 10 with a lower bound of 300 + # Based on the formatting described, validate that no releases are missing from items + # Remember that the items is sorted by commit.date_utc descending and that release versions decrement by 10 + @validator("items") + def execution_state_has_no_missing_releases(cls, items): + actual_unique_release_versions = sorted( + list(set([item.execution.version for item in items])), reverse=True + ) + + first_version = 300 + expected_version = items[0].execution.version + for idx, actual_version in enumerate(actual_unique_release_versions): + + # Make custom adjust for releases under 3100 to account for inconsistent versioning + if actual_version == 390: + expected_version = int( + str(expected_version)[:1] + str(expected_version)[2:] + ) + + # If the version is not the expected version, raise an error + if actual_version != expected_version: + raise ValueError( + f"Execution history is missing version {expected_version}" + ) + + # If the version is the first version, then the expected version should be the last version + if actual_version == first_version: + if idx != len(actual_unique_release_versions) - 1: + raise ValueError( + f"Execution history has an unexpected version {actual_version}" + ) + break + + # Since release versioning increments by 10 in a complete dataset (all version), get the expected value using math + expected_version -= 10 + + return items + + +class ExecutionPayloadItem(BaseModel): + id: str + version: int + invocation_id: str + commit_sha: str + input_parameters: InputParameters + s3_path: str + + @validator("version") + def _version_is_valid(cls, v): + return version_is_valid(v) + + @validator("commit_sha") + def _commit_sha_is_hex(cls, v): + return commit_sha_is_hex(v) + + # create the ExecutionPayloadItem from ExecutionStateItem + @classmethod + def from_execution_state_item(cls, execution_state_item): + return cls( + id=execution_state_item.execution.id, + version=execution_state_item.execution.version, + invocation_id=execution_state_item.execution.invocation_id, + commit_sha=execution_state_item.commit.sha, + input_parameters=execution_state_item.execution.input_parameters, + s3_path=execution_state_item.execution.s3_path, + ) diff --git a/gfe-db/pipeline/lambda_layers/gfe_db_models/gfedbmodels/utils.py b/gfe-db/pipeline/lambda_layers/gfe_db_models/gfedbmodels/utils.py new file mode 100644 index 00000000..dc7132ba --- /dev/null +++ 
b/gfe-db/pipeline/lambda_layers/gfe_db_models/gfedbmodels/utils.py @@ -0,0 +1,567 @@ +import os +import logging +from typing import List, Dict, Union +from pydantic import BaseModel +from pathlib import Path +from itertools import chain, starmap +from datetime import datetime +import json +import pickle +import re +import requests +from botocore.exceptions import ClientError + +# Logging +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +AWS_REGION = os.environ["AWS_REGION"] + +cache_dir = Path(__file__).parent / "_cache" + +# TODO clear cache +# TODO disable/enable cache for testing + + +def get_utc_now(): + return datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z" + + +def save_json_to_cache(data, var_name): + """Saves data to cache directory""" + if not cache_dir.exists(): + cache_dir.mkdir() + + # Handles different types of JSON representations, if it fails to serialize, remove the file + try: + if isinstance(data, dict) or isinstance(data, List): + try: + with open(cache_dir / var_name, "w") as f: + json.dump(data, f, indent=4) + except: + # assume it's a list of pydantic models + with open(cache_dir / var_name, "w") as f: + json.dump([item.model_dump() for item in data], f, indent=4) + except Exception as e: + logger.error(f"Failed to serialize {var_name} to JSON: {e}") + # remove the file if it exists + if (cache_dir / var_name).exists(): + (cache_dir / var_name).unlink() + + +def save_pickle_to_cache(data, var_name): + """Saves data to cache directory""" + if not cache_dir.exists(): + cache_dir.mkdir() + with open(cache_dir / var_name, "wb") as f: + pickle.dump(data, f) + + +def load_json_from_cache(var_name): + """Loads data from cache directory""" + with open(cache_dir / var_name, "r") as f: + data = json.load(f) + return data + + +def load_pickle_from_cache(var_name): + """Loads data from cache directory""" + with open(cache_dir / var_name, "rb") as f: + data = pickle.load(f) + return data + + +# implement a @cache_json decorator to cache the results of the function in a file or load from cache if it exists +def cache_json(func): + """Decorator to cache function results""" + + def wrapper(*args, **kwargs): + var_name = func.__name__ + if (cache_dir / var_name).exists(): + logger.info(f"Loading {var_name} from cache") + return load_json_from_cache(var_name) + else: + logger.info(f"Saving {var_name} to cache") + data = func(*args, **kwargs) + save_json_to_cache(data, var_name) + return data + + return wrapper + + +# rewrite the cache_json decorator to work for pickle files +def cache_pickle(func): + """Decorator to cache function results""" + + def wrapper(*args, **kwargs): + var_name = func.__name__ + if (cache_dir / var_name).exists(): + logger.info(f"Loading {var_name} from cache") + return load_pickle_from_cache(var_name) + else: + logger.info(f"Saving {var_name} to cache") + data = func(*args, **kwargs) + save_pickle_to_cache(data, var_name) + return data + + return wrapper + + +def flatten_json(data, sep=".", skip_fields=[], select_fields=[]): + """Flatten a nested json file. For a list of dictionaries, use this + inside a for loop before converting to pandas DataFrame. + + Args: + data (dict): nested json file + sep (str, optional): separator for flattened keys. Defaults to ".". + skip_fields (list, optional): list of fields to skip. Defaults to []. + select_fields (list, optional): list of output fields to select including the separator. Defaults to []. 
+ """ + + def unpack(parent_key, parent_value): + """Unpack one level of nesting in json file""" + # Unpack one level only!!! + + if isinstance(parent_value, dict): + for key, value in parent_value.items(): + temp1 = parent_key + sep + key + yield temp1, value + elif isinstance(parent_value, list): + i = 0 + for value in parent_value: + temp2 = parent_key + sep + str(i) + i += 1 + yield temp2, value + else: + yield parent_key, parent_value + + # Keep iterating until the termination condition is satisfied + while True: + # Keep unpacking the json file until all values are atomic elements (not data or list) + data = dict(chain.from_iterable(starmap(unpack, data.items()))) + # Terminate condition: not any value in the json file is data or list + if not any(isinstance(value, dict) for value in data.values()) and not any( + isinstance(value, list) for value in data.values() + ): + break + + if len(skip_fields) > 0: + data = {k: v for k, v in data.items() if k not in skip_fields} + + if len(select_fields) > 0: + data = {k: v for k, v in data.items() if k in select_fields} + + return data + + +def read_s3_json(s3_client, bucket, key): + """Reads config file containing the current state of branches in + a GitHub repo""" + + try: + response = s3_client.get_object(Bucket=bucket, Key=key) + return json.loads(response["Body"].read().decode()) + + except ClientError as err: + logger.error(f"Failed to read config file to s3://{bucket}/{key}") + raise err + + +def write_s3_json(s3_client, bucket, key, data): + """Writes config file containing the current state of branches in + a GitHub repo""" + + try: + response = s3_client.put_object( + Bucket=bucket, Key=key, Body=json.dumps(data).encode() + ) + + except Exception as err: + logger.error( + f'Failed to write config file to s3://{bucket}/{key}. 
HTTPStatusCode: {response["ResponseMetadata"]["HTTPStatusCode"]}' + ) + raise err + + +def list_commits(owner, repo, token, **params): + """Return a list of GitHub commits for the specified repository""" + + base_url = "https://api.github.com" + + # Endpoint + endpoint = f"/repos/{owner}/{repo}/commits" + + url = base_url + endpoint + + # params = { + # "per_page": kwargs.get("per_page"), + # "page": kwargs.get("page"), + # } + + # Headers + headers = { + "Authorization": f"token {token}", + "Content-Type": "application/json", + "Accept": "application/vnd.github.v3+json", + "X-GitHub-Api-Version": "2022-11-28", + } + + response = requests.get(url, headers=headers, params=params) + response.raise_for_status() + + return response.json() + + +@cache_json +def paginate_commits(owner, repo, start_page=1, per_page=100, **kwargs): + page = start_page + commits = [] + while True: + response = list_commits(owner, repo, page=page, per_page=per_page, **kwargs) + if len(response) == 0: + break + # logger.debug(f"Page {page}: {len(response)} commits") + commits.extend(response) + page += 1 + + if len(commits) == 0: + raise ValueError("No commits found") + + return commits + + +def get_commit(owner, repo, token, commit_sha): + """Return the commit for the specified repository and commit SHA""" + + base_url = "https://api.github.com" + + # Endpoint + endpoint = f"/repos/{owner}/{repo}/commits/{commit_sha}" + url = base_url + endpoint + + # Headers + headers = { + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + "Accept": "application/vnd.github.v3+json", + "X-GitHub-Api-Version": "2022-11-28", + } + + response = requests.get(url, headers=headers) + response.raise_for_status() + + return response.json() + + +def get_file_contents(owner, repo, token, path): + base_url = "https://api.github.com" + + # Endpoint + endpoint = f"/repos/{owner}/{repo}/contents/{path}" + url = base_url + endpoint + + # Headers + headers = { + "Authorization": f"token {token}", + "Content-Type": "application/json", + "Accept": "application/vnd.github.v3+json", + "X-GitHub-Api-Version": "2022-11-28", + } + + response = requests.get(url, headers=headers) + response.raise_for_status() + + return response.json() + + +def get_commits_for_asset(owner, repo, token, path, since=None): + base_url = "https://api.github.com" + + # Endpoint + endpoint = f"/repos/{owner}/{repo}/commits" + url = base_url + endpoint + + # Headers + headers = { + "Authorization": f"token {token}", + "Content-Type": "application/json", + "Accept": "application/vnd.github.v3+json", + "X-GitHub-Api-Version": "2022-11-28", + } + + params = { + "path": path, + # validate date is in ISO 8601 format + "since": since.isoformat() if isinstance(since, datetime) else since, + } + + response = requests.get(url, headers=headers, params=params) + response.raise_for_status() + + return response.json() + + +def get_repo_contents(owner, repo, token, path, commit_sha=None): + base_url = "https://api.github.com" + + # Endpoint + endpoint = f"/repos/{owner}/{repo}/contents/{path}" + url = base_url + endpoint + + # Headers + headers = { + "Authorization": f"token {token}", + "Content-Type": "application/json", + "Accept": "application/vnd.github.v3+json", + "X-GitHub-Api-Version": "2022-11-28", + } + + params = {"ref": commit_sha} + + response = requests.get(url, headers=headers, params=params) + response.raise_for_status() + + # # check status + # if response.status_code != 200: + # logger.debug(json.dumps(response.json())) + # raise Exception(f"Asset not 
found at path '{path}'") + # else: + return response.json() + + +def get_repo_asset(owner, repo, token, path, commit_sha=None): + """Download a file from a GitHub repository""" + + # # todo debug error shas + # if commit_sha in [ + # "8d77b3dd93959663d58ae5b626289d0746edd0e7", + # "252d7c5dc9d2f7671447fd11fe6bb004c438f34b", + # "e1cd1ec3e66f4ab2b218f6758ed315f557778655", + # "fa208da83a7f96d62c1e4efee2018074bbd805e0", + # "09ed08b9abcd97622d59ec37e31b4706dc9a9391", + # "8db938b1eb58dd8c77cba9b7524f84cf8ffe719c", + # "041318439bf0ba291f990faaa27cd6ad0a062d13", + # "ba5cb3d05c7b3ba5024cdafa192d89af186f08a9", + # "7ca4eb239a96884142d3ef0b0182d3bc84ec1bba", + # "3abe7e12dcbc3824315959af4428c53bd760c6e7", + # "c4d3f67ef7ef4b5f6571b4f1d4aa5b928d2a3d56", + # "23044ee80c27f75bb34c9f9ac689b1c68cd65914" + # ]: + # print(f"Error sha: {commit_sha}") + + repo_contents = get_repo_contents(owner, repo, token, path, commit_sha) + + response = requests.get(repo_contents["download_url"]) + response.raise_for_status() + + # if response.status_code != 200: + # logger.error(f"Status code {response.status_code} for {path}") + # raise Exception(f"Error downloading {path}") + + return response.text + + +def get_branches(owner, repo, token): + """Fetch branches for a GitHub repository""" + + base_url = "https://api.github.com" + + # Endpoint + endpoint = f"/repos/{owner}/{repo}/branches" + url = base_url + endpoint + + # Headers + headers = { + "Authorization": f"token {token}", + "Content-Type": "application/json", + "Accept": "application/vnd.github.v3+json", + "X-GitHub-Api-Version": "2022-11-28", + } + + response = requests.get(url, headers=headers) + response.raise_for_status() + + return response.json() + + +def get_branch(owner, repo, token, branch_name): + """Fetch branches for a GitHub repository""" + + base_url = "https://api.github.com" + + # Endpoint + endpoint = f"/repos/{owner}/{repo}/branches/{branch_name}" + url = base_url + endpoint + + # Headers + headers = { + "Authorization": f"token {token}", + "Content-Type": "application/json", + "Accept": "application/vnd.github.v3+json", + "X-GitHub-Api-Version": "2022-11-28", + } + + response = requests.get(url, headers=headers) + response.raise_for_status() + + return response.json() + + +# Function to fetch pull requests +def get_pull_requests(owner, repo, token): + url = f"https://api.github.com/repos/{owner}/{repo}/pulls?state=all" + + # Headers + headers = { + "Authorization": f"token {token}", + "Content-Type": "application/json", + "Accept": "application/vnd.github.v3+json", + "X-GitHub-Api-Version": "2022-11-28", + } + response = requests.get(url, headers=headers) + + if response.status_code == 200: + return response.json() + else: + print(f"Error: {response.status_code}") + return [] + + +# def merge_release_version_with_commit(unique_shas, release_versions): +# # Convert release_versions to a dictionary for easier lookup +# release_versions_dict = {sha: version for sha, version in release_versions} + +# # Merge the arrays using a list comprehension +# merged_data = [(release_versions_dict[sha], sha, date) for sha, date in unique_shas] + +# # sort by date +# merged_data.sort(key=lambda x: x[2]) + +# return merged_data + + +def select_keys(d, keys): + """Selects keys from a dictionary""" + return {k: v for k, v in d.items() if k in keys} + + +def select_fields(dataset, fields): + """Select the fields for each record in an array of JSON objects""" + return [select_keys(x, fields) for x in dataset] + + +def rename_keys(d, key_names_map): + """Rename 
keys in a dictionary""" + return {key_names_map[k]: v for k, v in d.items()} + + +def rename_fields(dataset: List[dict], key_names_map: dict[str, str]): + """Rename fields in each record in an array of JSON objects""" + return [rename_keys(x, key_names_map) for x in dataset] + + +def flatten_json_records( + data, sep=".", skip_fields=[], select_fields=[], filter_nulls=True +): + """Flatten a list of JSON records.""" + if filter_nulls: + return [ + filter_null_fields( + flatten_json( + data=record, + sep=sep, + skip_fields=skip_fields, + select_fields=select_fields, + ) + ) + for record in data + ] + else: + return [ + flatten_json( + data=record, + sep=sep, + skip_fields=skip_fields, + select_fields=select_fields, + ) + for record in data + ] + + +def restore_nested_json(data: dict, split_on="."): + """Restores a previously flattened JSON object into a nested JSON object. + + Args: + data (dict): A flattened JSON object. + + Returns: + dict: A nested JSON object. + """ + result = {} + for key, value in data.items(): + parts = key.split(split_on) + current = result + for part in parts[:-1]: + if part not in current: + current[part] = {} + current = current[part] + current[parts[-1]] = value + return result + + +def find_text(pattern, input_str): + match = re.search(pattern, input_str) + if match: + text = match.group(0) + return text + + +def filter_nulls(items: List[Dict[str, str]]) -> List[Dict[str, str]]: + """Filter out null items from a list of dictionaries + + Args: + items (List[Dict[str, str]]): A list of dictionaries + + Returns: + List[Dict[str, str]]: A list of dictionaries with null items removed + """ + return [x for x in items if x is not None] + + +def filter_null_fields(items: dict) -> dict: + """Filter out null fields from a dictionary + + Args: + items (dict): A dictionary + + Returns: + dict: A dictionary with null fields removed + """ + return {k: v for k, v in items.items() if v is not None} + + +def filter_nested_nulls(data: Union[dict, list]): + """Filter out null fields from a nested dictionary or list of dictionaries + + Args: + data (Union[dict, list]): A nested dictionary or list of dictionaries + + Returns: + Union[dict, list]: A nested dictionary or list of dictionaries with null fields removed + """ + if isinstance(data, list): + return filter_nulls([filter_nested_nulls(i) for i in data]) + elif isinstance(data, dict): + return filter_null_fields({k: filter_nested_nulls(v) for k, v in data.items()}) + else: + return data + + +def sort_execution_state_items( + execution_state_items: List[Dict[str, str]], ascending=False +) -> List[Dict[str, str]]: + return sorted( + execution_state_items, + key=lambda x: x["commit"].date_utc, + reverse=(not ascending), + ) diff --git a/gfe-db/pipeline/lambda_layers/gfe_db_models/requirements.txt b/gfe-db/pipeline/lambda_layers/gfe_db_models/requirements.txt new file mode 100644 index 00000000..06394a3b --- /dev/null +++ b/gfe-db/pipeline/lambda_layers/gfe_db_models/requirements.txt @@ -0,0 +1,6 @@ +# --extra-index-url https://test.pypi.org/simple/ +pydantic~=2.0.0 +requests~=2.22.0 +urllib3<2 +aws-parameters~=0.1.8 +pygethub~=0.1.1 \ No newline at end of file diff --git a/gfe-db/pipeline/scripts/requirements.txt b/gfe-db/pipeline/scripts/requirements.txt new file mode 100644 index 00000000..d286715d --- /dev/null +++ b/gfe-db/pipeline/scripts/requirements.txt @@ -0,0 +1,5 @@ +pydantic~=2.0.0 +requests~=2.31.0 +boto3~=1.15.3 +aws-parameters~=0.1.8 +pygethub~=0.1.2 \ No newline at end of file diff --git 
a/gfe-db/pipeline/scripts/state/__init__.py b/gfe-db/pipeline/scripts/state/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/gfe-db/pipeline/scripts/state/build.py b/gfe-db/pipeline/scripts/state/build.py new file mode 100644 index 00000000..a2fed766 --- /dev/null +++ b/gfe-db/pipeline/scripts/state/build.py @@ -0,0 +1,161 @@ +""" +Builds the execution state for the given repository source from the static repository source configuration (`source-config.json`). +""" + +import os +import sys + +# for dev, local path to gfe-db modules +# ./gfe-db/pipeline/lambda_layers/gfe_db_models (use absolute path) +sys.path.append(os.environ["GFEDBMODELS_PATH"]) + +from pathlib import Path +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) +# from datetime import datetime +import json +from pygethub import list_branches, GitHubPaginator +from gfedbmodels.utils import get_utc_now, select_keys, filter_nested_nulls, get_commit +from gfedbmodels.types import ( + version_is_valid, + SourceConfig, + RepositoryConfig, + Commit, + ExecutionStateItem, + ExecutionDetailsConfig, + ExecutionState, +) + +# Environment variables +GITHUB_REPOSITORY_OWNER = os.environ["GITHUB_REPOSITORY_OWNER"] +GITHUB_REPOSITORY_NAME = os.environ["GITHUB_REPOSITORY_NAME"] +GITHUB_PERSONAL_ACCESS_TOKEN = os.environ["GITHUB_PERSONAL_ACCESS_TOKEN"] + + +def get_branch_commits(branches): + + # For each entry in all-branches, get the commit data and build the execution state item + execution_state_items = [] + + for item in branches: + + if not version_is_valid(item["name"], return_bool=True): + continue + + release_version = item["name"] + sha = item["commit"]["sha"] + + logger.info(f"Retrieving data for {sha}") + commit_json = get_commit( + GITHUB_REPOSITORY_OWNER, + GITHUB_REPOSITORY_NAME, + GITHUB_PERSONAL_ACCESS_TOKEN, + sha, + ) + assert sha == commit_json["sha"] + + commit = Commit( + sha=commit_json["sha"], + date_utc=commit_json["commit"]["author"]["date"], + message=commit_json["commit"]["message"], + html_url=commit_json["html_url"], + ) + + execution_state_item = ExecutionStateItem( + created_utc=utc_now, + updated_utc=utc_now, + commit=commit, + execution=ExecutionDetailsConfig( + version=release_version, + status="NOT_PROCESSED", + date_utc=None, + input_parameters=None, + ), + # error=None, + # s3_path=None, + repository=RepositoryConfig( + **select_keys( + source_config.repositories[ + GITHUB_REPOSITORY_OWNER + "/" + GITHUB_REPOSITORY_NAME + ].model_dump(), + ["owner", "name", "url"], + ) + ), + ) + execution_state_items.append(execution_state_item) + + return execution_state_items + + +def build_execution_state(branches, utc_now=None): + + utc_now = utc_now or get_utc_now() + + # Create ExecutionStateItems array from branch/commit sha pairs + execution_state_items = get_branch_commits(branches) + + # Sort execution state items by date descending + execution_state_items = sorted( + execution_state_items, key=lambda x: x.commit.date_utc, reverse=True + ) + + # Package records as ExecutionState object to seed table + execution_state = ExecutionState( + **{ + "created_utc": utc_now, + "items": execution_state_items, + } + ) + + return execution_state + + +if __name__ == "__main__": + + utc_now = get_utc_now() + + # Paths + try: + output_dir = Path(sys.argv[1]) + except IndexError: + raise ValueError("Output directory must be specified as first argument") + + with open(output_dir / "source-config.json", "r") as f: + source_config = SourceConfig(**json.load(f)) + 
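As an aside (not part of build.py), here is a minimal sketch of the shape `source-config.json` needs for the `SourceConfig` model above to accept it. The `owner/name` key format is inferred from how `source_config.repositories` is indexed later in this script; the owner, name, URL, and input-parameter values are illustrative assumptions, and `url_is_valid` is assumed to accept a standard GitHub repository URL.

```python
# Hypothetical example only: validate a hand-written source config against the
# pydantic models from gfedbmodels.types defined earlier in this diff.
from gfedbmodels.types import SourceConfig

example_source_config = {
    "repositories": {
        # Key format "owner/name" is assumed from how build.py looks up repositories.
        "ANHIG/IMGTHLA": {
            "owner": "ANHIG",
            "name": "IMGTHLA",
            "url": "https://github.com/ANHIG/IMGTHLA",
            "default_input_parameters": {
                "align": False,
                "kir": False,
                "mem_profile": False,
                "limit": -1,
            },
        }
    }
}

validated = SourceConfig(**example_source_config)
print(validated.repositories["ANHIG/IMGTHLA"].url)
```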
+ # Fetch all commits from repo using GitHub API, will be cached + logger.info("Processing source repository data") + + paginator = GitHubPaginator(GITHUB_PERSONAL_ACCESS_TOKEN) + + ### COMMITS BY BRANCHES ### + branch_pages = paginator.get_paginator( + list_branches, + owner=GITHUB_REPOSITORY_OWNER, + repo=GITHUB_REPOSITORY_NAME, + user_agent="nmdp-bioinformatics-gfe-db-state-builder/1.0", + ) + all_branches = list(branch_pages) + + execution_state = build_execution_state(all_branches, utc_now) + + # Updates the source config file but does not actually build it + source_config.created_utc, source_config.updated_utc = utc_now, utc_now + + logger.info( + f"Writing execution state to {str(output_dir / 'execution-state.json')}" + ) + + # write ExecutionState locally + with open(output_dir / "execution-state.json", "w") as f: + json.dump(filter_nested_nulls(execution_state.model_dump()), f, indent=4) + + logger.info(f"Updating source config in {str(output_dir / 'source-config.json')}") + + # write SourceConfig locally + with open(output_dir / f"source-config.json", "w") as f: + json.dump(source_config.model_dump(), f, indent=4) + + logger.info("Execution state and source config updated") diff --git a/gfe-db/pipeline/scripts/state/load.py b/gfe-db/pipeline/scripts/state/load.py new file mode 100644 index 00000000..ec338e48 --- /dev/null +++ b/gfe-db/pipeline/scripts/state/load.py @@ -0,0 +1,70 @@ +""" +Loads the initial gfe-db execution state to DynamoDB table. + +TODO Sync state to local script so it can be reloaded from local +TODO solution to avoid overwriting data when running this script (regular DynamoDB backups to S3 etc, fetch file from S3 and compare) +""" +import os +from pathlib import Path +import os +import sys + +# for dev, local path to gfe-db modules +# ./gfe-db/pipeline/lambda_layers/gfe_db_models (use absolute path) +sys.path.append(os.environ["GFEDBMODELS_PATH"]) + +import logging +import json +from gfedbmodels.constants import ( + session, + pipeline +) +from gfedbmodels.types import ( + ExecutionState, +) +from gfedbmodels.utils import flatten_json_records, get_utc_now + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +ssm = session.clients["ssm"] +dynamodb = session.resource("dynamodb") + +execution_state_table_fields = pipeline.params.GfeDbExecutionStateTableFields +execution_state_table_name = pipeline.params.GfeDbExecutionStateTableName + +table = dynamodb.Table(execution_state_table_name) + +if __name__ == "__main__": + + # TODO scan table for existing items and throw error if not empty, require --overwrite flag to proceed + + utc_now = get_utc_now() + + # Paths + input_dir = Path(sys.argv[1]) + + # read in source config JSON file from local + with open(input_dir / "execution-state.json", "r") as f: + execution_state = ExecutionState(**json.load(f)) + + # flatten JSON records for execution state table model + # Uses double-underscore as separator because DynamoDB does not allow dots in attribute names + execution_state_flat = flatten_json_records( + execution_state.model_dump()["items"], + sep="__", + select_fields=[ + item.replace(".", "__") for item in execution_state_table_fields + ], + filter_nulls=True, + ) + + # load to dynamodb table named execution_state_table_name using batch put + with table.batch_writer() as batch: + logger.info( + f"Loading {len(execution_state_flat)} items to {execution_state_table_name}" + ) + for item in execution_state_flat: + batch.put_item(Item=item) + + logger.info(f"Success") diff --git 
a/gfe-db/pipeline/statemachines/load-concurrency-manager.asl.json b/gfe-db/pipeline/statemachines/load-concurrency-manager.asl.json new file mode 100644 index 00000000..dee61a32 --- /dev/null +++ b/gfe-db/pipeline/statemachines/load-concurrency-manager.asl.json @@ -0,0 +1,293 @@ +{ + "StartAt": "Pre-Execution Backup", + "States": { + "Pre-Execution Backup": { + "Type": "Task", + "Resource": "${InvokeBackupScriptFunctionArn}", + "ResultPath": "$.backups.pre", + "Next": "Receive SQS Message", + "Retry": [ + { + "ErrorEquals": [ + "States.ALL" + ], + "IntervalSeconds": 5, + "MaxAttempts": 2, + "BackoffRate": 1.5 + } + ], + "Catch": [ + { + "ErrorEquals": [ + "States.ALL" + ], + "Next": "Receive SQS Message" + } + ] + }, + "Receive SQS Message": { + "Type": "Task", + "Resource": "${LcmReceiveMessageFunctionArn}", + "ResultPath": "$.sqs", + "Next": "Message Received?" + }, + "Message Received?": { + "Type": "Choice", + "Choices": [ + { + "Variable": "$.sqs.MessageId", + "IsPresent": true, + "Next": "Handle Load Query Execution" + } + ], + "Default": "Wait for Message" + }, + "Wait for Message": { + "Type": "Wait", + "Seconds": 30, + "Next": "Check Alarm State" + }, + "Handle Load Query Execution": { + "Type": "Parallel", + "OutputPath": "$.[0]", + "Next": "Check Alarm State", + "Catch": [ + { + "ErrorEquals": [ + "States.ALL" + ], + "Next": "Check Alarm State" + } + ], + "Branches": [ + { + "StartAt": "Execute Load Query", + "States": { + "Execute Load Query": { + "Type": "Task", + "Resource": "${InvokeLoadScriptFunctionArn}", + "ResultPath": "$", + "Next": "Sleep 10 Seconds", + "Retry": [ + { + "ErrorEquals": ["States.ALL"], + "IntervalSeconds": 5, + "MaxAttempts": 5, + "BackoffRate": 2.0 + } + ] + }, + "Sleep 10 Seconds": { + "Type": "Wait", + "Seconds": 10, + "Next": "Get Load Command Status" + }, + "Get Load Command Status": { + "Type": "Task", + "Resource": "arn:aws:states:::aws-sdk:ssm:getCommandInvocation", + "Parameters": { + "CommandId.$": "$.ssm.CommandId", + "InstanceId.$": "$.ssm.InstanceId", + "PluginName": "runShellScript" + }, + "Next": "Evaluate Load Command Status", + "ResultSelector": { + "CommandId.$": "$.CommandId", + "InstanceId.$": "$.InstanceId", + "Status.$": "$.Status", + "StandardOutputContent.$": "$.StandardOutputContent", + "StandardErrorContent.$": "$.StandardErrorContent" + }, + "ResultPath": "$.ssm", + "Retry": [ + { + "ErrorEquals": ["States.ALL"], + "IntervalSeconds": 5, + "MaxAttempts": 5, + "BackoffRate": 2.0 + } + ] + }, + "Evaluate Load Command Status": { + "Type": "Choice", + "Choices": [ + { + "Or": [ + { + "Variable": "$.ssm.Status", + "StringEquals": "Failed" + }, + { + "Variable": "$.ssm.Status", + "StringEquals": "Cancelled" + }, + { + "Variable": "$.ssm.Status", + "StringEquals": "TimedOut" + } + ], + "Next": "Return Message to Load Queue" + }, + { + "Variable": "$.ssm.Status", + "StringEquals": "Success", + "Next": "Delete Message from Load Queue" + } + ], + "Default": "Check Status" + }, + "Return Message to Load Queue": { + "Type": "Task", + "InputPath": "$", + "Resource": "arn:aws:states:::aws-sdk:sqs:changeMessageVisibility", + "Parameters": { + "QueueUrl": "${GfeDbLoadQueueUrl}", + "ReceiptHandle.$": "$.sqs.ReceiptHandle", + "VisibilityTimeout": 0 + }, + "ResultPath": null, + "Next": "Update Status → LOAD_FAILED" + }, + "Update Status → LOAD_FAILED": { + "Type": "Pass", + "Result": "LOAD_FAILED", + "ResultPath": "$.sqs.Body.state.execution.status", + "Next": "Sync Status → LOAD_FAILED" + }, + "Sync Status → LOAD_FAILED": { + "Type": "Task", + 
"Resource": "arn:aws:states:::dynamodb:updateItem", + "ResultPath": null, + "End": true, + "Parameters": { + "TableName": "${GfeDbExecutionStateTable}", + "Key": { + "commit__sha": { + "S.$": "$.sqs.Body.input.commit_sha" + }, + "execution__version": { + "N.$": "States.Format('{}', $.sqs.Body.input.version)" + } + }, + "UpdateExpression": "SET execution__status = :status, updated_utc = :updated_utc, error__message = :error, error__cause = :cause", + "ExpressionAttributeValues": { + ":status": { + "S.$": "$.sqs.Body.state.execution.status" + }, + ":updated_utc": { + "S.$": "$$.State.EnteredTime" + }, + ":error": { + "S.$": "$.ssm.Status" + }, + ":cause": { + "S.$": "$.ssm.StandardErrorContent" + } + } + } + }, + "Check Status": { + "Type": "Wait", + "Seconds": 30, + "Next": "Get Load Command Status" + }, + "Delete Message from Load Queue": { + "Type": "Task", + "InputPath": "$", + "Resource": "arn:aws:states:::aws-sdk:sqs:deleteMessage", + "Parameters": { + "QueueUrl": "${GfeDbLoadQueueUrl}", + "ReceiptHandle.$": "$.sqs.ReceiptHandle" + }, + "ResultPath": null, + "Next": "Update Status → LOAD_COMPLETE" + }, + "Update Status → LOAD_COMPLETE": { + "Type": "Pass", + "Result": "LOAD_COMPLETE", + "ResultPath": "$.sqs.Body.state.execution.status", + "Next": "Sync Status → LOAD_COMPLETE" + }, + "Sync Status → LOAD_COMPLETE": { + "Type": "Task", + "Resource": "arn:aws:states:::dynamodb:updateItem", + "InputPath": "$", + "ResultPath": null, + "End": true, + "Parameters": { + "TableName": "${GfeDbExecutionStateTable}", + "Key": { + "commit__sha": { + "S.$": "$.sqs.Body.input.commit_sha" + }, + "execution__version": { + "N.$": "States.Format('{}', $.sqs.Body.input.version)" + } + }, + "UpdateExpression": "SET execution__status = :status, updated_utc = :updated_utc", + "ExpressionAttributeValues": { + ":status": { + "S.$": "$.sqs.Body.state.execution.status" + }, + ":updated_utc": { + "S.$": "$$.State.EnteredTime" + } + } + } + } + } + } + ] + }, + "Check Alarm State": { + "Type": "Task", + "Resource": "arn:aws:states:::aws-sdk:cloudwatch:describeAlarms", + "Parameters": { + "AlarmNames": [ + "${UpdatePipelineStateMachineExecutionAlarmName}" + ] + }, + "ResultSelector": { + "StateValue.$": "$.MetricAlarms[0].StateValue" + }, + "Next": "Evaluate Alarm State" + }, + "Evaluate Alarm State": { + "Type": "Choice", + "Choices": [ + { + "Variable": "$.StateValue", + "StringMatches": "ALARM", + "Next": "Sleep 10" + }, + { + "Variable": "$.StateValue", + "StringMatches": "INSUFFICIENT_DATA", + "Next": "Sleep 10" + } + ], + "Default": "Post-Execution Backup" + }, + "Sleep 10": { + "Type": "Wait", + "Seconds": 10, + "Next": "Receive SQS Message" + }, + "Post-Execution Backup": { + "Type": "Task", + "Resource": "${InvokeBackupScriptFunctionArn}", + "ResultPath": "$.backups.post", + "Retry": [ + { + "ErrorEquals": [ + "States.ALL" + ], + "IntervalSeconds": 5, + "MaxAttempts": 2, + "BackoffRate": 1.5 + } + ], + "End": true + } + } +} \ No newline at end of file diff --git a/gfe-db/pipeline/statemachines/pipeline.asl.json b/gfe-db/pipeline/statemachines/pipeline.asl.json deleted file mode 100644 index 7b0c8040..00000000 --- a/gfe-db/pipeline/statemachines/pipeline.asl.json +++ /dev/null @@ -1,236 +0,0 @@ -{ - "StartAt": "Pre-Execution Validation Query", - "States": { - "Pre-Execution Validation Query": { - "Type": "Task", - "Resource": "${ExecuteValidationQueriesFunctionArn}", - "ResultPath": "$.validations.queries.pre", - "Next": "Pre-Execution Backup" - }, - "Pre-Execution Backup": { - "Type": "Task", - "Resource": 
"${InvokeBackupScriptFunctionArn}", - "ResultPath": "$.backups.pre", - "Next": "Build Stage" - }, - "Build Stage": { - "Type": "Map", - "Next": "Load Stage", - "MaxConcurrency": 0, - "ItemsPath": "$.input", - "ItemSelector": { - "input.$": "$$.Map.Item.Value" - }, - "ResultPath": "$.validations.build", - "Catch": [ - { - "ErrorEquals": [ - "States.ALL" - ], - "Next": "Execution Failed" - } - ], - "ItemProcessor": { - "StartAt": "Check Existing Build", - "States": { - "Check Existing Build": { - "Type": "Task", - "Resource": "arn:aws:states:::aws-sdk:s3:listObjects", - "Parameters": { - "Bucket": "${DataBucketName}", - "Prefix.$": "States.Format('data/{}/csv/', $.input.RELEASES)" - }, - "ResultPath": "$.check_existing_build", - "Next": "Files Exist?" - }, - "Files Exist?": { - "Type": "Choice", - "Choices": [ - { - "Variable": "$.check_existing_build.Contents", - "IsPresent": false, - "Next": "Generate CSV files" - } - ], - "Default": "Use Existing Build?" - }, - "Use Existing Build?": { - "Type": "Choice", - "Choices": [ - { - "Variable": "$.input.USE_EXISTING_BUILD", - "StringEquals": "False", - "Next": "Generate CSV files" - } - ], - "Default": "Validate Build" - }, - "Generate CSV files": { - "Type": "Task", - "Resource": "arn:aws:states:::batch:submitJob.sync", - "ResultPath": null, - "Next": "Validate Build", - "Catch": [ - { - "ErrorEquals": [ - "States.ALL" - ], - "Next": "Fail Build" - } - ], - "Parameters": { - "JobDefinition": "${BuildJobDefinition}", - "JobName": "${BuildJobName}", - "JobQueue": "${BuildJobQueue}", - "ContainerOverrides": { - "Environment": [ - { - "Name": "RELEASES", - "Value.$": "$.input.RELEASES" - }, - { - "Name": "ALIGN", - "Value.$": "$.input.ALIGN" - }, - { - "Name": "KIR", - "Value.$": "$.input.KIR" - }, - { - "Name": "MEM_PROFILE", - "Value.$": "$.input.MEM_PROFILE" - }, - { - "Name": "LIMIT", - "Value.$": "$.input.LIMIT" - } - ] - } - } - }, - "Validate Build": { - "Type": "Task", - "Resource": "${ValidateBuildOutputFunctionArn}", - "Parameters": { - "execution_id.$": "$$.Execution.Id", - "execution_start_time.$": "$$.Execution.StartTime", - "input.$": "$.input" - }, - "Next": "Evaluate Payload" - }, - "Evaluate Payload": { - "Type": "Choice", - "Choices": [ - { - "Variable": "$.is_valid_build", - "BooleanEquals": true, - "Next": "Pass Build" - } - ], - "Default": "Fail Build" - }, - "Fail Build": { - "Type": "Pass", - "ResultPath": null, - "End": true - }, - "Pass Build": { - "Type": "Pass", - "ResultPath": null, - "End": true - } - } - } - }, - "Execution Failed": { - "Type": "Fail", - "Error": "ExecutionFailed", - "Cause": "No valid payload was generated" - }, - "Load Stage": { - "Type": "Map", - "Next": "Post-Execution Validation Query", - "MaxConcurrency": 1, - "ItemsPath": "$.validations.build", - "ResultPath": null, - "Iterator": { - "StartAt": "Skip Load?", - "States": { - "Skip Load?": { - "Type": "Choice", - "Choices": [ - { - "Variable": "$.input.SKIP_LOAD", - "StringEquals": "True", - "Next": "Skipped" - } - ], - "Default": "Load Data" - }, - "Skipped": { - "Type": "Pass", - "ResultPath": null, - "End": true - }, - "Load Data": { - "Type": "Parallel", - "ResultPath": null, - "End": true, - "Branches": [ - { - "StartAt": "Invoke Load Script", - "States": { - "Invoke Load Script": { - "Type": "Task", - "Resource": "${InvokeLoadScriptFunctionArn}", - "InputPath": "$.input", - "ResultPath": null, - "End": true - } - } - }, - { - "StartAt": "Load GFEs", - "States": { - "Load GFEs": { - "Type": "Task", - "Resource": "${LoadNeo4jActivityArn}", - 
"HeartbeatSeconds": 60, - "InputPath": "$.input", - "ResultPath": null, - "End": true, - "Catch": [ - { - "ErrorEquals": [ - "States.ALL" - ], - "Next": "Fail Load" - } - ] - }, - "Fail Load": { - "Type": "Pass", - "ResultPath": null, - "End": true - } - } - } - ] - } - } - } - }, - "Post-Execution Validation Query": { - "Type": "Task", - "Resource": "${ExecuteValidationQueriesFunctionArn}", - "ResultPath": "$.validations.queries.post", - "Next": "Post-Execution Backup" - }, - "Post-Execution Backup": { - "Type": "Task", - "Resource": "${InvokeBackupScriptFunctionArn}", - "ResultPath": "$.backups.post", - "End": true - } - } -} \ No newline at end of file diff --git a/gfe-db/pipeline/statemachines/update-pipeline.asl.json b/gfe-db/pipeline/statemachines/update-pipeline.asl.json new file mode 100644 index 00000000..78bfe7ca --- /dev/null +++ b/gfe-db/pipeline/statemachines/update-pipeline.asl.json @@ -0,0 +1,630 @@ +{ + "StartAt": "Pre-Execution Tasks", + "States": { + "Pre-Execution Tasks": { + "Type": "Parallel", + "OutputPath": "$.[0]", + "Next": "Update Status → BUILD_IN_PROGRESS", + "Catch": [ + { + "ErrorEquals": [ + "States.ALL" + ], + "Next": "Handle Error" + } + ], + "Branches": [ + { + "StartAt": "Run Pre-Execution Validation Query", + "States": { + "Run Pre-Execution Validation Query": { + "Type": "Task", + "Resource": "${ExecuteValidationQueriesFunctionArn}", + "ResultPath": "$.validations.queries.pre", + "Next": "Get Execution State" + }, + "Get Execution State": { + "Type": "Task", + "Resource": "${GetExecutionStateFunctionArn}", + "ResultPath": "$.state", + "End": true + } + } + } + ] + }, + "Update Status → BUILD_IN_PROGRESS": { + "Type": "Pass", + "Result": "BUILD_IN_PROGRESS", + "ResultPath": "$.state.execution.status", + "Next": "Build Stage" + }, + "Build Stage": { + "Type": "Parallel", + "OutputPath": "$.[1]", + "Next": "Skip Load❓", + "Catch": [ + { + "ErrorEquals": [ + "States.ALL" + ], + "Next": "Handle Error" + } + ], + "Branches": [ + { + "StartAt": "Sync Status → BUILD_IN_PROGRESS", + "States": { + "Sync Status → BUILD_IN_PROGRESS": { + "Type": "Task", + "Resource": "arn:aws:states:::dynamodb:updateItem", + "ResultPath": null, + "End": true, + "Parameters": { + "TableName": "${GfeDbExecutionStateTable}", + "Key": { + "commit__sha": { + "S.$": "$$.Execution.Input.input.commit_sha" + }, + "execution__version": { + "N.$": "States.Format('{}', $$.Execution.Input.input.version)" + } + }, + "UpdateExpression": "SET execution__date_utc = :execution_date_utc, execution__status = :status, updated_utc = :updated_utc", + "ExpressionAttributeValues": { + ":execution_date_utc": { + "S.$": "$$.Execution.StartTime" + }, + ":status": { + "S.$": "$.state.execution.status" + }, + ":updated_utc": { + "S.$": "$$.State.EnteredTime" + } + } + } + } + } + }, + { + "StartAt": "Check Existing Build", + "States": { + "Check Existing Build": { + "Type": "Task", + "Resource": "arn:aws:states:::aws-sdk:s3:listObjects", + "Parameters": { + "Bucket": "${DataBucketName}", + "Prefix.$": "States.Format('data/{}/csv/', $.input.version)" + }, + "ResultPath": "$.validations.check_existing_build", + "Next": "Files Exist❓" + }, + "Files Exist❓": { + "Type": "Choice", + "Choices": [ + { + "Variable": "$.validations.check_existing_build.Contents", + "IsPresent": false, + "Next": "Generate CSV Files" + } + ], + "Default": "Use Existing Build❓" + }, + "Use Existing Build❓": { + "Type": "Choice", + "Choices": [ + { + "Variable": "$.input.input_parameters.use_existing_build", + "BooleanEquals": false, + 
"Next": "Generate CSV Files" + } + ], + "Default": "Validate Build 🧐" + }, + "Generate CSV Files": { + "Type": "Task", + "Resource": "arn:aws:states:::batch:submitJob.sync", + "ResultPath": null, + "Next": "Validate Build 🧐", + "Parameters": { + "JobDefinition": "${BuildJobDefinition}", + "JobName": "${BuildJobName}", + "JobQueue": "${BuildJobQueue}", + "ContainerOverrides": { + "Environment": [ + { + "Name": "EVENT", + "Value.$": "States.JsonToString($)" + } + ] + }, + "Tags": { + "Key": "Name", + "Value.$": "States.Format('${Stage}-${AppName}-worker-{}', $.input.version)" + } + } + }, + "Validate Build 🧐": { + "Type": "Task", + "Resource": "${ValidateBuildOutputFunctionArn}", + "Parameters": { + "execution_id.$": "$$.Execution.Id", + "execution_start_time.$": "$$.Execution.StartTime", + "input.$": "$.input" + }, + "ResultPath": "$.validations.build_outputs", + "Next": "Is Build Successful❓" + }, + "Is Build Successful❓": { + "Type": "Choice", + "Choices": [ + { + "Variable": "$.validations.build_outputs.is_valid_build", + "BooleanEquals": true, + "Next": "Update Status → BUILD_SUCCESS" + } + ], + "Default": "Update Status → BUILD_FAILURE" + }, + "Update Status → BUILD_SUCCESS": { + "Type": "Pass", + "Result": "BUILD_SUCCESS", + "ResultPath": "$.state.execution.status", + "Next": "Sync Status → BUILD_SUCCESS" + }, + "Sync Status → BUILD_SUCCESS": { + "Type": "Task", + "Resource": "arn:aws:states:::dynamodb:updateItem", + "ResultPath": null, + "End": true, + "Parameters": { + "TableName": "${GfeDbExecutionStateTable}", + "Key": { + "commit__sha": { + "S.$": "$$.Execution.Input.input.commit_sha" + }, + "execution__version": { + "N.$": "States.Format('{}', $$.Execution.Input.input.version)" + } + }, + "UpdateExpression": "SET execution__status = :status, updated_utc = :updated_utc", + "ExpressionAttributeValues": { + ":status": { + "S.$": "$.state.execution.status" + }, + ":updated_utc": { + "S.$": "$$.State.EnteredTime" + } + } + } + }, + "Update Status → BUILD_FAILURE": { + "Type": "Pass", + "Result": "BUILD_FAILURE", + "ResultPath": "$.state.execution.status", + "Next": "Sync Status → BUILD_FAILURE" + }, + "Sync Status → BUILD_FAILURE": { + "Type": "Task", + "Resource": "arn:aws:states:::dynamodb:updateItem", + "ResultPath": null, + "Next": "Fail Build", + "Parameters": { + "TableName": "${GfeDbExecutionStateTable}", + "Key": { + "commit__sha": { + "S.$": "$$.Execution.Input.input.commit_sha" + }, + "execution__version": { + "N.$": "States.Format('{}', $$.Execution.Input.input.version)" + } + }, + "UpdateExpression": "SET execution__status = :status, updated_utc = :updated_utc", + "ExpressionAttributeValues": { + ":status": { + "S.$": "$.state.execution.status" + }, + ":updated_utc": { + "S.$": "$$.State.EnteredTime" + } + } + } + }, + "Fail Build": { + "Type": "Fail", + "Error": "BuildFailed", + "CausePath": "States.JsonToString($.validations.build_outputs.errors)" + } + } + } + ] + }, + "Skip Load❓": { + "Type": "Choice", + "Choices": [ + { + "Variable": "$.input.input_parameters.skip_load", + "BooleanEquals": true, + "Next": "Handle Skipped Load" + } + ], + "Default": "Update Status → LOAD_IN_PROGRESS" + }, + "Handle Skipped Load": { + "Type": "Parallel", + "Catch": [ + { + "ErrorEquals": [ + "States.ALL" + ], + "Next": "Handle Error" + } + ], + "Branches": [ + { + "StartAt": "Update Status → LOAD_SKIPPED", + "States": { + "Update Status → LOAD_SKIPPED": { + "Type": "Pass", + "Result": "LOAD_SKIPPED", + "ResultPath": "$.state.execution.status", + "Next": "Sync Status → LOAD_SKIPPED" + 
}, + "Sync Status → LOAD_SKIPPED": { + "Type": "Task", + "Resource": "arn:aws:states:::dynamodb:updateItem", + "ResultPath": null, + "End": true, + "Parameters": { + "TableName": "${GfeDbExecutionStateTable}", + "Key": { + "commit__sha": { + "S.$": "$$.Execution.Input.input.commit_sha" + }, + "execution__version": { + "N.$": "States.Format('{}', $$.Execution.Input.input.version)" + } + }, + "UpdateExpression": "SET execution__status = :status, updated_utc = :updated_utc", + "ExpressionAttributeValues": { + ":status": { + "S.$": "$.state.execution.status" + }, + ":updated_utc": { + "S.$": "$$.State.EnteredTime" + } + } + } + } + } + } + ], + "Next": "Format Results" + }, + "Update Status → LOAD_IN_PROGRESS": { + "Type": "Pass", + "Result": "LOAD_IN_PROGRESS", + "ResultPath": "$.state.execution.status", + "Next": "Load Stage" + }, + "Load Stage": { + "Type": "Parallel", + "OutputPath": "$.[1]", + "Next": "Format Results", + "Catch": [ + { + "ErrorEquals": [ + "States.ALL" + ], + "Next": "Handle Error" + } + ], + "Branches": [ + { + "StartAt": "Sync Status → LOAD_IN_PROGRESS", + "States": { + "Sync Status → LOAD_IN_PROGRESS": { + "Type": "Task", + "Resource": "arn:aws:states:::dynamodb:updateItem", + "ResultPath": null, + "End": true, + "Parameters": { + "TableName": "${GfeDbExecutionStateTable}", + "Key": { + "commit__sha": { + "S.$": "$$.Execution.Input.input.commit_sha" + }, + "execution__version": { + "N.$": "States.Format('{}', $$.Execution.Input.input.version)" + } + }, + "UpdateExpression": "SET execution__status = :status, updated_utc = :updated_utc", + "ExpressionAttributeValues": { + ":status": { + "S.$": "$.state.execution.status" + }, + ":updated_utc": { + "S.$": "$$.State.EnteredTime" + } + } + } + } + } + }, + { + "StartAt": "Append Execution Id", + "States": { + "Append Execution Id": { + "Type": "Pass", + "Parameters": { + "input.$": "$.input", + "validations.$": "$.validations", + "state.$": "$.state", + "execution_arn.$": "$$.Execution.Id" + }, + "ResultPath": "$", + "Next": "Queue Load" + }, + "Queue Load": { + "Type": "Task", + "Resource": "arn:aws:states:::aws-sdk:sqs:sendMessage", + "Parameters": { + "QueueUrl": "${GfeDbLoadQueueUrl}", + "MessageGroupId": "${Stage}-${AppName}", + "MessageDeduplicationId.$": "States.Format('{}', $.input.version)", + "MessageBody.$": "$" + }, + "ResultPath": null, + "Next": "Wait for Load Status" + }, + "Wait for Load Status": { + "Type": "Wait", + "Seconds": 60, + "Next": "Check Status" + }, + "Check Status": { + "Type": "Task", + "Resource": "${GetExecutionStateFunctionArn}", + "ResultPath": "$.state", + "Next": "Is Load Complete❓" + }, + "Is Load Complete❓": { + "Type": "Choice", + "Choices": [ + { + "Variable": "$.state.execution.status", + "StringEquals": "LOAD_COMPLETE", + "Next": "Run Validation Query" + }, + { + "Variable": "$.state.execution.status", + "StringEquals": "LOAD_FAILED", + "Next": "Fail Load" + } + ], + "Default": "Wait for Load Status" + }, + "Fail Load": { + "Type": "Fail", + "ErrorPath": "States.JsonToString($.state.error.message)", + "CausePath": "States.JsonToString($.state.error.cause)" + }, + "Run Validation Query": { + "Type": "Task", + "Resource": "${ExecuteValidationQueriesFunctionArn}", + "ResultPath": "$.validations.queries.post", + "Next": "Evaluate Query Results", + "Retry": [ + { + "ErrorEquals": ["States.ALL"], + "IntervalSeconds": 2, + "MaxAttempts": 3, + "BackoffRate": 2 + } + ] + }, + "Evaluate Query Results": { + "Type": "Task", + "Resource": "${EvaluateQueryResultsFunctionArn}", + "ResultPath": 
"$.validations.load_results", + "Next": "Is Load Valid❓" + }, + "Is Load Valid❓": { + "Type": "Choice", + "Choices": [ + { + "Variable": "$.validations.load_results.is_load_successful.value", + "BooleanEquals": true, + "Next": "Update Status → LOAD_SUCCESS" + } + ], + "Default": "Update Status → LOAD_INVALID" + }, + "Update Status → LOAD_SUCCESS": { + "Type": "Pass", + "Result": "LOAD_SUCCESS", + "ResultPath": "$.state.execution.status", + "Next": "Sync Status → LOAD_SUCCESS" + }, + "Sync Status → LOAD_SUCCESS": { + "Type": "Task", + "Resource": "arn:aws:states:::dynamodb:updateItem", + "ResultPath": null, + "End": true, + "Parameters": { + "TableName": "${GfeDbExecutionStateTable}", + "Key": { + "commit__sha": { + "S.$": "$$.Execution.Input.input.commit_sha" + }, + "execution__version": { + "N.$": "States.Format('{}', $$.Execution.Input.input.version)" + } + }, + "UpdateExpression": "SET execution__status = :status, updated_utc = :updated_utc", + "ExpressionAttributeValues": { + ":status": { + "S.$": "$.state.execution.status" + }, + ":updated_utc": { + "S.$": "$$.State.EnteredTime" + } + } + } + }, + "Update Status → LOAD_INVALID": { + "Type": "Pass", + "Result": "LOAD_INVALID", + "ResultPath": "$.state.execution.status", + "Next": "Sync Status → LOAD_INVALID" + }, + "Sync Status → LOAD_INVALID": { + "Type": "Task", + "Resource": "arn:aws:states:::dynamodb:updateItem", + "ResultPath": null, + "Next": "Validation Failed", + "Parameters": { + "TableName": "${GfeDbExecutionStateTable}", + "Key": { + "commit__sha": { + "S.$": "$$.Execution.Input.input.commit_sha" + }, + "execution__version": { + "N.$": "States.Format('{}', $$.Execution.Input.input.version)" + } + }, + "UpdateExpression": "SET execution__status = :status, updated_utc = :updated_utc", + "ExpressionAttributeValues": { + ":status": { + "S.$": "$.state.execution.status" + }, + ":updated_utc": { + "S.$": "$$.State.EnteredTime" + } + } + } + }, + "Validation Failed": { + "Type": "Fail", + "Error": "LoadValidationFailed", + "CausePath": "States.JsonToString($.validations.load_results.errors)" + } + } + } + ] + }, + "Handle Error": { + "Type": "Parallel", + "OutputPath": "$.[0]", + "Next": "Format Results", + "Branches": [ + { + "StartAt": "Get Current Status", + "States": { + "Get Current Status": { + "Type": "Task", + "Resource": "${GetExecutionStateFunctionArn}", + "ResultPath": "$.state", + "Parameters": { + "input.$": "$$.Execution.Input.input" + }, + "Next": "Is Status Current❓" + }, + "Is Status Current❓": { + "Type": "Choice", + "Choices": [ + { + "Or": [ + { + "Variable": "$.state.execution.status", + "StringEquals": "BUILD_FAILED" + }, + { + "Variable": "$.state.execution.status", + "StringEquals": "LOAD_FAILED" + } + ], + "Next": "Sync Error Status" + } + ], + "Default": "Update Status → EXECUTION_FAILED" + }, + "Update Status → EXECUTION_FAILED": { + "Type": "Pass", + "Result": "EXECUTION_FAILED", + "ResultPath": "$.state.execution.status", + "Next": "Sync Error Status" + }, + "Sync Error Status": { + "Type": "Task", + "Resource": "arn:aws:states:::dynamodb:updateItem", + "ResultPath": null, + "End": true, + "Parameters": { + "TableName": "${GfeDbExecutionStateTable}", + "Key": { + "commit__sha": { + "S.$": "$$.Execution.Input.input.commit_sha" + }, + "execution__version": { + "N.$": "States.Format('{}', $$.Execution.Input.input.version)" + } + }, + "UpdateExpression": "SET execution__status = :status, updated_utc = :updated_utc, error__message = :error, error__cause = :cause", + "ExpressionAttributeValues": { + 
":status": { + "S.$": "$.state.execution.status" + }, + ":updated_utc": { + "S.$": "$$.State.EnteredTime" + }, + ":error": { + "S.$": "$.Error" + }, + ":cause": { + "S.$": "$.Cause" + } + } + } + } + } + } + ] + }, + "Format Results": { + "Type": "Task", + "Resource": "${FormatResultsFunctionArn}", + "ResultPath": "$.report", + "Next": "Publish Report 📡" + }, + "Publish Report 📡": { + "Type": "Task", + "Resource": "arn:aws:states:::sns:publish", + "ResultPath": null, + "Parameters": { + "TopicArn": "${GfeDbExecutionResultTopicArn}", + "Message.$": "$.report" + }, + "Next": "Success or Failure❓" + }, + "Success or Failure❓": { + "Type": "Choice", + "Choices": [ + { + "Variable": "$.Error", + "IsPresent": true, + "Next": "Pipeline Failure 😖" + } + ], + "Default": "Pipeline Success 😁" + }, + "Pipeline Failure 😖": { + "Type": "Fail", + "ErrorPath": "$.Error", + "CausePath": "$.Cause" + }, + "Pipeline Success 😁": { + "Type": "Succeed" + } + } +} \ No newline at end of file diff --git a/gfe-db/pipeline/template.yaml b/gfe-db/pipeline/template.yaml index 8ca723b6..c148a514 100644 --- a/gfe-db/pipeline/template.yaml +++ b/gfe-db/pipeline/template.yaml @@ -1,7 +1,6 @@ AWSTemplateFormatVersion: "2010-09-09" Transform: AWS::Serverless-2016-10-31 Description: Deploys IAM, ECR repositories, AWS Batch Jobs and a State Machine for the gfe-db update pipeline - Parameters: Stage: Type: String @@ -14,83 +13,145 @@ Parameters: # AllowedValues: # - "true" # - "false" - usePrivateSubnet: - Type: String - AllowedValues: - - "true" - - "false" - GitHubRepositoryOwner: - Type: String - GitHubRepositoryName: - Type: String - InvokePipelineFunctionName: - Type: String - AllowedPattern: ^[a-z-]+$ - InvokePipelineFunctionSchedule: - Type: String - AllowedPattern: ^cron\(.*\)$ - InvokePipelineFunctionMemorySize: - Type: Number - Default: 256 - AllowedValues: - - 128 - - 256 - - 512 - InvokePipelineFunctionTimeout: - Type: Number - Default: 60 - PipelineStatePath: - Type: String - PipelineParamsPath: - Type: String - ExecuteValidationQueriesFunctionName: + # usePrivateSubnet: + # Type: String + # AllowedValues: + # - "true" + # - "false" + ServiceName: Type: String - AllowedPattern: ^[a-z-]+$ - MaxLength: 140 - ExecuteValidationQueriesFunctionMemorySize: - Type: Number - Default: 256 - AllowedValues: - - 128 - - 256 - - 512 - ExecuteValidationQueriesFunctionTimeout: - Type: Number - InvokeBackupScriptFunctionName: + Description: Name of the service + ConfigS3Path: Type: String - AllowedPattern: ^[a-z-]+$ - MaxLength: 140 - ValidateBuildOutputFunctionName: + Description: S3 path to config file + GitHubRepositoryOwner: Type: String - AllowedPattern: ^[a-z-]+$ - MaxLength: 140 - InvokeLoadScriptFunctionName: + GitHubRepositoryName: Type: String - AllowedPattern: ^[a-z-]+$ - MaxLength: 140 - DisableBackupFunctionName: + GitHubPersonalAccessToken: Type: String - AllowedPattern: ^[a-z-]+$ - MaxLength: 140 + NoEcho: true ECRBaseUri: Type: String BuildServiceRepositoryName: Type: String FeatureServiceUrl: Type: String - Ec2KeyPairName: - Type: String + # Ec2KeyPairName: + # Type: String +# Conditions: +# UsePrivateSubnet: !Equals +# - !Ref usePrivateSubnet +# - "true" +Resources: + # TODO duplicate in infrastructure layer + GitHubPersonalAccessTokenSecret: + Type: AWS::SecretsManager::Secret + UpdateReplacePolicy: Delete + DeletionPolicy: Delete + Properties: + Name: !Sub /${AppName}/${Stage}/${AWS::Region}/GitHubPersonalAccessToken + Description: GitHub Personal Access Token for repository access + SecretString: !Ref 
GitHubPersonalAccessToken + Tags: + - Key: Stage + Value: !Ref Stage + - Key: AppName + Value: !Ref AppName + GitHubSourceRepositoryParameter: + Type: AWS::SSM::Parameter + Properties: + Name: !Sub /${AppName}/${Stage}/${AWS::Region}/GitHubSourceRepository + Type: String + Description: GitHub Personal Access Token for repository access + Value: !Sub '{"owner":"${GitHubRepositoryOwner}", "name":"${GitHubRepositoryName}"}' + GfeDbModelsLayer: + Type: AWS::Serverless::LayerVersion + Properties: + LayerName: !Sub ${Stage}-gfe-db-models + Description: !Sub ${AppName} models for validationa and processing data + ContentUri: lambda_layers/gfe_db_models + RetentionPolicy: delete + CompatibleRuntimes: + - python3.9 + - python3.10 + Metadata: + BuildMethod: python3.10 + BuildArchitectures: arm64 + GfeDbExecutionStateTable: + Type: AWS::DynamoDB::Table + UpdateReplacePolicy: Retain + DeletionPolicy: Delete + Properties: + AttributeDefinitions: + - AttributeName: commit__sha + AttributeType: S + - AttributeName: execution__version + AttributeType: "N" + KeySchema: + - AttributeName: commit__sha + KeyType: HASH + - AttributeName: execution__version + KeyType: RANGE + BillingMode: PAY_PER_REQUEST + GfeDbExecutionStateTableNameParameter: + Type: AWS::SSM::Parameter + Properties: + Name: !Sub /${AppName}/${Stage}/${AWS::Region}/GfeDbExecutionStateTableName + Type: String + Value: !Ref GfeDbExecutionStateTable + GfeDbExecutionStateTableFieldsParameter: + Type: AWS::SSM::Parameter + Properties: + Type: String + Name: !Sub /${AppName}/${Stage}/${AWS::Region}/GfeDbExecutionStateTableFields + Description: !Sub Selected fields for ${Stage}-${AppName} execution state table + Tier: Standard + Value: !Sub | + [ + "commit.sha", + "execution.version", + "commit.date_utc", + "commit.html_url", + "commit.message", + "execution.invocation_id", + "execution.id", + "execution.date_utc", + "execution.status", + "execution.input_parameters.align", + "execution.input_parameters.kir", + "execution.input_parameters.limit", + "execution.input_parameters.mem_profile", + "repository.name", + "repository.owner", + "repository.url", + "error.message", + "error.cause", + "created_utc", + "updated_utc" + ] + # BuildServerSG: + # Type: AWS::EC2::SecurityGroup + # Properties: + # GroupName: !Sub ${Stage}-${AppName}-build-server-sg + # GroupDescription: !Sub Security group for the ${Stage}-${AppName} build server + # VpcId: !Sub "{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/VpcId}}" + # Tags: + # - Key: Name + # Value: !Sub ${Stage}-${AppName}-build-server-sg + # Ec2KeyPairName: + # Type: String -Conditions: +# Conditions: # Important: All stacks (infra, database, pipeline) should use the same conditions # CreateVpc: !Equals [!Ref createVpc, 'true'] # UseExternalVpc: !Equals [!Ref createVpc, 'false'] # UseExternalVpcPublic: !And [!Equals [!Ref createVpc, 'false'], !Equals [!Ref usePrivateSubnet, 'false']] # UseExternalVpcPrivate: !And [!Equals [!Ref createVpc, 'false'], !Equals [!Ref usePrivateSubnet, 'true']] - UsePrivateSubnet: !Equals [!Ref usePrivateSubnet, 'true'] + # UsePrivateSubnet: !Equals [!Ref usePrivateSubnet, 'true'] # UsePublicSubnet: !Equals [!Ref usePrivateSubnet, 'false'] -Resources: +# Resources: # BuildServerSG: # Type: AWS::EC2::SecurityGroup # Properties: @@ -147,36 +208,37 @@ Resources: - arn:aws:iam::aws:policy/AmazonS3FullAccess - arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy Policies: - - PolicyName: !Sub '${Stage}-${AppName}-ECS-CloudWatchLogs' - PolicyDocument: + - PolicyName: !Sub 
${Stage}-${AppName}-ECS-CloudWatchLogs + PolicyDocument: Version: "2012-10-17" Statement: - Action: - - "logs:CreateLogGroup" - - "logs:CreateLogStream" - - "logs:PutLogEvents" - - "logs:DescribeLogStreams" - Effect: "Allow" - Resource: "arn:aws:logs:*:*:*" - - PolicyName: !Sub '${Stage}-${AppName}-BuildServiceS3ReadWritePolicy' + - logs:CreateLogGroup + - logs:CreateLogStream + - logs:PutLogEvents + - logs:DescribeLogStreams + Effect: Allow + Resource: arn:aws:logs:*:*:* + - PolicyName: !Sub ${Stage}-${AppName}-BuildServiceS3ReadWritePolicy PolicyDocument: Version: "2012-10-17" Statement: - - Effect: "Allow" - Action: - - "s3:GetObject" - - "s3:ListBucket" - - "s3:GetBucketLocation" - - "s3:GetObjectVersion" - - "s3:GetLifecycleConfiguration" - - "s3:PutObject" - - "s3:PutObjectAcl" - - "s3:PutLifecycleConfiguration" - - "s3:DeleteObject" + - Effect: Allow + Action: + - s3:GetObject + - s3:ListBucket + - s3:GetBucketLocation + - s3:GetObjectVersion + - s3:GetLifecycleConfiguration + - s3:PutObject + - s3:PutObjectAcl + - s3:PutLifecycleConfiguration + - s3:DeleteObject Resource: - - !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketArn}}' - - !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketArn}}/*' - - PolicyName: !Sub '${Stage}-${AppName}-SecretsPolicy' + - !Sub "{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketArn}}" + - !Sub "{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketArn}}/*" + # TODO separate into specific policies for SecretsManager and SSM + - PolicyName: !Sub ${Stage}-${AppName}-SecretsPolicy PolicyDocument: Version: "2012-10-17" Statement: @@ -210,39 +272,37 @@ Resources: Type: AWS::Batch::JobDefinition Properties: Type: container - JobDefinitionName: !Sub '${Stage}-${AppName}-BuildJobDefinition' + JobDefinitionName: !Sub ${Stage}-${AppName}-BuildJobDefinition ContainerProperties: - Image: !Sub '${ECRBaseUri}/${BuildServiceRepositoryName}:latest' + Image: !Sub ${ECRBaseUri}/${BuildServiceRepositoryName}:latest Vcpus: 8 - # TODO: Memory param is deprecated, move to ResourceRequirements - Memory: 8000 # Keep this around half the available RAM to avoid getting stuck in RUNNABLE status - # ResourceRequirements: - # Type: MEMORY - # Value: 8000 + Memory: 8000 Command: - bash - run.sh Environment: - Name: GFE_BUCKET - Value: !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketName}}' + Value: !Sub "{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketName}}" - Name: FEATURE_SERVICE_URL Value: !Ref FeatureServiceUrl - # - Name: FAILED_ALLELES_QUEUE - # Value: !Ref FailedAllelesQueue - Name: AWS_REGION Value: !Ref AWS::Region RetryStrategy: Attempts: 1 - BuildJobQueue: Type: AWS::Batch::JobQueue Properties: - JobQueueName: !Sub '${Stage}-${AppName}-BuildJobQueue' + JobQueueName: !Sub ${Stage}-${AppName}-BuildJobQueue Priority: 1 ComputeEnvironmentOrder: - Order: 1 ComputeEnvironment: !Ref BuildComputeEnvironment - + BuildJobQueueArn: + Type: AWS::SSM::Parameter + Properties: + Name: !Sub /${AppName}/${Stage}/${AWS::Region}/BuildJobQueueArn + Type: String + Value: !GetAtt BuildJobQueue.JobQueueArn BuildComputeEnvironment: Type: AWS::Batch::ComputeEnvironment Properties: @@ -250,8 +310,8 @@ Resources: ComputeResources: Type: EC2 AllocationStrategy: BEST_FIT_PROGRESSIVE - # # Testing only, comment before production deployment - Ec2KeyPair: !Ref Ec2KeyPairName + # # # Testing only, comment before production deployment + # Ec2KeyPair: !Ref Ec2KeyPairName MinvCpus: 0 DesiredvCpus: 0 MaxvCpus: 32 @@ -264,16 
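`BuildJobDefinition` runs the build container with `bash run.sh` on `BuildJobQueue`; in the pipeline the job is submitted by the state machine's `batch:submitJob.sync` integration rather than by hand. For reference, a hedged boto3 equivalent (the `RELEASES` environment override is purely illustrative; the job definition already supplies `GFE_BUCKET`, `FEATURE_SERVICE_URL`, and `AWS_REGION`):

```python
import boto3

STAGE, APP_NAME = "dev", "gfe-db"  # assumed values for illustration
batch = boto3.client("batch", region_name="us-east-1")

response = batch.submit_job(
    jobName=f"{STAGE}-{APP_NAME}-build-job",
    jobQueue=f"{STAGE}-{APP_NAME}-BuildJobQueue",
    jobDefinition=f"{STAGE}-{APP_NAME}-BuildJobDefinition",
    containerOverrides={
        "environment": [
            {"name": "RELEASES", "value": "3290"},  # hypothetical override
        ]
    },
)
print(response["jobId"])
```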
+324,16 @@ Resources: InstanceRole: !Ref BatchWorkerInstanceProfile LaunchTemplate: LaunchTemplateId: !Ref BuildLaunchTemplate - Tags: { "Name": "gfe-db-build-worker" } + Tags: + Name: !Sub "${Stage}-${AppName}-build-worker" ServiceRole: !GetAtt BatchServiceRole.Arn - BuildLaunchTemplate: Type: AWS::EC2::LaunchTemplate Properties: LaunchTemplateData: DisableApiTermination: false InstanceInitiatedShutdownBehavior: stop - ImageId: '{{resolve:ssm:/aws/service/ecs/optimized-ami/amazon-linux-2/recommended/image_id}}' + ImageId: "{{resolve:ssm:/aws/service/ecs/optimized-ami/amazon-linux-2/recommended/image_id}}" InstanceType: c5d.xlarge IamInstanceProfile: Name: !Ref BatchWorkerInstanceProfile @@ -304,10 +364,9 @@ Resources: --==MYBOUNDARY== TagSpecifications: - ResourceType: launch-template - Tags: + Tags: - Key: Name - Value: 'gfe-db-build-worker' - + Value: !Sub "${Stage}-${AppName}-build-worker" BatchTaskExecutionRole: Type: AWS::IAM::Role Properties: @@ -320,20 +379,20 @@ Resources: Service: ecs-tasks.amazonaws.com Action: sts:AssumeRole Policies: - - PolicyName: !Sub '${Stage}-${AppName}-ecsTaskExecutionRolePolicy' - PolicyDocument: + - PolicyName: !Sub ${Stage}-${AppName}-ecsTaskExecutionRolePolicy + PolicyDocument: Version: "2012-10-17" Statement: - Effect: Allow Action: - - "ecr:GetAuthorizationToken" - - "ecr:BatchCheckLayerAvailability" - - "ecr:GetDownloadUrlForLayer" - - "ecr:BatchGetImage" - - "logs:CreateLogStream" - - "logs:PutLogEvents" + - ecr:GetAuthorizationToken + - ecr:BatchCheckLayerAvailability + - ecr:GetDownloadUrlForLayer + - ecr:BatchGetImage + - logs:CreateLogStream + - logs:PutLogEvents Resource: "*" - - PolicyName: !Sub '${Stage}-${AppName}-BatchTaskExecutionPolicy' + - PolicyName: !Sub ${Stage}-${AppName}-BatchTaskExecutionPolicy PolicyDocument: Version: "2012-10-17" Statement: @@ -359,156 +418,222 @@ Resources: RepositoryPolicyText: Version: "2012-10-17" Statement: - - Sid: "AllowPushPull" - Effect: "Allow" + - Sid: AllowPushPull + Effect: Allow Principal: AWS: - - !Sub 'arn:aws:iam::${AWS::AccountId}:role/${BatchWorkerInstanceRole}' + - !Sub arn:aws:iam::${AWS::AccountId}:role/${BatchWorkerInstanceRole} Action: - - "ecr:GetDownloadUrlForLayer" - - "ecr:BatchGetImage" - - "ecr:BatchCheckLayerAvailability" - - "ecr:PutImage" - - "ecr:InitiateLayerUpload" - - "ecr:UploadLayerPart" - - "ecr:CompleteLayerUpload" - + - ecr:GetDownloadUrlForLayer + - ecr:BatchGetImage + - ecr:BatchCheckLayerAvailability + - ecr:PutImage + - ecr:InitiateLayerUpload + - ecr:UploadLayerPart + - ecr:CompleteLayerUpload BuildServiceRepositoryNameParameter: Type: AWS::SSM::Parameter Properties: Type: String - Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/BuildServiceRepositoryName' - Description: "Name of gfe-db build service repository" + Name: !Sub /${AppName}/${Stage}/${AWS::Region}/BuildServiceRepositoryName + Description: !Sub Name of ${AppName} build service repository Value: !Ref BuildServiceRepositoryName + GfeDbProcessingQueue: + Type: AWS::SQS::Queue + UpdateReplacePolicy: Delete + DeletionPolicy: Delete + Properties: + FifoQueue: true + VisibilityTimeout: 300 + MessageRetentionPeriod: 43200 + ReceiveMessageWaitTimeSeconds: 20 + RedrivePolicy: + deadLetterTargetArn: !GetAtt GfeDbProcessingDeadLetterQueue.Arn + maxReceiveCount: 1 + GfeDbProcessingDeadLetterQueue: + Type: AWS::SQS::Queue + UpdateReplacePolicy: Delete + DeletionPolicy: Delete + Properties: + FifoQueue: true + VisibilityTimeout: 43200 + MessageRetentionPeriod: 1209600 + ReceiveMessageWaitTimeSeconds: 10 + 
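`GfeDbProcessingQueue` is a FIFO queue with a one-shot redrive to its dead-letter queue (`maxReceiveCount: 1`). FIFO sends must include a `MessageGroupId`, and because `ContentBasedDeduplication` is not enabled here, a `MessageDeduplicationId` as well. A minimal producer sketch, reading the queue URL from the Parameter Store entry published just below (grouping by commit sha is an assumption about how ordering would be scoped):

```python
import json
import uuid
import boto3

APP_NAME, STAGE, REGION = "gfe-db", "dev", "us-east-1"  # assumed values

ssm = boto3.client("ssm", region_name=REGION)
queue_url = ssm.get_parameter(
    Name=f"/{APP_NAME}/{STAGE}/{REGION}/GfeDbProcessingQueueUrl"
)["Parameter"]["Value"]

sqs = boto3.client("sqs", region_name=REGION)
sqs.send_message(
    QueueUrl=queue_url,
    MessageBody=json.dumps({"commit_sha": "ba5cb3d05c7b3ba5024cdafa192d89af186f08a9", "version": 3290}),
    MessageGroupId="ba5cb3d05c7b3ba5024cdafa192d89af186f08a9",  # keeps one release's messages ordered
    MessageDeduplicationId=str(uuid.uuid4()),                    # required without content-based dedup
)
```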
GfeDbProcessingQueueUrlParameter: + Type: AWS::SSM::Parameter + Properties: + Type: String + Name: !Sub /${AppName}/${Stage}/${AWS::Region}/GfeDbProcessingQueueUrl + Description: URL of gfe-db processing queue + Value: !GetAtt GfeDbProcessingQueue.QueueUrl + + - InvokePipelineFunction: + # Controls load concurrency + GfeDbLoadQueue: + Type: AWS::SQS::Queue + UpdateReplacePolicy: Delete + DeletionPolicy: Delete + Properties: + FifoQueue: true + VisibilityTimeout: 10 + MessageRetentionPeriod: 1209600 + ReceiveMessageWaitTimeSeconds: 0 # Short polling can be used since there is 1 consumer + RedrivePolicy: + deadLetterTargetArn: !GetAtt GfeDbLoadDeadLetterQueue.Arn + maxReceiveCount: 1 + GfeDbLoadDeadLetterQueue: + Type: AWS::SQS::Queue + UpdateReplacePolicy: Delete + DeletionPolicy: Delete + Properties: + FifoQueue: true + VisibilityTimeout: 60 + MessageRetentionPeriod: 1209600 + ReceiveMessageWaitTimeSeconds: 10 + GfeDbLoadQueueUrlParameter: + Type: AWS::SSM::Parameter + Properties: + Type: String + Name: !Sub /${AppName}/${Stage}/${AWS::Region}/GfeDbLoadQueueUrl + Description: URL of gfe-db load queue + Value: !GetAtt GfeDbLoadQueue.QueueUrl + CheckSourceUpdateFunction: Type: AWS::Serverless::Function Properties: - FunctionName: !Ref InvokePipelineFunctionName - Description: !Sub '${AppName} update pipeline trigger: checks for new IMGT/HLA releases and starts the loading process' - CodeUri: functions/invoke_pipeline/ + FunctionName: !Sub ${Stage}-${AppName}-check-source-update + Description: !Sub "${AppName} update pipeline trigger: checks for new IMGT/HLA releases and starts the loading process" + CodeUri: functions/check_source_update/ Handler: app.lambda_handler - Runtime: python3.11 - MemorySize: !Ref InvokePipelineFunctionMemorySize - Timeout: !Ref InvokePipelineFunctionTimeout + Runtime: python3.10 + Layers: + - !Ref GfeDbModelsLayer + MemorySize: 256 + Timeout: 300 Architectures: - x86_64 Environment: Variables: - GITHUB_PERSONAL_ACCESS_TOKEN: !Sub '{{resolve:secretsmanager:${AppName}-${Stage}-GitHubPersonalAccessToken:SecretString:personal_access_token:AWSCURRENT}}' - GITHUB_REPOSITORY_OWNER: !Ref GitHubRepositoryOwner - GITHUB_REPOSITORY_NAME: !Ref GitHubRepositoryName - DATA_BUCKET_NAME: !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketName}}' - UPDATE_PIPELINE_STATE_MACHINE_ARN: !GetAtt UpdatePipelineStateMachine.Arn - PIPELINE_STATE_PATH: !Ref PipelineStatePath - PIPELINE_PARAMS_PATH: !Ref PipelineParamsPath + APP_NAME: !Ref AppName + STAGE: !Ref Stage + PIPELINE_SOURCE_CONFIG_S3_PATH: !Sub ${ConfigS3Path}/${ServiceName}/source-config.json Events: Trigger: Type: Schedule Properties: - Schedule: !Ref InvokePipelineFunctionSchedule + Schedule: cron(0\ 12\ *\ *\ ?\ *) Policies: - AWSLambdaBasicExecutionRole - Version: "2012-10-17" Statement: - - Sid: "AllowS3Access" - Effect: "Allow" + - Sid: AllowSecretsManagerAccess + Effect: Allow + Action: + - secretsmanager:GetSecretValue + Resource: !Ref GitHubPersonalAccessTokenSecret + - Version: "2012-10-17" + Statement: + - Sid: AllowSSMParameterStoreAccess + Effect: Allow + Action: + - ssm:GetParameters + - ssm:GetParameter + Resource: + - !Sub arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:parameter/${AppName}/${Stage}/${AWS::Region}/* + - Version: "2012-10-17" + Statement: + - Sid: AllowDynamoDBReadAccess + Effect: Allow + Action: + - dynamodb:Scan + - dynamodb:BatchWriteItem + Resource: !GetAtt GfeDbExecutionStateTable.Arn + - Version: "2012-10-17" + Statement: + - Sid: AllowSQSAccess + Effect: Allow + 
Action: + - sqs:SendMessage + - sqs:GetQueueUrl + - sqs:GetQueueAttributes + - sqs:SendMessageBatch + # TODO tighten permissions + Resource: "*" + - Version: "2012-10-17" + Statement: + - Sid: AllowS3Access + Effect: Allow Action: - - "s3:GetObject" - - "s3:PutObject" - - "s3:ListBucket" + - s3:GetObject + - s3:ListBucket + - s3:GetBucketLocation + - s3:GetObjectVersion Resource: - - !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketArn}}' - - !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketArn}}/*' - - Sid: "AllowSFNStartExecution" - Effect: "Allow" + - !Sub "{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketArn}}" + - !Sub "{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketArn}}/*" + - Sid: AllowSFNStartExecution + Effect: Allow Action: - - "states:StartExecution" + - states:StartExecution Resource: !GetAtt UpdatePipelineStateMachine.Arn - - InvokePipelineFunctionAlarm: - Type: AWS::CloudWatch::Alarm - Properties: - AlarmDescription: !Sub 'Alarm for ${InvokePipelineFunctionName} function errors' - ActionsEnabled: true - AlarmActions: - - !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataPipelineErrorsTopicArn}}' - MetricName: Errors - Namespace: AWS/Lambda - Statistic: Sum - Period: 86400 - EvaluationPeriods: 1 - Threshold: 1 - ComparisonOperator: GreaterThanOrEqualToThreshold - Dimensions: - - Name: FunctionName - Value: !Ref InvokePipelineFunctionName - ExecuteValidationQueriesFunction: Type: AWS::Serverless::Function Properties: - FunctionName: !Ref ExecuteValidationQueriesFunctionName - Description: !Sub "Execute validation a queries on Neo4j for ${AppName}" + FunctionName: !Sub ${Stage}-${AppName}-execute-validation-queries + Description: !Sub Execute validation a queries on Neo4j for ${AppName} CodeUri: functions/execute_validation_queries/ Handler: app.lambda_handler + MemorySize: 256 + Timeout: 60 Runtime: python3.11 - MemorySize: !Ref ExecuteValidationQueriesFunctionMemorySize - Timeout: !Ref ExecuteValidationQueriesFunctionTimeout Architectures: - x86_64 Environment: Variables: STAGE: !Ref Stage APP_NAME: !Ref AppName - VpcConfig: !If - - UsePrivateSubnet - - SubnetIds: - - !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/PrivateSubnetId}}' - SecurityGroupIds: - - !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/Neo4jDatabaseSecurityGroupId}}' - - !Ref AWS::NoValue + VpcConfig: + SubnetIds: + - !Sub "{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/PrivateSubnetId}}" + SecurityGroupIds: + - !Sub "{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/Neo4jDatabaseSecurityGroupId}}" # If UsePrivateSubnet is true, attach AWSLambdaVPCAccessExecutionRole, otherwise attach AWSLambdaBasicExecutionRole Policies: - Version: "2012-10-17" Statement: - - Effect: "Allow" - Action: - - "ssm:GetParameters" - - "ssm:GetParameter" - Resource: - - !Sub 'arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:parameter/${AppName}/${Stage}/${AWS::Region}/*' - - Effect: "Allow" + - Effect: Allow Action: - - "secretsmanager:GetSecretValue" - - "secretsmanager:DescribeSecret" - - "secretsmanager:ListSecretVersionIds" - - "secretsmanager:ListSecrets" + - ssm:GetParameters + - ssm:GetParameter Resource: - - !Sub 'arn:${AWS::Partition}:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:/${AppName}/${Stage}/${AWS::Region}/Neo4jCredentials*' + - !Sub arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:parameter/${AppName}/${Stage}/${AWS::Region}/* + - Effect: Allow + Action: + - secretsmanager:GetSecretValue + - 
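`ExecuteValidationQueriesFunction` runs inside the private subnet and is granted read access to the `Neo4jCredentials` secret. Its handler code is not shown here; the sketch below only illustrates the credential lookup and a trivial validation query (the secret's field names, the bolt address, and the use of the `neo4j` Python driver are assumptions):

```python
import json
import boto3
from neo4j import GraphDatabase  # assumed to be packaged with the function

APP_NAME, STAGE, REGION = "gfe-db", "dev", "us-east-1"  # assumed values

secret = json.loads(
    boto3.client("secretsmanager", region_name=REGION).get_secret_value(
        SecretId=f"/{APP_NAME}/{STAGE}/{REGION}/Neo4jCredentials"
    )["SecretString"]
)

driver = GraphDatabase.driver(
    "bolt://10.0.0.10:7687",  # hypothetical private address of the Neo4j instance
    auth=(secret["NEO4J_USERNAME"], secret["NEO4J_PASSWORD"]),  # field names assumed
)
with driver.session() as session:
    node_count = session.run("MATCH (n) RETURN count(n) AS n").single()["n"]
    print(f"nodes loaded: {node_count}")
driver.close()
```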
secretsmanager:DescribeSecret + - secretsmanager:ListSecretVersionIds + - secretsmanager:ListSecrets + Resource: + - !Sub arn:${AWS::Partition}:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:/${AppName}/${Stage}/${AWS::Region}/Neo4jCredentials* - Version: "2012-10-17" Statement: - - Effect: "Allow" - Action: !If - - UsePrivateSubnet - - - "logs:CreateLogGroup" - - "logs:CreateLogStream" - - "logs:PutLogEvents" - - "ec2:CreateNetworkInterface" - - "ec2:DescribeNetworkInterfaces" - - "ec2:DeleteNetworkInterface" - - "ec2:AssignPrivateIpAddresses" - - "ec2:UnassignPrivateIpAddresses" - - - "logs:CreateLogGroup" - - "logs:CreateLogStream" - - "logs:PutLogEvents" - Resource: + - Effect: Allow + Action: + - logs:CreateLogGroup + - logs:CreateLogStream + - logs:PutLogEvents + - ec2:CreateNetworkInterface + - ec2:DescribeNetworkInterfaces + - ec2:DeleteNetworkInterface + - ec2:AssignPrivateIpAddresses + - ec2:UnassignPrivateIpAddresses + Resource: - "*" - InvokeBackupScriptFunction: Type: AWS::Serverless::Function Properties: - FunctionName: !Ref InvokeBackupScriptFunctionName - Description: !Sub "Invoke backup for Neo4j for ${AppName}" + FunctionName: !Sub ${Stage}-${AppName}-invoke-backup-script + Description: !Sub Invoke backup for Neo4j for ${AppName} CodeUri: functions/invoke_backup_script/ Handler: app.lambda_handler Runtime: python3.11 @@ -524,10 +649,11 @@ Resources: - AWSLambdaBasicExecutionRole - Version: "2012-10-17" Statement: - - Effect: "Allow" - Action: - - "ssm:SendCommand" - - "ssm:GetDocument" + - Effect: Allow + Action: + - ssm:SendCommand + - ssm:GetDocument + # TODO tighten permissions Resource: - !Sub 'arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:document/${Stage}-${AppName}-database-Neo4jBackupDocument*' - !Sub 'arn:${AWS::Partition}:ec2:${AWS::Region}:${AWS::AccountId}:instance/{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/Neo4jDatabaseInstanceId}}' @@ -543,67 +669,179 @@ Resources: - "ssm:GetCommandInvocation" Resource: - '*' - - InvokeLoadScriptFunction: + GetExecutionStateFunction: Type: AWS::Serverless::Function Properties: - FunctionName: !Ref InvokeLoadScriptFunctionName - Description: !Sub "Invoke server-side loading on Neo4j for ${AppName}" - CodeUri: functions/invoke_load_script/ + # FunctionName: !Sub ${Stage}-${AppName}-get-execution-state + Description: !Sub ${AppName} execution input validation and state retrieval + CodeUri: functions/get_execution_state/ Handler: app.lambda_handler - Runtime: python3.11 + Runtime: python3.10 + Layers: + - !Ref GfeDbModelsLayer MemorySize: 256 Timeout: 60 Architectures: - x86_64 Environment: Variables: - NEO4J_LOAD_QUERY_DOCUMENT_NAME_SSM_PARAM: !Sub '/${AppName}/${Stage}/${AWS::Region}/Neo4jLoadQueryDocumentName' - NEO4J_DATABASE_INSTANCE_ID_SSM_PARAM: !Sub '/${AppName}/${Stage}/${AWS::Region}/Neo4jDatabaseInstanceId' - LOAD_NEO4J_ACTIVITY: !Ref LoadNeo4jActivity APP_NAME: !Ref AppName + STAGE: !Ref Stage Policies: - AWSLambdaBasicExecutionRole - Version: "2012-10-17" Statement: - - Effect: "Allow" - Action: - - "ssm:SendCommand" - - "ssm:GetDocument" + - Sid: AllowSSMParameterStoreAccess + Effect: Allow + Action: + - ssm:GetParameters + - ssm:GetParameter + Resource: !Sub arn:aws:ssm:${AWS::Region}:${AWS::AccountId}:parameter/${AppName}/${Stage}/${AWS::Region}/* + - Version: "2012-10-17" + Statement: + - Sid: AllowDynamoDBReadAccess + Effect: Allow + Action: + - dynamodb:GetItem + Resource: !GetAtt GfeDbExecutionStateTable.Arn + UpdateExecutionStateFunction: + Type: AWS::Serverless::Function + 
Properties: + # FunctionName: !Sub ${Stage}-${AppName}-update-execution-state + Description: !Sub ${AppName} execution input validation and state retrieval + CodeUri: functions/update_execution_state/ + Handler: app.lambda_handler + Runtime: python3.10 + Layers: + - !Ref GfeDbModelsLayer + MemorySize: 256 + Timeout: 60 + Architectures: + - x86_64 + Environment: + Variables: + APP_NAME: !Ref AppName + STAGE: !Ref Stage + Policies: + - AWSLambdaBasicExecutionRole + - Version: "2012-10-17" + Statement: + - Sid: AllowSSMParameterStoreAccess + Effect: Allow + Action: + - ssm:GetParameters + - ssm:GetParameter + Resource: !Sub arn:aws:ssm:${AWS::Region}:${AWS::AccountId}:parameter/${AppName}/${Stage}/${AWS::Region}/* + - Version: "2012-10-17" + Statement: + - Sid: AllowDynamoDBReadWriteAccess + Effect: Allow + Action: + - dynamodb:PutItem + - dynamodb:UpdateItem + Resource: !GetAtt GfeDbExecutionStateTable.Arn + Events: + StepFunctionsExecutionAborted: + Type: EventBridgeRule + Properties: + Pattern: + source: + - aws.states + detail-type: + - Step Functions Execution Status Change + detail: + status: + - ABORTED + stateMachineArn: + - !GetAtt UpdatePipelineStateMachine.Arn + EventBusName: default + InvokeUpdatePipelineFunction: + Type: AWS::Serverless::Function + Properties: + FunctionName: !Sub ${Stage}-${AppName}-invoke-update-pipeline + Description: !Sub Invoke the build and load processing pipeline for ${AppName} + CodeUri: functions/invoke_update_pipeline/ + Handler: app.lambda_handler + Runtime: python3.10 + Layers: + - !Ref GfeDbModelsLayer + MemorySize: 256 + Timeout: 60 + Architectures: + - x86_64 + Environment: + Variables: + APP_NAME: !Ref AppName + STAGE: !Ref Stage + Policies: + - AWSLambdaBasicExecutionRole + - Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - ssm:GetParameters + - ssm:GetParameter Resource: - - !Sub 'arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:document/${Neo4jLoadQueryDocument}' - - !Sub 'arn:${AWS::Partition}:ec2:${AWS::Region}:${AWS::AccountId}:instance/{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/Neo4jDatabaseInstanceId}}' - # - !Sub 'arn:${AWS::Partition}:ec2:${AWS::Region}:${AWS::AccountId}:instance/*' - - Effect: "Allow" - Action: - - "ssm:GetParameters" - - "ssm:GetParameter" - Resource: - - !Sub 'arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:parameter/${AppName}/${Stage}/${AWS::Region}/*' - - InvokeLoadScriptFunctionAlarm: - Type: AWS::CloudWatch::Alarm + - !Sub arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:parameter/${AppName}/${Stage}/${AWS::Region}/* + - Effect: Allow + Action: + - ec2:DescribeInstanceStatus + Resource: "*" + - Effect: Allow + Action: + - states:StartExecution + Resource: + - !GetAtt UpdatePipelineStateMachine.Arn + Events: + GfeDbProcessingQueueTrigger: + Type: SQS + Properties: + Queue: !GetAtt GfeDbProcessingQueue.Arn + BatchSize: 10 + InvokeLoadConcurrencyManagerFunction: + Type: AWS::Serverless::Function Properties: - AlarmDescription: !Sub 'Alarm for ${InvokeLoadScriptFunction} function errors' - ActionsEnabled: true - AlarmActions: - - !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataPipelineErrorsTopicArn}}' - MetricName: Errors - Namespace: AWS/Lambda - Statistic: Sum - Period: 86400 - EvaluationPeriods: 1 - Threshold: 1 - ComparisonOperator: GreaterThanOrEqualToThreshold - Dimensions: - - Name: FunctionName - Value: !Ref InvokeLoadScriptFunctionName - + FunctionName: !Sub ${Stage}-${AppName}-invoke-lcm + Description: !Sub Invoke the Load Concurrency 
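`InvokeUpdatePipelineFunction` is wired to `GfeDbProcessingQueue` and holds `states:StartExecution` on the update pipeline, so each queued release becomes a state machine execution. A hypothetical handler skeleton (the real `invoke_update_pipeline` code is not in this diff), resolving the state machine ARN from the `UpdatePipelineStateMachineArn` parameter defined later in this template:

```python
import os
import boto3

APP_NAME = os.environ.get("APP_NAME", "gfe-db")
STAGE = os.environ.get("STAGE", "dev")
REGION = os.environ.get("AWS_REGION", "us-east-1")

ssm = boto3.client("ssm", region_name=REGION)
sfn = boto3.client("stepfunctions", region_name=REGION)

STATE_MACHINE_ARN = ssm.get_parameter(
    Name=f"/{APP_NAME}/{STAGE}/{REGION}/UpdatePipelineStateMachineArn"
)["Parameter"]["Value"]

def lambda_handler(event, context):
    """Hypothetical handler: start one pipeline execution per SQS record."""
    for record in event["Records"]:
        sfn.start_execution(
            stateMachineArn=STATE_MACHINE_ARN,
            input=record["body"],  # the queued execution-state payload, already JSON
        )
    return {"started": len(event["Records"])}
```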
Handler for ${Stage}-${AppName} + CodeUri: functions/invoke_lcm/ + Handler: app.lambda_handler + Runtime: python3.10 + Layers: + - !Ref GfeDbModelsLayer + MemorySize: 128 + Timeout: 10 + Architectures: + - x86_64 + Environment: + Variables: + APP_NAME: !Ref AppName + STAGE: !Ref Stage + Policies: + - AWSLambdaBasicExecutionRole + - Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - ssm:GetParameters + - ssm:GetParameter + Resource: + - !Sub arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:parameter/${AppName}/${Stage}/${AWS::Region}/* + - Effect: Allow + Action: + - states:StartExecution + - states:ListExecutions + Resource: + - !GetAtt LoadConcurrencyManagerStateMachine.Arn + Events: + ExecutionsTrigger: + Type: SNS + Properties: + Topic: !GetAtt UpdatePipelineExecutionTopic.TopicArn ValidateBuildOutputFunction: Type: AWS::Serverless::Function Properties: - FunctionName: !Ref ValidateBuildOutputFunctionName - Description: !Sub "Validate build output for ${AppName}" + FunctionName: !Sub ${Stage}-${AppName}-validate-build-output + Description: !Sub Validate build output for ${AppName} CodeUri: functions/validate_build_output/ Handler: app.lambda_handler Runtime: python3.11 @@ -619,62 +857,102 @@ Resources: - AWSLambdaBasicExecutionRole - Version: "2012-10-17" Statement: - - Effect: "Allow" - Action: - - "ssm:GetParameters" - - "ssm:GetParameter" - Resource: - - !Sub 'arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:parameter/${AppName}/${Stage}/${AWS::Region}/*' + - Effect: Allow + Action: + - ssm:GetParameters + - ssm:GetParameter + Resource: + - !Sub arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:parameter/${AppName}/${Stage}/${AWS::Region}/* - Version: "2012-10-17" Statement: - - Effect: "Allow" - Action: - - "s3:GetObject" - - "s3:ListBucket" - - "s3:ListObjects" - Resource: - - !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketArn}}' - - !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketArn}}/*' - + - Effect: Allow + Action: + - s3:GetObject + - s3:ListBucket + - s3:ListObjects + Resource: + - !Sub "{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketArn}}" + - !Sub "{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketArn}}/*" + EvaluateQueryResultsFunction: + Type: AWS::Serverless::Function + Properties: + FunctionName: !Sub ${Stage}-${AppName}-evaluate-query-results + Description: !Sub Evaluate Neo4j query results ${AppName} + CodeUri: functions/evaluate_query_results/ + Handler: app.lambda_handler + Runtime: python3.10 + MemorySize: 128 + Timeout: 5 + Architectures: + - x86_64 + Environment: + Variables: + STAGE: !Ref Stage + APP_NAME: !Ref AppName + Policies: + - AWSLambdaBasicExecutionRole UpdatePipelineStateMachine: Type: AWS::Serverless::StateMachine Properties: - DefinitionUri: statemachines/pipeline.asl.json + DefinitionUri: statemachines/update-pipeline.asl.json + Logging: + Destinations: + - CloudWatchLogsLogGroup: + LogGroupArn: !GetAtt UpdatePipelineLogGroup.Arn + IncludeExecutionData: true + Level: ALL DefinitionSubstitutions: + Stage: !Ref Stage AppName: !Ref AppName - DataBucketName: !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketName}}' + InvokeBackupScriptFunctionArn: !GetAtt InvokeBackupScriptFunction.Arn + GfeDbExecutionStateTable: !Ref GfeDbExecutionStateTable + GfeDbExecutionResultTopicArn: !GetAtt GfeDbExecutionResultTopic.TopicArn + GetExecutionStateFunctionArn: !GetAtt GetExecutionStateFunction.Arn + ValidateBuildOutputFunctionArn: !GetAtt 
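`InvokeLoadConcurrencyManagerFunction` is triggered by the pipeline execution topic and is granted both `states:ListExecutions` and `states:StartExecution` on the LCM state machine. One plausible reading of that pairing is a start-if-idle guard so only a single concurrency manager runs at a time; a hedged sketch of that pattern (the ARN below is hypothetical):

```python
import boto3

sfn = boto3.client("stepfunctions", region_name="us-east-1")
LCM_ARN = "arn:aws:states:us-east-1:123456789012:stateMachine:dev-gfe-db-lcm"  # hypothetical

def start_lcm_if_idle() -> str:
    """Start the Load Concurrency Manager only if no execution is already running."""
    running = sfn.list_executions(
        stateMachineArn=LCM_ARN, statusFilter="RUNNING", maxResults=1
    )["executions"]
    if running:
        return running[0]["executionArn"]  # reuse the in-flight execution
    return sfn.start_execution(stateMachineArn=LCM_ARN)["executionArn"]
```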
ValidateBuildOutputFunction.Arn + DataBucketName: !Sub "{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketName}}" BuildJobDefinition: !Ref BuildJobDefinition - BuildJobName: !Sub '${Stage}-${AppName}-build-job' + BuildJobName: !Sub ${Stage}-${AppName}-build-job BuildJobQueue: !Ref BuildJobQueue ExecuteValidationQueriesFunctionArn: !GetAtt ExecuteValidationQueriesFunction.Arn - ValidateBuildOutputFunctionArn: !GetAtt ValidateBuildOutputFunction.Arn - InvokeLoadScriptFunctionArn: !GetAtt InvokeLoadScriptFunction.Arn - InvokeBackupScriptFunctionArn: !GetAtt InvokeBackupScriptFunction.Arn - LoadNeo4jActivityArn: !Ref LoadNeo4jActivity + EvaluateQueryResultsFunctionArn: !GetAtt EvaluateQueryResultsFunction.Arn + FormatResultsFunctionArn: !GetAtt FormatResultsFunction.Arn + GfeDbLoadQueueUrl: !GetAtt GfeDbLoadQueue.QueueUrl Policies: - LambdaInvokePolicy: - FunctionName: !Ref ExecuteValidationQueriesFunction + FunctionName: !Ref GetExecutionStateFunction + - DynamoDBReadPolicy: + TableName: !Ref GfeDbExecutionStateTable + - DynamoDBWritePolicy: + TableName: !Ref GfeDbExecutionStateTable + - SNSPublishMessagePolicy: + TopicName: !GetAtt GfeDbExecutionResultTopic.TopicName - LambdaInvokePolicy: - FunctionName: !Ref InvokeLoadScriptFunction + FunctionName: !Ref ExecuteValidationQueriesFunction - LambdaInvokePolicy: FunctionName: !Ref InvokeBackupScriptFunction - LambdaInvokePolicy: FunctionName: !Ref ValidateBuildOutputFunction + - LambdaInvokePolicy: + FunctionName: !Ref EvaluateQueryResultsFunction + - LambdaInvokePolicy: + FunctionName: !Ref FormatResultsFunction - S3ReadPolicy: - BucketName: !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketName}}' + BucketName: !Sub "{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketName}}" + - SQSSendMessagePolicy: + QueueName: !GetAtt GfeDbLoadQueue.QueueName - Version: "2012-10-17" Statement: - - Effect: "Allow" + - Effect: Allow Action: - - "logs:CreateLogDelivery" - - "logs:GetLogDelivery" - - "logs:UpdateLogDelivery" - - "logs:DeleteLogDelivery" - - "logs:ListLogDeliveries" - - "logs:PutResourcePolicy" - - "logs:DescribeResourcePolicies" - - "logs:DescribeLogGroups" - Resource: + - logs:CreateLogDelivery + - logs:GetLogDelivery + - logs:UpdateLogDelivery + - logs:DeleteLogDelivery + - logs:ListLogDeliveries + - logs:PutResourcePolicy + - logs:DescribeResourcePolicies + - logs:DescribeLogGroups + Resource: - "*" - Version: "2012-10-17" Statement: @@ -683,6 +961,7 @@ Resources: - "batch:SubmitJob" - "batch:DescribeJobs" - "batch:TerminateJob" + - "batch:TagResource" Resource: - !Sub 'arn:aws:batch:${AWS::Region}:${AWS::AccountId}:job-queue/${Stage}-${AppName}*' - !Sub 'arn:aws:batch:${AWS::Region}:${AWS::AccountId}:job-definition/${Stage}-${AppName}-*' @@ -693,103 +972,360 @@ Resources: - "events:DescribeRule" Resource: - !Sub 'arn:aws:events:${AWS::Region}:${AWS::AccountId}:rule/StepFunctionsGetEventsForBatchJobsRule' + LoadConcurrencyManagerStateMachine: + Type: AWS::Serverless::StateMachine + Properties: + DefinitionUri: statemachines/load-concurrency-manager.asl.json + DefinitionSubstitutions: + InvokeBackupScriptFunctionArn: !GetAtt InvokeBackupScriptFunction.Arn + LcmReceiveMessageFunctionArn: !GetAtt LcmReceiveMessageFunction.Arn + UpdatePipelineStateMachineExecutionAlarmName: !Ref UpdatePipelineStateMachineExecutionAlarm + # LoadReleaseActivityArn: !Ref LoadReleaseActivity + InvokeLoadScriptFunctionArn: !GetAtt InvokeLoadScriptFunction.Arn + GfeDbExecutionStateTable: !Ref GfeDbExecutionStateTable + 
GfeDbLoadQueueUrl: !GetAtt GfeDbLoadQueue.QueueUrl + Policies: + - LambdaInvokePolicy: + FunctionName: !Ref InvokeBackupScriptFunction + - LambdaInvokePolicy: + FunctionName: !Ref LcmReceiveMessageFunction + - SQSPollerPolicy: + QueueName: !GetAtt GfeDbLoadQueue.QueueName + - LambdaInvokePolicy: + FunctionName: !Ref InvokeLoadScriptFunction + - DynamoDBWritePolicy: + TableName: !Ref GfeDbExecutionStateTable + - Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - logs:CreateLogDelivery + - logs:GetLogDelivery + - logs:UpdateLogDelivery + - logs:DeleteLogDelivery + - logs:ListLogDeliveries + - logs:PutResourcePolicy + - logs:DescribeResourcePolicies + - logs:DescribeLogGroups + Resource: + - "*" + - Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - cloudwatch:DescribeAlarms + Resource: + - !GetAtt UpdatePipelineStateMachineExecutionAlarm.Arn + - Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - ssm:GetCommandInvocation + Resource: + - !Sub 'arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:*' + - Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - states:StopExecution + Resource: + - !Sub 'arn:aws:states:${AWS::Region}:${AWS::AccountId}:execution:${UpdatePipelineStateMachine.Name}:*' Logging: - Destinations: - - CloudWatchLogsLogGroup: - LogGroupArn: !GetAtt UpdatePipelineLogGroup.Arn + Destinations: + - CloudWatchLogsLogGroup: + LogGroupArn: !GetAtt LoadConcurrencyManagerLogGroup.Arn IncludeExecutionData: true Level: ALL + LcmReceiveMessageFunction: + Type: AWS::Serverless::Function + Properties: + FunctionName: !Sub ${Stage}-${AppName}-lcm-receive-message + Description: !Sub Receive messages from the GfeDbLoadQueue for ${Stage}-${AppName} + CodeUri: functions/lcm_receive_message/ + Handler: app.lambda_handler + Runtime: python3.10 + Layers: + - !Ref GfeDbModelsLayer + MemorySize: 128 + Timeout: 60 + Architectures: + - x86_64 + Environment: + Variables: + APP_NAME: !Ref AppName + STAGE: !Ref Stage + Policies: + - AWSLambdaBasicExecutionRole + - SQSPollerPolicy: + QueueName: !GetAtt GfeDbLoadQueue.QueueName + - Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - ssm:GetParameters + - ssm:GetParameter + Resource: + - !Sub arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:parameter/${AppName}/${Stage}/${AWS::Region}/* + InvokeLoadScriptFunction: + Type: AWS::Serverless::Function + Properties: + FunctionName: !Sub ${Stage}-${AppName}-invoke-load-script + Description: !Sub Invoke server-side loading on Neo4j for ${AppName} + CodeUri: functions/invoke_load_script/ + Handler: app.lambda_handler + Runtime: python3.10 + Layers: + - !Ref GfeDbModelsLayer + MemorySize: 256 + Timeout: 60 + Architectures: + - x86_64 + Environment: + Variables: + APP_NAME: !Ref AppName + STAGE: !Ref Stage + Policies: + - AWSLambdaBasicExecutionRole + - Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - ssm:SendCommand + - ssm:GetDocument + Resource: + - !Sub arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:document/${Neo4jLoadQueryDocument} + - !Sub 'arn:${AWS::Partition}:ec2:${AWS::Region}:${AWS::AccountId}:instance/{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/Neo4jDatabaseInstanceId}}' + - Effect: Allow + Action: + - ssm:GetParameters + - ssm:GetParameter + Resource: + - !Sub arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:parameter/${AppName}/${Stage}/${AWS::Region}/* + GfeDbExecutionResultTopic: + Type: AWS::SNS::Topic + GfeDbExecutionResultTopicArnParameter: + Type: 
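`GfeDbLoadQueue` uses short polling and has a single consumer, so the concurrency manager only ever needs to pull the next release to load. A minimal sketch of the receive/delete cycle that `LcmReceiveMessageFunction` (whose code is not in this diff) would perform, with the queue URL read from the `GfeDbLoadQueueUrl` parameter:

```python
import boto3

APP_NAME, STAGE, REGION = "gfe-db", "dev", "us-east-1"  # assumed values

ssm = boto3.client("ssm", region_name=REGION)
queue_url = ssm.get_parameter(
    Name=f"/{APP_NAME}/{STAGE}/{REGION}/GfeDbLoadQueueUrl"
)["Parameter"]["Value"]

sqs = boto3.client("sqs", region_name=REGION)
response = sqs.receive_message(QueueUrl=queue_url, MaxNumberOfMessages=1, WaitTimeSeconds=0)
for message in response.get("Messages", []):
    print(message["Body"])
    # Deleting marks the message handled; otherwise it becomes visible again
    # after the queue's 10-second visibility timeout.
    sqs.delete_message(QueueUrl=queue_url, ReceiptHandle=message["ReceiptHandle"])
```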
AWS::SSM::Parameter + Properties: + Type: String + Name: !Sub /${AppName}/${Stage}/${AWS::Region}/GfeDbExecutionResultTopicArn + Description: !Sub ARN for ${AppName} Data Pipeline Errors SNS topic + Value: !Ref GfeDbExecutionResultTopic + # # These resources make it possible to fetch the notifications sent during Update Pipeline executions + # GfeDbExecutionResultDevQueue: + # Type: AWS::SQS::Queue + # UpdateReplacePolicy: Delete + # DeletionPolicy: Delete + # Properties: + # VisibilityTimeout: 30 + # MessageRetentionPeriod: 1209600 + # ReceiveMessageWaitTimeSeconds: 20 + # GfeDbExecutionResultTopicSubscription: + # Type: AWS::SNS::Subscription + # Properties: + # Endpoint: !GetAtt GfeDbExecutionResultDevQueue.Arn + # Protocol: sqs + # TopicArn: !Ref GfeDbExecutionResultTopic + # RawMessageDelivery: true + # GfeDbExecutionResultDevQueuePolicy: + # Type: AWS::SQS::QueuePolicy + # Properties: + # Queues: + # - !Ref GfeDbExecutionResultDevQueue + # PolicyDocument: + # Version: "2012-10-17" + # Statement: + # - Sid: AllowSNS + # Effect: Allow + # Principal: + # AWS: "*" + # Action: + # - sqs:SendMessage + # Resource: !GetAtt GfeDbExecutionResultDevQueue.Arn + # Condition: + # ArnEquals: + # aws:SourceArn: !Ref GfeDbExecutionResultTopic + + # TODO remove + # LoadReleaseActivity: + # Type: AWS::StepFunctions::Activity + # Properties: + # Name: !Sub ${AppName}-${Stage}-LoadReleaseActivity + # LoadReleaseActivityArnParameter: + # Type: AWS::SSM::Parameter + # Properties: + # Type: String + # Name: !Sub /${AppName}/${Stage}/${AWS::Region}/LoadReleaseActivityArn + # Description: ARN of gfe-db load release activity + # Value: !GetAtt LoadReleaseActivity.Arn - LoadNeo4jActivity: - Type: AWS::StepFunctions::Activity - Properties: - Name: !Sub "${AppName}-${Stage}-load-Neo4j" UpdatePipelineLogGroup: Type: AWS::Logs::LogGroup UpdateReplacePolicy: Delete DeletionPolicy: Delete Properties: - LogGroupName: !Sub "${Stage}-${AppName}-pipeline-execution-logs" - - UpdatePipelineArnParameter: + LogGroupName: !Sub ${Stage}-${AppName}-pipeline-execution-logs + RetentionInDays: 7 + UpdatePipelineStateMachineArnParameter: Type: AWS::SSM::Parameter Properties: Type: String - Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/UpdatePipelineArn' - Description: "ARN of gfe-db update pipeline state machine" + Name: !Sub /${AppName}/${Stage}/${AWS::Region}/UpdatePipelineStateMachineArn + Description: ARN of gfe-db update pipeline state machine Value: !GetAtt UpdatePipelineStateMachine.Arn - - # CloudWatch Alarm for state machine execution in progress + UpdatePipelineErrorsTopicPolicy: + Type: AWS::SNS::TopicPolicy + Properties: + Topics: + - !Ref UpdatePipelineErrorsTopic + PolicyDocument: + Version: 2012-10-17 + Statement: + - Effect: Allow + Principal: + Service: cloudwatch.amazonaws.com + Action: sns:Publish + Resource: !Ref UpdatePipelineErrorsTopic + Condition: + StringEquals: + aws:SourceAccount: !Ref AWS::AccountId + UpdatePipelineErrorsTopic: + Type: AWS::SNS::Topic + Properties: + DisplayName: !Sub "${AppName} Data Pipeline Errors" + UpdatePipelineErrorsTopicArnParameter: + Type: AWS::SSM::Parameter + Properties: + Type: String + Name: !Sub /${AppName}/${Stage}/${AWS::Region}/UpdatePipelineErrorsTopicArn + Description: !Sub ARN of ${AppName} Update Pipeline Errors SNS topic + Value: !GetAtt UpdatePipelineErrorsTopic.TopicArn + UpdatePipelineExecutionTopic: + Type: AWS::SNS::Topic + Properties: + DisplayName: !Sub "${AppName} Data Pipeline Executions" + UpdatePipelineExecutionTopicPolicy: + Type: 
AWS::SNS::TopicPolicy + Properties: + Topics: + - !Ref UpdatePipelineExecutionTopic + PolicyDocument: + Version: 2012-10-17 + Statement: + - Effect: Allow + Principal: + Service: cloudwatch.amazonaws.com + Action: sns:Publish + Resource: !Ref UpdatePipelineExecutionTopic + Condition: + StringEquals: + aws:SourceAccount: !Ref AWS::AccountId + UpdatePipelineExecutionTopicArnParameter: + Type: AWS::SSM::Parameter + Properties: + Type: String + Name: !Sub /${AppName}/${Stage}/${AWS::Region}/UpdatePipelineExecutionTopicArn + Description: !Sub ARN of ${AppName} Update Pipeline Executions SNS topic + Value: !GetAtt UpdatePipelineExecutionTopic.TopicArn + UpdatePipelineExecutionTopicNameParameter: + Type: AWS::SSM::Parameter + Properties: + Type: String + Name: !Sub "/${AppName}/${Stage}/${AWS::Region}/UpdatePipelineExecutionTopicName" + Description: !Sub "Name for ${AppName} Data Pipeline Errors SNS topic" + Value: !Ref UpdatePipelineExecutionTopic UpdatePipelineStateMachineExecutionAlarm: Type: AWS::CloudWatch::Alarm Properties: - AlarmDescription: !Sub '${UpdatePipelineStateMachine} state machine execution in progress' + AlarmDescription: !Sub ${UpdatePipelineStateMachine} state machine execution in progress ActionsEnabled: true AlarmActions: - - !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataPipelineExecutionTopicArn}}' + - !GetAtt UpdatePipelineExecutionTopic.TopicArn OKActions: - - !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataPipelineExecutionTopicArn}}' - Metrics: + - !GetAtt UpdatePipelineExecutionTopic.TopicArn + Metrics: - Id: m1 - MetricStat: - Metric: + MetricStat: + Metric: MetricName: ExecutionsStarted Namespace: AWS/States Dimensions: - Name: StateMachineArn Value: !GetAtt UpdatePipelineStateMachine.Arn - Period: 86400 + Period: 300 Stat: Sum Unit: Count ReturnData: false - Id: m2 - MetricStat: - Metric: + MetricStat: + Metric: MetricName: ExecutionsSucceeded Namespace: AWS/States Dimensions: - Name: StateMachineArn Value: !GetAtt UpdatePipelineStateMachine.Arn - Period: 86400 + Period: 300 Stat: Sum Unit: Count ReturnData: false - Id: m3 - MetricStat: - Metric: + MetricStat: + Metric: MetricName: ExecutionsFailed Namespace: AWS/States Dimensions: - Name: StateMachineArn Value: !GetAtt UpdatePipelineStateMachine.Arn - Period: 86400 + Period: 300 Stat: Sum Unit: Count ReturnData: false - Id: m4 - MetricStat: - Metric: + MetricStat: + Metric: MetricName: ExecutionsAborted Namespace: AWS/States Dimensions: - Name: StateMachineArn Value: !GetAtt UpdatePipelineStateMachine.Arn - Period: 86400 + Period: 300 Stat: Sum Unit: Count ReturnData: false - Id: e1 - Expression: "m1 - m2 - m3 - m4" + Expression: m1 - m2 - m3 - m4 Label: ExecutionsInProgress ReturnData: true ComparisonOperator: GreaterThanThreshold Threshold: 0 EvaluationPeriods: 1 + UpdatePipelineStateMachineExecutionAlarmNameParameter: + Type: AWS::SSM::Parameter + Properties: + Type: String + Name: !Sub /${AppName}/${Stage}/${AWS::Region}/UpdatePipelineStateMachineExecutionAlarmName + Description: !Sub 'Name of ${Stage}-${AppName} UpdatePipeline state machine execution alarm' + Value: !Ref UpdatePipelineStateMachineExecutionAlarm + + + LoadConcurrencyManagerLogGroup: + Type: AWS::Logs::LogGroup + UpdateReplacePolicy: Delete + DeletionPolicy: Delete + Properties: + LogGroupName: !Sub ${Stage}-${AppName}-load-concurrency-manager-execution-logs + RetentionInDays: 7 + LoadConcurrencyManagerStateMachineArnParameter: + Type: AWS::SSM::Parameter + Properties: + Type: String + Name: !Sub 
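`UpdatePipelineStateMachineExecutionAlarm` uses metric math (`m1 - m2 - m3 - m4`) to derive an `ExecutionsInProgress` count: executions started minus those that succeeded, failed, or were aborted in the period. Since the LCM state machine is granted `cloudwatch:DescribeAlarms` on it, the alarm state can serve as a gate before loading; a sketch of that check, using the alarm name published to Parameter Store above:

```python
import boto3

APP_NAME, STAGE, REGION = "gfe-db", "dev", "us-east-1"  # assumed values

ssm = boto3.client("ssm", region_name=REGION)
alarm_name = ssm.get_parameter(
    Name=f"/{APP_NAME}/{STAGE}/{REGION}/UpdatePipelineStateMachineExecutionAlarmName"
)["Parameter"]["Value"]

alarm = boto3.client("cloudwatch", region_name=REGION).describe_alarms(
    AlarmNames=[alarm_name]
)["MetricAlarms"][0]

# "ALARM" -> at least one update-pipeline execution is still in progress;
# "OK"    -> no executions in flight, safe to start a load.
print(alarm["StateValue"])
```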
/${AppName}/${Stage}/${AWS::Region}/LoadConcurrencyManagerStateMachineArn + Description: !Sub 'ARN of ${Stage}-${AppName} load concurrency handler state machine' + Value: !GetAtt LoadConcurrencyManagerStateMachine.Arn DisableBackupFunction: Type: AWS::Serverless::Function Properties: - FunctionName: !Ref DisableBackupFunctionName + FunctionName: !Sub ${Stage}-${AppName}-disable-backup CodeUri: functions/disable_backup/ Handler: app.lambda_handler Runtime: python3.11 @@ -805,12 +1341,12 @@ Resources: - AWSLambdaBasicExecutionRole - Version: "2012-10-17" Statement: - - Effect: "Allow" - Action: - - "ssm:GetParameters" - - "ssm:GetParameter" - Resource: - - !Sub 'arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:parameter/${AppName}/${Stage}/${AWS::Region}/*' + - Effect: Allow + Action: + - ssm:GetParameters + - ssm:GetParameter + Resource: + - !Sub arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:parameter/${AppName}/${Stage}/${AWS::Region}/* - Version: "2012-10-17" Statement: - Effect: "Allow" @@ -819,19 +1355,38 @@ Resources: Resource: - !Sub 'arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:maintenancewindow/{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/Neo4jBackupMaintenanceWindowId}}' Events: - PipelineExecutionTopic: + ExecutionsTrigger: Type: SNS Properties: - Topic: !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataPipelineExecutionTopicArn}}' + Topic: !GetAtt UpdatePipelineExecutionTopic.TopicArn + FormatResultsFunction: + Type: AWS::Serverless::Function + Properties: + FunctionName: !Sub ${Stage}-${AppName}-format-results + CodeUri: functions/format_results/ + Handler: app.lambda_handler + Runtime: python3.10 + MemorySize: 128 + Timeout: 60 + Architectures: + - x86_64 + Environment: + Variables: + STAGE: !Ref Stage + APP_NAME: !Ref AppName + Policies: + - AWSLambdaBasicExecutionRole + + + - # CloudWatch Alarm for failed pipeline executions UpdatePipelineStateMachineExecutionErrorsAlarm: Type: AWS::CloudWatch::Alarm Properties: - AlarmDescription: !Sub '${UpdatePipelineStateMachine} state machine errors' + AlarmDescription: !Sub ${UpdatePipelineStateMachine} state machine errors ActionsEnabled: true AlarmActions: - - !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataPipelineErrorsTopicArn}}' + - !GetAtt UpdatePipelineErrorsTopic.TopicArn MetricName: ExecutionsFailed Namespace: AWS/States Statistic: Sum @@ -843,14 +1398,17 @@ Resources: - Name: StateMachineArn Value: !GetAtt UpdatePipelineStateMachine.Arn - # CloudWatch Alarm for failed pipeline integrations (Batch jobs) + + + + # Activates if the Batch job fails, not integrated with other services UpdatePipelineStateMachineIntegrationAlarm: Type: AWS::CloudWatch::Alarm Properties: - AlarmDescription: !Sub '${UpdatePipelineStateMachine} state machine errors' + AlarmDescription: !Sub ${UpdatePipelineStateMachine} state machine errors ActionsEnabled: true AlarmActions: - - !Sub '{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataPipelineErrorsTopicArn}}' + - !GetAtt UpdatePipelineErrorsTopic.TopicArn MetricName: ServiceIntegrationsFailed Namespace: AWS/States Statistic: Sum @@ -860,74 +1418,92 @@ Resources: ComparisonOperator: GreaterThanOrEqualToThreshold Dimensions: - Name: ServiceIntegrationResourceArn - Value: !Sub 'arn:aws:states:${AWS::Region}:${AWS::AccountId}:batch:submitJob.sync' + Value: !Sub arn:aws:states:${AWS::Region}:${AWS::AccountId}:batch:submitJob.sync - # FailedAllelesQueue: - # Type: AWS::SQS::Queue - # Properties: - # VisibilityTimeout: 20 - # RedrivePolicy: - 
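`DisableBackupFunction` subscribes to the execution topic and is scoped to the Neo4j backup maintenance window, which suggests it pauses scheduled backups while a pipeline run is in flight. Its code is not part of this diff; a hedged sketch of what disabling the window could look like (re-enabling would pass `Enabled=True`):

```python
import boto3

APP_NAME, STAGE, REGION = "gfe-db", "dev", "us-east-1"  # assumed values

ssm = boto3.client("ssm", region_name=REGION)
window_id = ssm.get_parameter(
    Name=f"/{APP_NAME}/{STAGE}/{REGION}/Neo4jBackupMaintenanceWindowId"
)["Parameter"]["Value"]

# Pause the scheduled Neo4j backup while the pipeline is running.
ssm.update_maintenance_window(WindowId=window_id, Enabled=False)
```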
# deadLetterTargetArn: !GetAtt FailedAllelesDeadLetterQueue.Arn - # maxReceiveCount: 5 - # FailedAllelesDeadLetterQueue: - # Type: AWS::SQS::Queue - Neo4jLoadQueryDocument: + + + + # TODO move to database stack + Neo4jLoadQueryDocument: Type: AWS::SSM::Document Properties: - DocumentType: "Command" - DocumentFormat: "YAML" - TargetType: "/AWS::EC2::Instance" + DocumentType: Command + DocumentFormat: YAML + TargetType: /AWS::EC2::Instance Content: schemaVersion: "2.2" - description: !Sub "Load Neo4j for ${AppName}" + description: !Sub Load Neo4j for ${AppName} parameters: sourceType: - type: "String" - description: "S3" - default: "S3" + type: String + description: S3 + default: S3 sourceInfo: - type: "StringMap" - description: !Sub "Downloads all files under the ${AppName} scripts prefix" + type: StringMap + description: !Sub Downloads all files under the ${AppName} scripts prefix default: - path: !Sub 'https://{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketName}}.s3.amazonaws.com/config/scripts/' - commandLine: - type: "String" - description: "These commands are invoked by a Lambda script which sets the correct parameters (Refer to documentation)." - default: 'make neo4j.start && bash start_task.sh' + path: !Sub https://{{resolve:ssm:/${AppName}/${Stage}/${AWS::Region}/DataBucketName}}.s3.amazonaws.com/${ConfigS3Path}/database/scripts/ workingDirectory: type: "String" description: "Working directory" default: "/home/ec2-user" executionTimeout: - type: "String" - description: "(Optional) The time in seconds for a command to complete before it is considered to have failed. Default is 3600 (1 hour). Maximum is 28800 (8 hours)." + type: String + description: (Optional) The time in seconds for a command to complete before it is considered to have failed. Default is 3600 (1 hour). Maximum is 28800 (8 hours). 
default: "28800" + LoadEvent: + type: String + description: "JSON string containing load event details" mainSteps: - - action: "aws:downloadContent" - name: "downloadContent" + - action: aws:downloadContent + name: downloadContent inputs: sourceType: "{{ sourceType }}" sourceInfo: "{{ sourceInfo }}" destinationPath: "{{ workingDirectory }}" - - action: "aws:runShellScript" - name: "runShellScript" - inputs: + - action: aws:runShellScript + name: runShellScript + inputs: runCommand: - "" - - "directory=$(pwd)" - - "export PATH=$PATH:$directory" - - " {{ commandLine }} " + - directory=$(pwd) + - export PATH=$PATH:$directory + - 'echo ''{{ LoadEvent }}'' > load_event.json' + - 'make neo4j.start && bash start_task.sh "$(cat load_event.json)"' - "" workingDirectory: "{{ workingDirectory }}" timeoutSeconds: "{{ executionTimeout }}" - Neo4jLoadQueryDocumentNameParameter: Type: AWS::SSM::Parameter Properties: Type: String - Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/Neo4jLoadQueryDocumentName' - Description: "Name of SSM document for loading Neo4j" - Value: !Ref Neo4jLoadQueryDocument - - \ No newline at end of file + Name: !Sub /${AppName}/${Stage}/${AWS::Region}/Neo4jLoadQueryDocumentName + Description: Name of SSM document for loading Neo4j + Value: !Ref Neo4jLoadQueryDocument + GfedbPipelineParamMappingsParameter: + Type: AWS::SSM::Parameter + Properties: + Type: String + Name: !Sub /${AppName}/${Stage}/${AWS::Region}/GfedbPipelineParamMappings + Description: !Sub SSM Parameter and SecretsManager parameter paths for ${AppName} pipeline layer + Tier: Standard + Value: !Sub | + { + "ssm": [ + "/${AppName}/${Stage}/${AWS::Region}/GitHubSourceRepository", + "/${AppName}/${Stage}/${AWS::Region}/GfeDbExecutionStateTableName", + "/${AppName}/${Stage}/${AWS::Region}/GfeDbExecutionStateTableFields", + "/${AppName}/${Stage}/${AWS::Region}/BuildJobQueueArn", + "/${AppName}/${Stage}/${AWS::Region}/BuildServiceRepositoryName", + "/${AppName}/${Stage}/${AWS::Region}/GfeDbProcessingQueueUrl", + "/${AppName}/${Stage}/${AWS::Region}/GfeDbLoadQueueUrl", + "/${AppName}/${Stage}/${AWS::Region}/UpdatePipelineStateMachineArn", + "/${AppName}/${Stage}/${AWS::Region}/LoadConcurrencyManagerStateMachineArn", + "/${AppName}/${Stage}/${AWS::Region}/Neo4jLoadQueryDocumentName", + "/${AppName}/${Stage}/${AWS::Region}/DatabaseSyncScriptsDocumentName" + ], + "secretsmanager": [ + "/${AppName}/${Stage}/${AWS::Region}/GitHubPersonalAccessToken" + ] + } diff --git a/notebooks/0.0-github-api-commits-by-branch-eda.ipynb b/notebooks/0.0-github-api-commits-by-branch-eda.ipynb new file mode 100644 index 00000000..5b729274 --- /dev/null +++ b/notebooks/0.0-github-api-commits-by-branch-eda.ipynb @@ -0,0 +1,815 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GitHub API EDA" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/8s/9sb2nsn913q7b4zz75fd_qf00000gn/T/ipykernel_1653/1475131825.py:8: DeprecationWarning: \n", + "Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n", + "(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n", + "but was not found to be installed on your system.\n", + "If this would cause problems for you,\n", + "please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n", + " \n", + " import 
pandas as pd\n" + ] + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv, find_dotenv\n", + "load_dotenv('/Users/ammon/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/.env.nmdpf');\n", + "from itertools import chain, starmap\n", + "from pathlib import Path\n", + "import json\n", + "import requests\n", + "import pandas as pd\n", + "\n", + "# Pandas display options\n", + "pd.set_option('display.max_rows', None)\n", + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.max_colwidth', None)\n", + "pd.set_option('display.width', None)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Environment variables\n", + "AWS_REGION = os.environ[\"AWS_REGION\"] \n", + "GITHUB_PERSONAL_ACCESS_TOKEN = os.environ[\"GITHUB_PERSONAL_ACCESS_TOKEN\"]\n", + "GITHUB_REPOSITORY_OWNER = \"ANHIG\" # os.environ[\"GITHUB_REPOSITORY_OWNER\"]\n", + "GITHUB_REPOSITORY_NAME = \"IMGTHLA\" # os.environ[\"GITHUB_REPOSITORY_NAME\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "root_dir = Path('.').resolve().parent" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def flatten_json(dictionary, sep='.', skip_fields=[]):\n", + " \"\"\"Flatten a nested json file. For a list of dictionaries, use this\n", + " inside a for loop before converting to pandas DataFrame.\"\"\"\n", + "\n", + " def unpack(parent_key, parent_value):\n", + " \"\"\"Unpack one level of nesting in json file\"\"\"\n", + " # Unpack one level only!!!\n", + " \n", + " if isinstance(parent_value, dict):\n", + " for key, value in parent_value.items():\n", + " temp1 = parent_key + sep + key\n", + " yield temp1, value\n", + " elif isinstance(parent_value, list):\n", + " i = 0 \n", + " for value in parent_value:\n", + " temp2 = parent_key + sep +str(i) \n", + " i += 1\n", + " yield temp2, value\n", + " else:\n", + " yield parent_key, parent_value \n", + "\n", + "\n", + " # Keep iterating until the termination condition is satisfied\n", + " while True:\n", + " # Keep unpacking the json file until all values are atomic elements (not dictionary or list)\n", + " dictionary = dict(chain.from_iterable(starmap(unpack, dictionary.items())))\n", + " # Terminate condition: not any value in the json file is dictionary or list\n", + " if not any(isinstance(value, dict) for value in dictionary.values()) and \\\n", + " not any(isinstance(value, list) for value in dictionary.values()):\n", + " break\n", + "\n", + " return dictionary\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def get_commits(owner, repo, per_page=100):\n", + " \"\"\"Return a list of GitHub commits for the specified repository\"\"\"\n", + "\n", + " base_url = 'https://api.github.com'\n", + "\n", + " # Endpoint\n", + " endpoint = f'/repos/{owner}/{repo}/commits?per_page={per_page}'\n", + "\n", + " url = base_url + endpoint\n", + "\n", + " # Headers\n", + " headers = {\n", + " 'Authorization': f'token {GITHUB_PERSONAL_ACCESS_TOKEN}',\n", + " 'Content-Type': 'application/json',\n", + " 'Accept': 'application/vnd.github.v3+json',\n", + " 'X-GitHub-Api-Version': '2022-11-28'\n", + " }\n", + "\n", + " response = requests.get(url, headers=headers)\n", + "\n", + " return response.json()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def 
get_commit(owner, repo, commit_sha):\n", + " \"\"\"Return the commit for the specified repository and commit SHA\"\"\"\n", + "\n", + " base_url = 'https://api.github.com'\n", + "\n", + " # Endpoint\n", + " endpoint = f'/repos/{owner}/{repo}/commits/{commit_sha}'\n", + " url = base_url + endpoint\n", + "\n", + " # Headers\n", + " headers = {\n", + " 'Authorization': f'token {GITHUB_PERSONAL_ACCESS_TOKEN}',\n", + " 'Content-Type': 'application/json',\n", + " 'Accept': 'application/vnd.github.v3+json',\n", + " 'X-GitHub-Api-Version': '2022-11-28'\n", + " }\n", + "\n", + " response = requests.get(url, headers=headers)\n", + "\n", + " return response.json()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def get_branches(owner, repo):\n", + " \"\"\"Fetch branches for a GitHub repository\"\"\"\n", + "\n", + " base_url = 'https://api.github.com'\n", + "\n", + " # Endpoint\n", + " endpoint = f'/repos/{owner}/{repo}/branches'\n", + " url = base_url + endpoint\n", + "\n", + " # Headers\n", + " headers = {\n", + " 'Authorization': f'token {GITHUB_PERSONAL_ACCESS_TOKEN}',\n", + " 'Content-Type': 'application/json',\n", + " 'Accept': 'application/vnd.github.v3+json',\n", + " 'X-GitHub-Api-Version': '2022-11-28'\n", + " }\n", + "\n", + " response = requests.get(url, headers=headers)\n", + " branches = response.json()\n", + "\n", + " return branches" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def get_branch(owner, repo, branch_name):\n", + " \"\"\"Fetch branches for a GitHub repository\"\"\"\n", + "\n", + " base_url = 'https://api.github.com'\n", + "\n", + " # Endpoint\n", + " endpoint = f'/repos/{owner}/{repo}/branches/{branch_name}'\n", + " url = base_url + endpoint\n", + "\n", + " # Headers\n", + " headers = {\n", + " 'Authorization': f'token {GITHUB_PERSONAL_ACCESS_TOKEN}',\n", + " 'Content-Type': 'application/json',\n", + " 'Accept': 'application/vnd.github.v3+json',\n", + " 'X-GitHub-Api-Version': '2022-11-28'\n", + " }\n", + "\n", + " response = requests.get(url, headers=headers)\n", + " branches = response.json()\n", + "\n", + " return branches" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Function to fetch pull requests\n", + "def fetch_pull_requests(owner, repo):\n", + " url = f\"https://api.github.com/repos/{owner}/{repo}/pulls?state=all\"\n", + " \n", + " # Headers\n", + " headers = {\n", + " 'Authorization': f'token {GITHUB_PERSONAL_ACCESS_TOKEN}',\n", + " 'Content-Type': 'application/json',\n", + " 'Accept': 'application/vnd.github.v3+json',\n", + " 'X-GitHub-Api-Version': '2022-11-28'\n", + " }\n", + " response = requests.get(url, headers=headers)\n", + "\n", + " if response.status_code == 200:\n", + " return response.json()\n", + " else:\n", + " print(f\"Error: {response.status_code}\")\n", + " return []" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Commits by Branch\n", + "This data was previously downloaded as a JSON file." 
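The helper functions above issue a single request with `per_page=100`, which caps the result at 100 items; the full IMGT/HLA commit history needs pagination (handled later in the notebook with `pygethub`'s `GitHubPaginator`). For reference, a plain-`requests` sketch that follows the `Link` header instead:

```python
import requests

def get_all_commits(owner, repo, token, per_page=100):
    """Page through /commits by following the Link header; one request
    returns at most `per_page` (maximum 100) commits."""
    url = f"https://api.github.com/repos/{owner}/{repo}/commits"
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3+json",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    params = {"per_page": per_page}
    commits = []
    while url:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        commits.extend(response.json())
        url = response.links.get("next", {}).get("url")  # None after the last page
        params = None  # the "next" URL already carries the query string
    return commits
```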
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from pygethub import list_branches, list_commits, GitHubPaginator" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# paginator = GitHubPaginator(GITHUB_PERSONAL_ACCESS_TOKEN)\n", + "\n", + "# # BRANCHES\n", + "# branch_pages = paginator.get_paginator(list_branches, owner=GITHUB_REPOSITORY_OWNER, repo=GITHUB_REPOSITORY_NAME)\n", + "# all_branches = list(branch_pages)\n", + "\n", + "# # TODO 2/10/24\n", + "# # TODO extract the branch names\n", + "# branch_names = [branch[\"name\"] for branch in all_branches]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# commits_by_branch = {}\n", + "# for branch in branch_names:\n", + "# list_commits_params = {\n", + "# \"owner\": GITHUB_REPOSITORY_OWNER,\n", + "# \"repo\": GITHUB_REPOSITORY_NAME,\n", + "# \"sha\": branch,\n", + "# }\n", + "# branch_commit_pages = paginator.get_paginator(\n", + "# list_commits, \n", + "# **list_commits_params,\n", + "# user_agent=\"nmdp-bioinformatics-gfe-db-state-builder/1.0\")\n", + "# commits_by_branch[branch] = list(branch_commit_pages)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# with open(root_dir / \"commits-by-branch.json\", \"w\") as f:\n", + "# json.dump(commits_by_branch, f, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# load commits-by-branch.json\n", + "with open(root_dir / \"commits-by-branch.json\", \"r\") as f:\n", + " commits_by_branch = json.load(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Commits by Branch EDA\n", + "\n", + "Reshape commits_by_branch JSON using dict and list comprehensions\n", + "Example of input structure\n", + "```json\n", + "{\n", + " \"300\": [\n", + " {\n", + " \"sha\": \"ba5cb3d05c7b3ba5024cdafa192d89af186f08a9\",\n", + " \"node_id\": \"MDY6Q29tbWl0MjQ1NDAxMzY6YmE1Y2IzZDA1YzdiM2JhNTAyNGNkYWZhMTkyZDg5YWYxODZmMDhhOQ==\",\n", + " \"commit\": {\n", + " \"author\": {\n", + " \"name\": \"anhig\",\n", + " \"email\": \"james.robinson@anthonynolan.org\",\n", + " \"date\": \"2017-06-07T13:49:28Z\"\n", + " },\n", + " \"committer\": {\n", + " \"name\": \"anhig\",\n", + " \"email\": \"james.robinson@anthonynolan.org\",\n", + " \"date\": \"2017-06-07T13:49:28Z\"\n", + " },\n", + " \"message\": \"Addition of historical WMDA files\\n\\nAddition of historical WMDA files\",\n", + " \"tree\": {\n", + " \"sha\": \"9eafc92b0944c5e08f7c4b9faeb75c491d293a8a\",\n", + " \"url\": \"https://api.github.com/repos/ANHIG/IMGTHLA/git/trees/9eafc92b0944c5e08f7c4b9faeb75c491d293a8a\"\n", + " },\n", + " \"url\": \"https://api.github.com/repos/ANHIG/IMGTHLA/git/commits/ba5cb3d05c7b3ba5024cdafa192d89af186f08a9\",\n", + " \"comment_count\": 0,\n", + " \"verification\": {\n", + " \"verified\": false,\n", + " \"reason\": \"unsigned\",\n", + " \"signature\": null,\n", + " \"payload\": null\n", + " }\n", + " }\n", + " },\n", + " ...\n", + " ],\n", + " ...\n", + "}\n", + "```\n", + "Example of output structure\n", + "```json\n", + "[\n", + " {\n", + " \"branch\": \"300\",\n", + " \"sha\": \"ba5cb3d05c7b3ba5024cdafa192d89af186f08a9\",\n", + " \"node_id\": \"MDY6Q29tbWl0MjQ1NDAxMzY6YmE1Y2IzZDA1YzdiM2JhNTAyNGNkYWZhMTkyZDg5YWYxODZmMDhhOQ==\",\n", + " \"commit\": {\n", + " \"author\": {\n", + " 
\"name\": \"anhig\",\n", + " \"email\": \"james.robinson@anthonynolan.org\",\n", + " \"date\": \"2017-06-07T13:49:28Z\"\n", + " },\n", + " \"committer\": {\n", + " \"name\": \"anhig\",\n", + " \"email\": \"james.robinson@anthonynolan.org\",\n", + " \"date\": \"2017-06-07T13:49:28Z\"\n", + " },\n", + " \"message\": \"Addition of historical WMDA files\\n\\nAddition of historical WMDA files\",\n", + " \"tree\": {\n", + " \"sha\": \"9eafc92b0944c5e08f7c4b9faeb75c491d293a8a\",\n", + " \"url\": \"https://api.github.com/repos/ANHIG/IMGTHLA/git/trees/9eafc92b0944c5e08f7c4b9faeb75c491d293a8a\"\n", + " },\n", + " \"url\": \"https://api.github.com/repos/ANHIG/IMGTHLA/git/commits/ba5cb3d05c7b3ba5024cdafa192d89af186f08a9\",\n", + " \"comment_count\": 0,\n", + " \"verification\": {\n", + " \"verified\": false,\n", + " \"reason\": \"unsigned\",\n", + " \"signature\": null,\n", + " \"payload\": null\n", + " }\n", + " }\n", + " },\n", + " ...\n", + "]\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "commits_by_branch_list = [\n", + " {\n", + " \"branch\": branch,\n", + " **commit\n", + " }\n", + " for branch, commits in commits_by_branch.items()\n", + " for commit in commits\n", + "]\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# flatten the commits_by_branch_list\n", + "commits_by_branch_list_flat = [flatten_json(commit) for commit in commits_by_branch_list]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# load to pandas DataFrame\n", + "commits_by_branch_df = pd.DataFrame(commits_by_branch_list_flat)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The aggregation below shows that a single sha can be associated with multiple branches. This means that we cannot rely on the branch name to indicate the release version the commit was made for.\n", + "\n", + "Also notice that some commits are associated with only one branch. These are the commits that were missing from calling list_commits which defaults to the master branch, which in this case is called 'Latest'." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# group by sha and find unique branches, then include a column for the number of branches and sort from least to most\n", + "commits_by_sha = commits_by_branch_df.groupby(\"sha\").agg(\n", + " branches=(\"branch\", \"unique\"),\n", + " date=(\"commit.author.date\", \"first\"),\n", + " num_branches=(\"branch\", \"nunique\"),\n", + " # html_url=(\"html_url\", \"first\"),\n", + ").sort_values(\"date\").reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# commits_by_sha" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
shabranchesdatenum_branches
270e1cd1ec3e66f4ab2b218f6758ed315f557778655[3130]2017-06-21T14:40:46Z1
\n", + "
" + ], + "text/plain": [ + " sha branches date \\\n", + "270 e1cd1ec3e66f4ab2b218f6758ed315f557778655 [3130] 2017-06-21T14:40:46Z \n", + "\n", + " num_branches \n", + "270 1 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commits_by_sha[commits_by_sha[\"sha\"] == \"e1cd1ec3e66f4ab2b218f6758ed315f557778655\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "739" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(commits_by_sha)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Commits by Branch Processing\n", + "Reduce commits-by-branch JSON to a list of unique commits." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# Create an array of all unique commits in commits_by_branch and omit the branch information\n", + "unique_commits = set()\n", + "for release, commits in commits_by_branch.items():\n", + " unique_commits.update([json.dumps(commit) for commit in commits])\n", + "\n", + "# covert back to dict\n", + "unique_commits = [json.loads(commit) for commit in unique_commits]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "739" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(unique_commits)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "unique_commits_flat = [flatten_json(commit) for commit in unique_commits]\n", + "unique_commits_df = pd.DataFrame(unique_commits_flat).sort_values(\"commit.author.date\").reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "739" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# sort by date\n", + "len(unique_commits_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# unique_commits_df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "# unique_commits_df[['sha', 'commit.author.date', 'commit.message', 'html_url']]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Handling Error SHAs\n", + "Some SHAs do not allow files to be retrieved using the GitHub REST API:\n", + "```json\n", + "[\n", + " \"8d77b3dd93959663d58ae5b626289d0746edd0e7\",\n", + " \"252d7c5dc9d2f7671447fd11fe6bb004c438f34b\",\n", + " \"e1cd1ec3e66f4ab2b218f6758ed315f557778655\",\n", + " \"fa208da83a7f96d62c1e4efee2018074bbd805e0\",\n", + " \"09ed08b9abcd97622d59ec37e31b4706dc9a9391\",\n", + " \"8db938b1eb58dd8c77cba9b7524f84cf8ffe719c\",\n", + " \"041318439bf0ba291f990faaa27cd6ad0a062d13\",\n", + " \"ba5cb3d05c7b3ba5024cdafa192d89af186f08a9\",\n", + " \"7ca4eb239a96884142d3ef0b0182d3bc84ec1bba\",\n", + " \"3abe7e12dcbc3824315959af4428c53bd760c6e7\",\n", + " \"c4d3f67ef7ef4b5f6571b4f1d4aa5b928d2a3d56\",\n", + " \"23044ee80c27f75bb34c9f9ac689b1c68cd65914\"\n", + "]\n", + "```\n", + "\n", + "In this case version 300 is still missing." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Most Recent Commit by Branch\n", + "Evaluating API responses with the objective of finding the most recent data for a given release.\n", + "- All releases are available as branches" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Page 1: 57 items\n" + ] + } + ], + "source": [ + "paginator = GitHubPaginator(GITHUB_PERSONAL_ACCESS_TOKEN)\n", + "\n", + "### COMMITS BY BRANCHES ###\n", + "branch_pages = paginator.get_paginator(\n", + " list_branches, \n", + " owner=GITHUB_REPOSITORY_OWNER, \n", + " repo=GITHUB_REPOSITORY_NAME,\n", + " user_agent=\"nmdp-bioinformatics-gfe-db-state-builder/1.0\"\n", + ")\n", + "all_branches = list(branch_pages)\n", + "\n", + "# # extract the branch names\n", + "# branch_names = [branch[\"name\"] for branch in all_branches]" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'name': '3550',\n", + " 'commit': {'sha': 'e4fd1e39a4d9f1da8e7efe4a7f699320e287dcdb',\n", + " 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/e4fd1e39a4d9f1da8e7efe4a7f699320e287dcdb'},\n", + " 'protected': False}" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_branches[-2]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'name': '3520',\n", + " 'commit': {'sha': '62945381d236dcdb2770daf1fa861b216b99635c',\n", + " 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/62945381d236dcdb2770daf1fa861b216b99635c'},\n", + " 'protected': False},\n", + " {'name': '3530',\n", + " 'commit': {'sha': '83aa94b540407ccdfcb452c77439b86c543f849d',\n", + " 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/83aa94b540407ccdfcb452c77439b86c543f849d'},\n", + " 'protected': False},\n", + " {'name': '3540',\n", + " 'commit': {'sha': '7d00d7b49cbcc987e07752845bd8b14986316ab4',\n", + " 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/7d00d7b49cbcc987e07752845bd8b14986316ab4'},\n", + " 'protected': False},\n", + " {'name': '3550',\n", + " 'commit': {'sha': 'e4fd1e39a4d9f1da8e7efe4a7f699320e287dcdb',\n", + " 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/e4fd1e39a4d9f1da8e7efe4a7f699320e287dcdb'},\n", + " 'protected': False},\n", + " {'name': 'Latest',\n", + " 'commit': {'sha': 'df6ba6f80a2c5f999590f06fced6c4c4ff56b89d',\n", + " 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/df6ba6f80a2c5f999590f06fced6c4c4ff56b89d'},\n", + " 'protected': False}]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_branches[-5:]" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "# write json to file for all-branches\n", + "with open(root_dir / \"all-branches.json\", \"w\") as f:\n", + " json.dump(all_branches, f, indent=4)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "orig_nbformat": 4 + }, + "nbformat": 
4, + "nbformat_minor": 2 +} diff --git a/notebooks/0.0-process-updated-releases.ipynb b/notebooks/0.0-process-updated-releases.ipynb new file mode 100644 index 00000000..9d3aa4db --- /dev/null +++ b/notebooks/0.0-process-updated-releases.ipynb @@ -0,0 +1,968 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GitHub API EDA\n", + "\n", + "Application state variables:\n", + "```json\n", + "{\n", + " \"current_release\": \"3510\",\n", + " \"last_processed_commit\": {\n", + " \"sha\": \"5f2c562056f8ffa89aeea0631f2a52300ee0de17\",\n", + " \"date\": \"2023-01-13T10:04:48Z\"\n", + " },\n", + " \"tracked_assets\": [\n", + " \"hla.dat\",\n", + " \"msf/\"\n", + " ]\n", + "}\n", + "```\n", + "\n", + "## Strategy\n", + "### 1. App State\n", + "* Fetch app state from S3\n", + "* Update app state at end of execution\n", + "### 2. Asset Processing\n", + "* For each tracked asset:\n", + " * Fetch the commits for the asset\n", + " * Filter by the last processed commit date\n", + "* Merge the commits for each asset into a single list → array of commits\n", + "* If array is empty (no commits found), exit\n", + "* If array is not empty (commits are found)\n", + " * Get the release version for each commit ← needs strategy\n", + " * Build the release" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('/Users/ammon/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/gfe-db/pipeline/functions/get_repo_updates/')\n", + "import logging\n", + "from datetime import datetime\n", + "import re\n", + "import json\n", + "import pandas as pd\n", + "from src.utils import (\n", + " load_state,\n", + " get_commits_for_asset,\n", + " get_repo_asset,\n", + " get_commits,\n", + " flatten_json\n", + ")\n", + "\n", + "# logging\n", + "logging.basicConfig(level=logging.INFO)\n", + "logger = logging.getLogger(__name__)\n", + "\n", + "# Pandas display options\n", + "pd.set_option('display.max_rows', None)\n", + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.max_colwidth', None)\n", + "pd.set_option('display.width', None)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "GITHUB_REPOSITORY_OWNER = \"ANHIG\" # os.environ[\"GITHUB_REPOSITORY_OWNER\"]\n", + "GITHUB_REPOSITORY_NAME = \"IMGTHLA\" # os.environ[\"GITHUB_REPOSITORY_NAME\"]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### App State" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "app_state = {\n", + " \"last_processed_release\": {\n", + " \"release\": 3510,\n", + " \"sha\": \"ecd63776c6225af0cf8bcc9fa9c6998d3129fb14\",\n", + " \"date\": \"2022-04-14T11:00:42Z\",\n", + " \"status\": \"SUCCESS\"\n", + " },\n", + " \"tracked_assets\": [\n", + " \"hla.dat\",\n", + " \"msf/\"\n", + " ]\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "state = load_state(app_state)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Asset Processing" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "asset_commits = []\n", + "for asset in 
state['tracked_assets']:\n", + " commits = get_commits_for_asset(\n", + " owner=GITHUB_REPOSITORY_OWNER,\n", + " repo=GITHUB_REPOSITORY_NAME,\n", + " path=asset,\n", + " since=state['last_processed_commit']['date']\n", + " )\n", + " asset_commits.extend(commits)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "# Filter out commits before last processed commit\n", + "unique_shas = list(set([(item[\"sha\"], item[\"commit\"][\"author\"][\"date\"]) for item in asset_commits \\\n", + " if datetime.strptime(item[\"commit\"][\"author\"][\"date\"], \"%Y-%m-%dT%H:%M:%SZ\") > state['last_processed_commit']['date']]))" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('def376dc6955b339b17f0a4b840e80eb6b9c744b', '2023-04-17T16:01:01Z'),\n", + " ('2d38d3313229fdc5f8aa00052a2db21b35be3d2d', '2022-10-14T08:46:01Z'),\n", + " ('72a9e28a52c9629dd63dfad5f215cdc562e2fd7e', '2022-07-14T12:28:41Z'),\n", + " ('2c631a4b61d529ff1c0635750888f6f6d79c2703', '2022-10-13T12:58:37Z'),\n", + " ('8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', '2022-08-25T15:43:12Z'),\n", + " ('1a3be9a5d01a414854ff3bfacd5257c14adeefa2', '2022-07-14T13:40:17Z'),\n", + " ('4486f5c623705c6a14d9eeaba7d155cff30cdb43', '2023-01-12T14:36:43Z')]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unique_shas" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Release versions by sha[[\"def376dc6955b339b17f0a4b840e80eb6b9c744b\", 3520], [\"4486f5c623705c6a14d9eeaba7d155cff30cdb43\", 3510], [\"2d38d3313229fdc5f8aa00052a2db21b35be3d2d\", 3500], [\"2c631a4b61d529ff1c0635750888f6f6d79c2703\", 3500], [\"8f80f24d49797595d8a18b8d4d1f59846fbf3fe1\", 3490], [\"1a3be9a5d01a414854ff3bfacd5257c14adeefa2\", 3490], [\"72a9e28a52c9629dd63dfad5f215cdc562e2fd7e\", 3480]]\n", + "INFO:__main__:Unique releases:\n", + " [3520, 3490, 3500, 3510, 3480]\n" + ] + } + ], + "source": [ + "# get the releases for each unique commit from Allelelist.txt\n", + "# can produce duplicate release versions if the same release is updated more than once\n", + "# makes the assumption that the release version branch is up to date for that release, since the build process targets the release version branch and not the specific commit sha\n", + "release_version_re = r\"# version: IPD-IMGT/HLA (\\d+\\.\\d+\\.\\d+)\"\n", + "release_versions = []\n", + "for sha, _ in unique_shas:\n", + " allele_list = get_repo_asset(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME, \"Allelelist.txt\", sha)\n", + " release_version = int(re.search(release_version_re, allele_list).group(1).replace(\".\", \"\"))\n", + " release_versions.append((sha, release_version))\n", + "\n", + "release_versions.sort(key=lambda x: x[1], reverse=False)\n", + "logger.info(f\"Release versions by sha:\\n{json.dumps(release_versions)}\")\n", + "\n", + "unique_release_versions = list(set([version[1] for version in release_versions]))\n", + "logger.info(f\"Unique releases:\\n {json.dumps(unique_release_versions)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[3520, 3490, 3500, 3510, 3480]" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# send these to the state 
machine\n", + "unique_release_versions" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Testing" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "# sort commits by most recent date on [].commit.author.date\n", + "asset_commits.sort(key=lambda x: x[\"commit\"][\"author\"][\"date\"], reverse=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "all_commits = get_commits(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "select_keys = [\"sha\", \"commit\"]\n", + "\n", + "# filter by select_keys\n", + "all_commits = [{k: v for k, v in x.items() if k in select_keys} for x in all_commits]" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame([flatten_json(commit) for commit in all_commits])[[\"sha\", \"commit.author.date\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
shacommit.author.date
038398a75e9762ff070d8e9bd714d074332646cd72023-04-17T16:03:52Z
1def376dc6955b339b17f0a4b840e80eb6b9c744b2023-04-17T16:01:01Z
25f2c562056f8ffa89aeea0631f2a52300ee0de172023-01-13T10:04:48Z
34b8432c7d56121c84d6ef1d75a1c7185c628c13d2023-01-12T14:47:00Z
44486f5c623705c6a14d9eeaba7d155cff30cdb432023-01-12T14:36:43Z
550b790037030d958b662085c3f4cf34ba72a32ec2022-12-14T10:02:54Z
636220a1c5c2d6954f4873a552544cc0e55b61d0a2022-12-14T10:02:28Z
7e941759874365cb152a3562c22d10847d10db3262022-10-14T08:47:37Z
82d38d3313229fdc5f8aa00052a2db21b35be3d2d2022-10-14T08:46:01Z
91ce31fc9e2805034578eff60a269c02176f032522022-10-13T13:06:12Z
102c631a4b61d529ff1c0635750888f6f6d79c27032022-10-13T12:58:37Z
1112b4b94bb49b4640fa48a1182cc9b1e1fbf7e8162022-08-25T15:44:07Z
128f80f24d49797595d8a18b8d4d1f59846fbf3fe12022-08-25T15:43:12Z
13a9680a9e80e2c119d4aa41f90e5f40cef6e7df022022-08-21T09:31:21Z
141fd937e6c0ff8452f64152aec9632f0586f981d12022-08-21T09:29:01Z
1550e92c677f4cd547c32bb9305269a762a95950742022-07-23T12:54:16Z
168732e7d4739a911e01f69d2c5bda82ca15ca68a92022-07-23T12:53:45Z
17536e8833f3be4bbcffbfba43cd0b3043c5bf40682022-07-15T08:21:31Z
18c90cc62006b35061c8db4b6f8da1b86de7650b232022-07-15T08:20:46Z
191a3be9a5d01a414854ff3bfacd5257c14adeefa22022-07-14T13:40:17Z
204863b8f0a070d70836dfbdc00bdf70aa7bb663452022-07-14T12:31:36Z
2172a9e28a52c9629dd63dfad5f215cdc562e2fd7e2022-07-14T12:28:41Z
22dddc6afc29de895f4131693e17b3cc856ff23f8d2022-07-14T12:27:25Z
23d1dd2ac5e2b6a8abef9e33ed397807ece52a22fe2022-04-20T08:39:00Z
244d33035dd296826bbde200e17e5281910ac8be132022-04-20T08:37:44Z
254a13df461c42f970a099de77377f309995e7995c2022-04-19T08:30:24Z
265e1d9764c8e96749bc11d52807eea1f7cc38ce5c2022-04-14T11:01:59Z
27ecd63776c6225af0cf8bcc9fa9c6998d3129fb142022-04-14T11:00:42Z
28cada41a6bfac5a8bf88ed2107a0b856b9b9785a02022-03-15T14:27:59Z
29db33aee5dc0b44251b64ff4c2e2f05d59e4e3a2d2022-03-15T14:27:31Z
30207cdce7fef5dae54fdad4cc4f933ebd1ab13c5e2022-02-10T15:36:59Z
318ab272288e81fa9a49fd4765579b066c2c03cc102022-02-10T15:26:19Z
32d4c410cd91618a8a74a7763598e499f6a8aa168b2022-01-14T15:25:23Z
33db46d2a0209720c5fa0ab8b03403f7009f69d2d62022-01-14T12:27:52Z
34ebb3d8c6aa383567bb7794cacf745b1c82efbb7f2022-01-14T12:27:35Z
35fc712c5a4dd00f534e845982a29f46a14e22b2922022-01-14T08:05:58Z
36239bf68f403bbdbabb43fa0277040f4dda2b2aff2022-01-13T20:33:38Z
37d86bdd9983424c792691269fa10e6f022ccf21912022-01-13T16:53:03Z
3806ceff14b2db920d458dc337b1100dced992e6272022-01-13T16:52:15Z
393fb27698fe4ebbd5ec4a0e565d97a53fe48c7cdd2022-01-04T09:19:02Z
400a051a92e0baab68b4c77ad13b8cb4e7eaa2f8102022-01-04T09:18:33Z
41d76ca7d75891aac95205b2cc11510ce8ef41a65a2021-11-11T11:49:26Z
42178c68cca2044cbd5032cbff23e4c06b738aed2a2021-11-11T11:48:57Z
432d19daf42ff47c2f2c4e7970dd77ed32768eb38f2021-10-26T11:32:30Z
443f46491c0fe377fd922412c93137efb6a4879b052021-10-26T11:15:21Z
458dd2a04b1d7e559f5124d012f0a729d0180216d22021-10-11T15:33:46Z
46ddda708da9f2f41ae7e7d122973c2eddb141d9492021-10-11T15:29:53Z
478d77b3dd93959663d58ae5b626289d0746edd0e72021-10-11T15:13:57Z
48d53cfe39089bd2ce40cf0fa06167d0ae38e3e4ea2021-10-11T14:48:05Z
49389b55b1c3ef58ea1136fb0c9e6d2ce73038d6552021-10-11T13:16:09Z
505088650c144e6a3ee05ac1e015d487a86095e6652021-10-11T12:53:06Z
517dc4672795eaea35bd8c7e1fa2ec40cf948fc2fd2021-09-17T15:49:56Z
52352813601073bcdc3d5eb08d1be07be904276a9a2021-09-17T15:49:21Z
536c613ce701353f42b6c911801f0aeb78413fac9d2021-08-20T15:25:29Z
548f892b2675dd6464f526d32f1ad1d3fd62fa3b032021-08-20T14:12:35Z
55a231c3b065a7b9c86c92915f3b4bcf012ec361752021-07-20T11:13:25Z
561a3007e433ed7ccffc4a2c7c36ed8013d6105d4d2021-07-20T10:34:46Z
574a8d93e8607e2413edee71da68cade6e11249fea2021-07-19T11:27:37Z
58d3fd19b04d4256600e8c90a27a2b317d61514c862021-07-19T11:23:30Z
59fd63f824a11eead1f7fb39457251c7ac27ad1f932021-07-15T08:07:22Z
6089b57278869c7b46fbbaf94825ed336328f3567a2021-07-15T07:59:10Z
616fdf7add8bb4f416bd8660e6b826d80e8759dfe12021-07-14T06:19:03Z
62b57de92d58da2bf787e5af34ab09e0e8ee6cea6c2021-07-13T16:04:39Z
634cdd02827d4b0d2d8834e45baaea621663e0d6322021-07-13T16:03:03Z
64e6a3f0cc5710495475e7b959ee82c649978843f82021-07-13T16:01:36Z
65487c16cf1fb4ecb7eca25d752cb7178f10232a812021-07-13T15:56:41Z
663d7382f653a9707c3b2d683e7d58b7b9d0b4e7472021-07-13T15:55:05Z
67d155ba8553d2c53580ef66fbf733484dc2857a302021-07-13T15:52:43Z
68af805f6d40beef639fac7bd1c5f7789893c2f6622021-06-15T10:31:53Z
69ee9eefb21f878da18c72bdc04b31d6ec5d8ade062021-06-15T10:26:05Z
70a736a7ce4d98a0fefa3f4c01ab6da4641de742542021-06-11T12:09:09Z
712f6878301950dfea48a8147a6d8bd012be5640632021-06-11T12:09:02Z
72c0510fbaa138ba0029d8aa2949c282fb2dfbf9262021-06-11T10:16:30Z
734871325c65b16d5fc1a0fb406ebf539c3cbcf0542021-05-12T10:36:19Z
74cf8ed2bb499fae21b7ed1544177ef3589a8246b12021-05-12T10:34:43Z
75b9d97ef45c56394528534bd1a2ec7d2d785233542021-04-26T14:25:47Z
768da1eca28db8f6ae0db7e7134d334a10eb96f6a32021-04-21T11:17:00Z
77a2726dfb0cde8a5d567ce9a115d4f254f2af38512021-04-21T11:16:49Z
78410e5b8bd76c20bcde2cd0e92c78713e00160c282021-04-21T10:36:48Z
794240ef4cf17e503c44b177ff0f7b740258ac2e702021-04-12T15:17:44Z
807b746d5303493fb53214ceffbd9b365c8ef4a1262021-04-01T10:04:53Z
8194cfe2ac15af0171f1397984ca0d1968a6931eb32021-03-03T08:53:38Z
82c37267e1ee355c619a3058da5a7135e19e6522de2021-03-03T08:44:58Z
83d531c3422f806634d35d611b0296b6a34815c4e72021-02-26T12:30:20Z
840fa36435691cac5651bee2ffe511f73ccd98c3672021-02-17T15:20:04Z
85a65fcca04e51ba8a9d48d2ee676be1de7ff017462021-02-17T14:31:37Z
863182e0793150de4ffb34da2774991cb24e49a1912021-02-04T13:05:18Z
875df8b7ecec0e249df5485dd622efed8b357585402021-02-04T11:29:37Z
880fe7efb1a6217accb8d53cbb17a633f406459fbb2021-02-03T16:38:34Z
892fdc68adf24ec02e28fba5657c7f31c1373491d22021-02-03T16:11:39Z
90cf7eb4a9781fbdac6dba9ff7b345db2d88748eea2021-02-03T16:10:25Z
916305b6c83d61553d3606905f7b41a0cb99f92f0d2021-01-19T16:57:57Z
9225c04ea1f8bc4a9915417998310a563bdbb481152021-01-19T16:56:43Z
93cee977d8d1ca2d1f03f384e53227d56e5143b6fb2021-01-19T11:56:18Z
9403ded64be9b290287ab0101891ca6f338861c2fe2021-01-19T11:55:23Z
954ec1022d2645f3c2e5018971473710d568078eae2021-01-18T16:31:24Z
967a7d58ec828e8295dd4011f4f460f961a2b4428a2021-01-18T15:04:52Z
9767c8be545d4483ac34f86fdd57f09135a4820be62021-01-18T15:02:25Z
98441eeff3a9fca4934fc9cf54e2e4a914bb43ef102021-01-18T14:51:08Z
99a02389393dbd06e2e936d7d2438ef7e48fb969d92020-12-16T11:39:46Z
\n", + "
" + ], + "text/plain": [ + " sha commit.author.date\n", + "0 38398a75e9762ff070d8e9bd714d074332646cd7 2023-04-17T16:03:52Z\n", + "1 def376dc6955b339b17f0a4b840e80eb6b9c744b 2023-04-17T16:01:01Z\n", + "2 5f2c562056f8ffa89aeea0631f2a52300ee0de17 2023-01-13T10:04:48Z\n", + "3 4b8432c7d56121c84d6ef1d75a1c7185c628c13d 2023-01-12T14:47:00Z\n", + "4 4486f5c623705c6a14d9eeaba7d155cff30cdb43 2023-01-12T14:36:43Z\n", + "5 50b790037030d958b662085c3f4cf34ba72a32ec 2022-12-14T10:02:54Z\n", + "6 36220a1c5c2d6954f4873a552544cc0e55b61d0a 2022-12-14T10:02:28Z\n", + "7 e941759874365cb152a3562c22d10847d10db326 2022-10-14T08:47:37Z\n", + "8 2d38d3313229fdc5f8aa00052a2db21b35be3d2d 2022-10-14T08:46:01Z\n", + "9 1ce31fc9e2805034578eff60a269c02176f03252 2022-10-13T13:06:12Z\n", + "10 2c631a4b61d529ff1c0635750888f6f6d79c2703 2022-10-13T12:58:37Z\n", + "11 12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816 2022-08-25T15:44:07Z\n", + "12 8f80f24d49797595d8a18b8d4d1f59846fbf3fe1 2022-08-25T15:43:12Z\n", + "13 a9680a9e80e2c119d4aa41f90e5f40cef6e7df02 2022-08-21T09:31:21Z\n", + "14 1fd937e6c0ff8452f64152aec9632f0586f981d1 2022-08-21T09:29:01Z\n", + "15 50e92c677f4cd547c32bb9305269a762a9595074 2022-07-23T12:54:16Z\n", + "16 8732e7d4739a911e01f69d2c5bda82ca15ca68a9 2022-07-23T12:53:45Z\n", + "17 536e8833f3be4bbcffbfba43cd0b3043c5bf4068 2022-07-15T08:21:31Z\n", + "18 c90cc62006b35061c8db4b6f8da1b86de7650b23 2022-07-15T08:20:46Z\n", + "19 1a3be9a5d01a414854ff3bfacd5257c14adeefa2 2022-07-14T13:40:17Z\n", + "20 4863b8f0a070d70836dfbdc00bdf70aa7bb66345 2022-07-14T12:31:36Z\n", + "21 72a9e28a52c9629dd63dfad5f215cdc562e2fd7e 2022-07-14T12:28:41Z\n", + "22 dddc6afc29de895f4131693e17b3cc856ff23f8d 2022-07-14T12:27:25Z\n", + "23 d1dd2ac5e2b6a8abef9e33ed397807ece52a22fe 2022-04-20T08:39:00Z\n", + "24 4d33035dd296826bbde200e17e5281910ac8be13 2022-04-20T08:37:44Z\n", + "25 4a13df461c42f970a099de77377f309995e7995c 2022-04-19T08:30:24Z\n", + "26 5e1d9764c8e96749bc11d52807eea1f7cc38ce5c 2022-04-14T11:01:59Z\n", + "27 ecd63776c6225af0cf8bcc9fa9c6998d3129fb14 2022-04-14T11:00:42Z\n", + "28 cada41a6bfac5a8bf88ed2107a0b856b9b9785a0 2022-03-15T14:27:59Z\n", + "29 db33aee5dc0b44251b64ff4c2e2f05d59e4e3a2d 2022-03-15T14:27:31Z\n", + "30 207cdce7fef5dae54fdad4cc4f933ebd1ab13c5e 2022-02-10T15:36:59Z\n", + "31 8ab272288e81fa9a49fd4765579b066c2c03cc10 2022-02-10T15:26:19Z\n", + "32 d4c410cd91618a8a74a7763598e499f6a8aa168b 2022-01-14T15:25:23Z\n", + "33 db46d2a0209720c5fa0ab8b03403f7009f69d2d6 2022-01-14T12:27:52Z\n", + "34 ebb3d8c6aa383567bb7794cacf745b1c82efbb7f 2022-01-14T12:27:35Z\n", + "35 fc712c5a4dd00f534e845982a29f46a14e22b292 2022-01-14T08:05:58Z\n", + "36 239bf68f403bbdbabb43fa0277040f4dda2b2aff 2022-01-13T20:33:38Z\n", + "37 d86bdd9983424c792691269fa10e6f022ccf2191 2022-01-13T16:53:03Z\n", + "38 06ceff14b2db920d458dc337b1100dced992e627 2022-01-13T16:52:15Z\n", + "39 3fb27698fe4ebbd5ec4a0e565d97a53fe48c7cdd 2022-01-04T09:19:02Z\n", + "40 0a051a92e0baab68b4c77ad13b8cb4e7eaa2f810 2022-01-04T09:18:33Z\n", + "41 d76ca7d75891aac95205b2cc11510ce8ef41a65a 2021-11-11T11:49:26Z\n", + "42 178c68cca2044cbd5032cbff23e4c06b738aed2a 2021-11-11T11:48:57Z\n", + "43 2d19daf42ff47c2f2c4e7970dd77ed32768eb38f 2021-10-26T11:32:30Z\n", + "44 3f46491c0fe377fd922412c93137efb6a4879b05 2021-10-26T11:15:21Z\n", + "45 8dd2a04b1d7e559f5124d012f0a729d0180216d2 2021-10-11T15:33:46Z\n", + "46 ddda708da9f2f41ae7e7d122973c2eddb141d949 2021-10-11T15:29:53Z\n", + "47 8d77b3dd93959663d58ae5b626289d0746edd0e7 2021-10-11T15:13:57Z\n", + "48 d53cfe39089bd2ce40cf0fa06167d0ae38e3e4ea 
2021-10-11T14:48:05Z\n", + "49 389b55b1c3ef58ea1136fb0c9e6d2ce73038d655 2021-10-11T13:16:09Z\n", + "50 5088650c144e6a3ee05ac1e015d487a86095e665 2021-10-11T12:53:06Z\n", + "51 7dc4672795eaea35bd8c7e1fa2ec40cf948fc2fd 2021-09-17T15:49:56Z\n", + "52 352813601073bcdc3d5eb08d1be07be904276a9a 2021-09-17T15:49:21Z\n", + "53 6c613ce701353f42b6c911801f0aeb78413fac9d 2021-08-20T15:25:29Z\n", + "54 8f892b2675dd6464f526d32f1ad1d3fd62fa3b03 2021-08-20T14:12:35Z\n", + "55 a231c3b065a7b9c86c92915f3b4bcf012ec36175 2021-07-20T11:13:25Z\n", + "56 1a3007e433ed7ccffc4a2c7c36ed8013d6105d4d 2021-07-20T10:34:46Z\n", + "57 4a8d93e8607e2413edee71da68cade6e11249fea 2021-07-19T11:27:37Z\n", + "58 d3fd19b04d4256600e8c90a27a2b317d61514c86 2021-07-19T11:23:30Z\n", + "59 fd63f824a11eead1f7fb39457251c7ac27ad1f93 2021-07-15T08:07:22Z\n", + "60 89b57278869c7b46fbbaf94825ed336328f3567a 2021-07-15T07:59:10Z\n", + "61 6fdf7add8bb4f416bd8660e6b826d80e8759dfe1 2021-07-14T06:19:03Z\n", + "62 b57de92d58da2bf787e5af34ab09e0e8ee6cea6c 2021-07-13T16:04:39Z\n", + "63 4cdd02827d4b0d2d8834e45baaea621663e0d632 2021-07-13T16:03:03Z\n", + "64 e6a3f0cc5710495475e7b959ee82c649978843f8 2021-07-13T16:01:36Z\n", + "65 487c16cf1fb4ecb7eca25d752cb7178f10232a81 2021-07-13T15:56:41Z\n", + "66 3d7382f653a9707c3b2d683e7d58b7b9d0b4e747 2021-07-13T15:55:05Z\n", + "67 d155ba8553d2c53580ef66fbf733484dc2857a30 2021-07-13T15:52:43Z\n", + "68 af805f6d40beef639fac7bd1c5f7789893c2f662 2021-06-15T10:31:53Z\n", + "69 ee9eefb21f878da18c72bdc04b31d6ec5d8ade06 2021-06-15T10:26:05Z\n", + "70 a736a7ce4d98a0fefa3f4c01ab6da4641de74254 2021-06-11T12:09:09Z\n", + "71 2f6878301950dfea48a8147a6d8bd012be564063 2021-06-11T12:09:02Z\n", + "72 c0510fbaa138ba0029d8aa2949c282fb2dfbf926 2021-06-11T10:16:30Z\n", + "73 4871325c65b16d5fc1a0fb406ebf539c3cbcf054 2021-05-12T10:36:19Z\n", + "74 cf8ed2bb499fae21b7ed1544177ef3589a8246b1 2021-05-12T10:34:43Z\n", + "75 b9d97ef45c56394528534bd1a2ec7d2d78523354 2021-04-26T14:25:47Z\n", + "76 8da1eca28db8f6ae0db7e7134d334a10eb96f6a3 2021-04-21T11:17:00Z\n", + "77 a2726dfb0cde8a5d567ce9a115d4f254f2af3851 2021-04-21T11:16:49Z\n", + "78 410e5b8bd76c20bcde2cd0e92c78713e00160c28 2021-04-21T10:36:48Z\n", + "79 4240ef4cf17e503c44b177ff0f7b740258ac2e70 2021-04-12T15:17:44Z\n", + "80 7b746d5303493fb53214ceffbd9b365c8ef4a126 2021-04-01T10:04:53Z\n", + "81 94cfe2ac15af0171f1397984ca0d1968a6931eb3 2021-03-03T08:53:38Z\n", + "82 c37267e1ee355c619a3058da5a7135e19e6522de 2021-03-03T08:44:58Z\n", + "83 d531c3422f806634d35d611b0296b6a34815c4e7 2021-02-26T12:30:20Z\n", + "84 0fa36435691cac5651bee2ffe511f73ccd98c367 2021-02-17T15:20:04Z\n", + "85 a65fcca04e51ba8a9d48d2ee676be1de7ff01746 2021-02-17T14:31:37Z\n", + "86 3182e0793150de4ffb34da2774991cb24e49a191 2021-02-04T13:05:18Z\n", + "87 5df8b7ecec0e249df5485dd622efed8b35758540 2021-02-04T11:29:37Z\n", + "88 0fe7efb1a6217accb8d53cbb17a633f406459fbb 2021-02-03T16:38:34Z\n", + "89 2fdc68adf24ec02e28fba5657c7f31c1373491d2 2021-02-03T16:11:39Z\n", + "90 cf7eb4a9781fbdac6dba9ff7b345db2d88748eea 2021-02-03T16:10:25Z\n", + "91 6305b6c83d61553d3606905f7b41a0cb99f92f0d 2021-01-19T16:57:57Z\n", + "92 25c04ea1f8bc4a9915417998310a563bdbb48115 2021-01-19T16:56:43Z\n", + "93 cee977d8d1ca2d1f03f384e53227d56e5143b6fb 2021-01-19T11:56:18Z\n", + "94 03ded64be9b290287ab0101891ca6f338861c2fe 2021-01-19T11:55:23Z\n", + "95 4ec1022d2645f3c2e5018971473710d568078eae 2021-01-18T16:31:24Z\n", + "96 7a7d58ec828e8295dd4011f4f460f961a2b4428a 2021-01-18T15:04:52Z\n", + "97 67c8be545d4483ac34f86fdd57f09135a4820be6 2021-01-18T15:02:25Z\n", + 
"98 441eeff3a9fca4934fc9cf54e2e4a914bb43ef10 2021-01-18T14:51:08Z\n", + "99 a02389393dbd06e2e936d7d2438ef7e48fb969d9 2020-12-16T11:39:46Z" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/0.1-process-updated-releases.ipynb b/notebooks/0.1-process-updated-releases.ipynb new file mode 100644 index 00000000..64e9ebff --- /dev/null +++ b/notebooks/0.1-process-updated-releases.ipynb @@ -0,0 +1,1422 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GitHub API EDA\n", + "\n", + "0.1 - Add schema validation for app state\n", + "\n", + "Application state variables:\n", + "```json\n", + "{\n", + " \"current_release\": \"3510\",\n", + " \"last_processed_commit\": {\n", + " \"sha\": \"5f2c562056f8ffa89aeea0631f2a52300ee0de17\",\n", + " \"date\": \"2023-01-13T10:04:48Z\"\n", + " },\n", + " \"tracked_assets\": [\n", + " \"hla.dat\",\n", + " \"msf/\"\n", + " ]\n", + "}\n", + "```\n", + "\n", + "## Strategy\n", + "### 1. App State\n", + "* Fetch app state from S3\n", + "* Update app state at end of execution\n", + "### 2. Asset Processing\n", + "* For each tracked asset:\n", + " * Fetch the commits for the asset\n", + " * Filter by the last processed commit date\n", + "* Merge the commits for each asset into a single list → array of commits\n", + "* If array is empty (no commits found), exit\n", + "* If array is not empty (commits are found)\n", + " * Get the release version for each commit ← needs strategy\n", + " * Build the release" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "sys.path.append('/Users/ammon/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/gfe-db/pipeline/functions/get_repo_updates/')\n", + "import logging\n", + "from datetime import datetime\n", + "utc_now = datetime.utcnow().strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n", + "import re\n", + "import json\n", + "import boto3\n", + "import pandas as pd\n", + "from src.utils.types import (\n", + " ExecutionHistoryItem\n", + ")\n", + "from src.utils import (\n", + " read_source_config,\n", + " write_source_config,\n", + " get_commits_for_asset,\n", + " get_repo_asset,\n", + " get_commits,\n", + " flatten_json,\n", + " merge_release_version_with_commit\n", + ")\n", + "\n", + "# logging\n", + "logging.basicConfig(level=logging.INFO)\n", + "logger = logging.getLogger(__name__)\n", + "\n", + "# Pandas display options\n", + "pd.set_option('display.max_rows', None)\n", + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.max_colwidth', None)\n", + "pd.set_option('display.width', None)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "GITHUB_REPOSITORY_OWNER = \"ANHIG\" # os.environ[\"GITHUB_REPOSITORY_OWNER\"]\n", + "GITHUB_REPOSITORY_NAME = \"IMGTHLA\" # 
os.environ[\"GITHUB_REPOSITORY_NAME\"]\n", + "AWS_REGION = os.environ[\"AWS_REGION\"]\n", + "DATA_BUCKET_NAME = os.environ[\"DATA_BUCKET_NAME\"]\n", + "PIPELINE_CONFIG_S3_PATH = os.environ[\"PIPELINE_CONFIG_S3_PATH\"]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### App State" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# source config file in S3 must be up to date \n", + "source_config = read_source_config(DATA_BUCKET_NAME, PIPELINE_CONFIG_S3_PATH)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Asset Processing" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RepositoryConfig(owner='ANHIG', name='IMGTHLA', url='https://github.com/ANHIG/IMGTHLA', tracked_assets=['hla.dat', 'msf/'], default_input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), execution_history=[ExecutionHistoryItem(version=3480, date_utc='2022-04-14T11:00:42Z', commit=CommitDetails(sha='ecd63776c6225af0cf8bcc9fa9c6998d3129fb14', date_utc='2022-04-14T11:00:42Z', url='url'), input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), status='SUCCESS'), ExecutionHistoryItem(version=3470, date_utc='2022-01-13T16:52:15Z', commit=CommitDetails(sha='06ceff14b2db920d458dc337b1100dced992e627', date_utc='2022-01-13T16:52:15Z', url='url'), input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), status='SUCCESS')])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "source_repo = source_config.repositories[GITHUB_REPOSITORY_OWNER + \"/\" + GITHUB_REPOSITORY_NAME]\n", + "source_repo" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ExecutionHistoryItem(version=3520, date_utc='2023-04-24T21:34:36Z', commit=CommitDetails(sha='def376dc6955b339b17f0a4b840e80eb6b9c744b', date_utc='2023-04-17T16:01:01Z', url=''), input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), status='PENDING')" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get most recent item in source_repo.execution_history by source_repo.execution_history[].date_utc \n", + "last_processed_commit = max(source_repo.execution_history, key=lambda x: x.date_utc)\n", + "last_processed_commit" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "asset_commits = []\n", + "for asset in source_repo.tracked_assets:\n", + " commits = get_commits_for_asset(\n", + " owner=GITHUB_REPOSITORY_OWNER,\n", + " repo=GITHUB_REPOSITORY_NAME,\n", + " path=asset,\n", + " since=last_processed_commit.date_utc\n", + " )\n", + "\n", + " if not commits:\n", + " logger.warning(f\"No commits found for asset {asset} with sha {last_processed_commit.commit_sha} since {last_processed_commit.date}\")\n", + " else: \n", + " asset_commits.extend(commits)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'sha': 'def376dc6955b339b17f0a4b840e80eb6b9c744b',\n", + " 'node_id': 'C_kwDOAXZz6NoAKGRlZjM3NmRjNjk1NWIzMzliMTdmMGE0Yjg0MGU4MGViNmI5Yzc0NGI',\n", + " 'commit': {'author': {'name': 
'xeniageorgiouAN',\n", + " 'email': 'xenia.georgiou@anthonynolan.org',\n", + " 'date': '2023-04-17T16:01:01Z'},\n", + " 'committer': {'name': 'xeniageorgiouAN',\n", + " 'email': 'xenia.georgiou@anthonynolan.org',\n", + " 'date': '2023-04-17T16:01:01Z'},\n", + " 'message': 'IPD-IMGT/HLA Release 3.52.0',\n", + " 'tree': {'sha': 'e9ffca9666e355b1285a0c6a42951f6a28ea7f90',\n", + " 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/git/trees/e9ffca9666e355b1285a0c6a42951f6a28ea7f90'},\n", + " 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/git/commits/def376dc6955b339b17f0a4b840e80eb6b9c744b',\n", + " 'comment_count': 0,\n", + " 'verification': {'verified': False,\n", + " 'reason': 'unsigned',\n", + " 'signature': None,\n", + " 'payload': None}},\n", + " 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/def376dc6955b339b17f0a4b840e80eb6b9c744b',\n", + " 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/def376dc6955b339b17f0a4b840e80eb6b9c744b',\n", + " 'comments_url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/def376dc6955b339b17f0a4b840e80eb6b9c744b/comments',\n", + " 'author': {'login': 'xeniageorgiouAN',\n", + " 'id': 55495460,\n", + " 'node_id': 'MDQ6VXNlcjU1NDk1NDYw',\n", + " 'avatar_url': 'https://avatars.githubusercontent.com/u/55495460?v=4',\n", + " 'gravatar_id': '',\n", + " 'url': 'https://api.github.com/users/xeniageorgiouAN',\n", + " 'html_url': 'https://github.com/xeniageorgiouAN',\n", + " 'followers_url': 'https://api.github.com/users/xeniageorgiouAN/followers',\n", + " 'following_url': 'https://api.github.com/users/xeniageorgiouAN/following{/other_user}',\n", + " 'gists_url': 'https://api.github.com/users/xeniageorgiouAN/gists{/gist_id}',\n", + " 'starred_url': 'https://api.github.com/users/xeniageorgiouAN/starred{/owner}{/repo}',\n", + " 'subscriptions_url': 'https://api.github.com/users/xeniageorgiouAN/subscriptions',\n", + " 'organizations_url': 'https://api.github.com/users/xeniageorgiouAN/orgs',\n", + " 'repos_url': 'https://api.github.com/users/xeniageorgiouAN/repos',\n", + " 'events_url': 'https://api.github.com/users/xeniageorgiouAN/events{/privacy}',\n", + " 'received_events_url': 'https://api.github.com/users/xeniageorgiouAN/received_events',\n", + " 'type': 'User',\n", + " 'site_admin': False},\n", + " 'committer': {'login': 'xeniageorgiouAN',\n", + " 'id': 55495460,\n", + " 'node_id': 'MDQ6VXNlcjU1NDk1NDYw',\n", + " 'avatar_url': 'https://avatars.githubusercontent.com/u/55495460?v=4',\n", + " 'gravatar_id': '',\n", + " 'url': 'https://api.github.com/users/xeniageorgiouAN',\n", + " 'html_url': 'https://github.com/xeniageorgiouAN',\n", + " 'followers_url': 'https://api.github.com/users/xeniageorgiouAN/followers',\n", + " 'following_url': 'https://api.github.com/users/xeniageorgiouAN/following{/other_user}',\n", + " 'gists_url': 'https://api.github.com/users/xeniageorgiouAN/gists{/gist_id}',\n", + " 'starred_url': 'https://api.github.com/users/xeniageorgiouAN/starred{/owner}{/repo}',\n", + " 'subscriptions_url': 'https://api.github.com/users/xeniageorgiouAN/subscriptions',\n", + " 'organizations_url': 'https://api.github.com/users/xeniageorgiouAN/orgs',\n", + " 'repos_url': 'https://api.github.com/users/xeniageorgiouAN/repos',\n", + " 'events_url': 'https://api.github.com/users/xeniageorgiouAN/events{/privacy}',\n", + " 'received_events_url': 'https://api.github.com/users/xeniageorgiouAN/received_events',\n", + " 'type': 'User',\n", + " 'site_admin': False},\n", + " 'parents': [{'sha': '5f2c562056f8ffa89aeea0631f2a52300ee0de17',\n", + " 'url': 
'https://api.github.com/repos/ANHIG/IMGTHLA/commits/5f2c562056f8ffa89aeea0631f2a52300ee0de17',\n", + " 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/5f2c562056f8ffa89aeea0631f2a52300ee0de17'}]}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "asset_commits[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Filter out commits before last processed commit\n", + "unique_shas = list(set([(item[\"sha\"], item[\"commit\"][\"author\"][\"date\"]) for item in asset_commits \\\n", + " if datetime.strptime(item[\"commit\"][\"author\"][\"date\"], \"%Y-%m-%dT%H:%M:%SZ\") > datetime.strptime(last_processed_commit.date_utc, \"%Y-%m-%dT%H:%M:%SZ\")]))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('def376dc6955b339b17f0a4b840e80eb6b9c744b', '2023-04-17T16:01:01Z'),\n", + " ('4486f5c623705c6a14d9eeaba7d155cff30cdb43', '2023-01-12T14:36:43Z'),\n", + " ('2d38d3313229fdc5f8aa00052a2db21b35be3d2d', '2022-10-14T08:46:01Z'),\n", + " ('2c631a4b61d529ff1c0635750888f6f6d79c2703', '2022-10-13T12:58:37Z'),\n", + " ('8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', '2022-08-25T15:43:12Z'),\n", + " ('1a3be9a5d01a414854ff3bfacd5257c14adeefa2', '2022-07-14T13:40:17Z'),\n", + " ('72a9e28a52c9629dd63dfad5f215cdc562e2fd7e', '2022-07-14T12:28:41Z')]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unique_shas.sort(key=lambda x: datetime.strptime(x[1], \"%Y-%m-%dT%H:%M:%SZ\"), reverse=True)\n", + "unique_shas" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Getting release version for sha def376dc6955b339b17f0a4b840e80eb6b9c744b and date 2023-04-17T16:01:01Z\n", + "INFO:__main__:Getting release version for sha 4486f5c623705c6a14d9eeaba7d155cff30cdb43 and date 2023-01-12T14:36:43Z\n", + "INFO:__main__:Getting release version for sha 2d38d3313229fdc5f8aa00052a2db21b35be3d2d and date 2022-10-14T08:46:01Z\n", + "INFO:__main__:Getting release version for sha 2c631a4b61d529ff1c0635750888f6f6d79c2703 and date 2022-10-13T12:58:37Z\n", + "INFO:__main__:Getting release version for sha 8f80f24d49797595d8a18b8d4d1f59846fbf3fe1 and date 2022-08-25T15:43:12Z\n", + "INFO:__main__:Getting release version for sha 1a3be9a5d01a414854ff3bfacd5257c14adeefa2 and date 2022-07-14T13:40:17Z\n", + "INFO:__main__:Getting release version for sha 72a9e28a52c9629dd63dfad5f215cdc562e2fd7e and date 2022-07-14T12:28:41Z\n" + ] + } + ], + "source": [ + "# get the releases for each unique commit from Allelelist.txt\n", + "# can produce duplicate release versions if the same release is updated more than once\n", + "# makes the assumption that the release version branch is up to date for that release, since the build process targets the release version branch and not the specific commit sha\n", + "release_version_re = r\"# version: IPD-IMGT/HLA (\\d+\\.\\d+\\.\\d+)\"\n", + "release_versions = []\n", + "release_versions_dicts = []\n", + "for sha, date in unique_shas:\n", + " logger.info(f\"Getting release version for sha {sha} and date {date}\")\n", + " allele_list = get_repo_asset(\n", + " GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME, \"Allelelist.txt\", sha\n", + " )\n", + " release_version = int(\n", + " re.search(release_version_re, 
allele_list).group(1).replace(\".\", \"\")\n", + " )\n", + "\n", + " # TODO use a dict instead of tuple and append directly to source_config\n", + " release_versions.append((release_version, sha, date))\n", + " release_versions_dicts.append({\n", + " release_version, \n", + " sha, \n", + " date\n", + " })\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Release versions by sha:\n", + "[[3520, \"def376dc6955b339b17f0a4b840e80eb6b9c744b\", \"2023-04-17T16:01:01Z\"], [3510, \"4486f5c623705c6a14d9eeaba7d155cff30cdb43\", \"2023-01-12T14:36:43Z\"], [3500, \"2d38d3313229fdc5f8aa00052a2db21b35be3d2d\", \"2022-10-14T08:46:01Z\"], [3500, \"2c631a4b61d529ff1c0635750888f6f6d79c2703\", \"2022-10-13T12:58:37Z\"], [3490, \"8f80f24d49797595d8a18b8d4d1f59846fbf3fe1\", \"2022-08-25T15:43:12Z\"], [3490, \"1a3be9a5d01a414854ff3bfacd5257c14adeefa2\", \"2022-07-14T13:40:17Z\"], [3480, \"72a9e28a52c9629dd63dfad5f215cdc562e2fd7e\", \"2022-07-14T12:28:41Z\"]]\n" + ] + } + ], + "source": [ + "# write this to source config file (must be sorted by data descending), but the status should be labeled firsts\n", + "release_versions.sort(key=lambda x: x[2], reverse=True)\n", + "logger.info(f\"Release versions by sha:\\n{json.dumps(release_versions)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Found unique releases [[3480, \"72a9e28a52c9629dd63dfad5f215cdc562e2fd7e\"], [3490, \"1a3be9a5d01a414854ff3bfacd5257c14adeefa2\"], [3490, \"8f80f24d49797595d8a18b8d4d1f59846fbf3fe1\"], [3500, \"2c631a4b61d529ff1c0635750888f6f6d79c2703\"], [3500, \"2d38d3313229fdc5f8aa00052a2db21b35be3d2d\"], [3510, \"4486f5c623705c6a14d9eeaba7d155cff30cdb43\"], [3520, \"def376dc6955b339b17f0a4b840e80eb6b9c744b\"]]\n" + ] + } + ], + "source": [ + "# send this array to state machine\n", + "unique_release_versions = list(set([(version[0], version[1]) for version in release_versions]))\n", + "unique_release_versions.sort(reverse=False)\n", + "logger.info(f\"Found unique releases {json.dumps(unique_release_versions)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "# take the most recent release version and get the commit sha for that release version\n", + "# this is the commit sha that will be used to build the release version\n", + "commits_pending_build = []\n", + "for version, sha in unique_release_versions:\n", + " items_for_version = [item for item in release_versions if item[0] == version]\n", + " most_recent_item = max(items_for_version, key=lambda x: x[2])\n", + " commits_pending_build.append(most_recent_item)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(3480, '72a9e28a52c9629dd63dfad5f215cdc562e2fd7e', '2022-07-14T12:28:41Z'),\n", + " (3490, '8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', '2022-08-25T15:43:12Z'),\n", + " (3490, '8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', '2022-08-25T15:43:12Z'),\n", + " (3500, '2d38d3313229fdc5f8aa00052a2db21b35be3d2d', '2022-10-14T08:46:01Z'),\n", + " (3500, '2d38d3313229fdc5f8aa00052a2db21b35be3d2d', '2022-10-14T08:46:01Z'),\n", + " (3510, '4486f5c623705c6a14d9eeaba7d155cff30cdb43', '2023-01-12T14:36:43Z'),\n", + " (3520, 'def376dc6955b339b17f0a4b840e80eb6b9c744b', '2023-04-17T16:01:01Z')]" + ] + }, + 
"execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commits_pending_build" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(3480, '72a9e28a52c9629dd63dfad5f215cdc562e2fd7e', '2022-07-14T12:28:41Z'),\n", + " (3490, '8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', '2022-08-25T15:43:12Z'),\n", + " (3490, '8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', '2022-08-25T15:43:12Z'),\n", + " (3500, '2d38d3313229fdc5f8aa00052a2db21b35be3d2d', '2022-10-14T08:46:01Z'),\n", + " (3500, '2d38d3313229fdc5f8aa00052a2db21b35be3d2d', '2022-10-14T08:46:01Z'),\n", + " (3510, '4486f5c623705c6a14d9eeaba7d155cff30cdb43', '2023-01-12T14:36:43Z'),\n", + " (3520, 'def376dc6955b339b17f0a4b840e80eb6b9c744b', '2023-04-17T16:01:01Z')]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commits_pending_build" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "# the commits that aren't in commits_pending are labeled as skipped\n", + "commits_skipped = set(release_versions).difference(set(commits_pending_build))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{(3490, '1a3be9a5d01a414854ff3bfacd5257c14adeefa2', '2022-07-14T13:40:17Z'),\n", + " (3500, '2c631a4b61d529ff1c0635750888f6f6d79c2703', '2022-10-13T12:58:37Z')}" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commits_skipped" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "execution_history_pending = [\n", + " ExecutionHistoryItem(\n", + " **{\n", + " \"version\": item[0],\n", + " \"date_utc\": utc_now, # TODO switch to Lambda context variable\n", + " \"commit\": {\n", + " \"sha\": item[1],\n", + " \"date_utc\": item[2],\n", + " \"url\": \"\"\n", + " },\n", + " \"status\": \"PENDING\",\n", + " \"input_parameters\": source_repo.default_input_parameters.dict(),\n", + " }\n", + " )\n", + " for item in commits_pending_build\n", + "]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "execution_history_skipped = [\n", + " ExecutionHistoryItem(\n", + " **{\n", + " \"version\": item[0],\n", + " \"date_utc\": utc_now, # TODO switch to Lambda context variable\n", + " \"commit\": {\n", + " \"sha\": item[1],\n", + " \"date_utc\": item[2],\n", + " \"url\": \"\"\n", + " },\n", + " \"status\": \"SKIPPED\"\n", + " }\n", + " )\n", + " for item in commits_skipped\n", + "]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "execution_history = execution_history_pending + execution_history_skipped" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[ExecutionHistoryItem(version=3520, date_utc='2023-04-24T21:34:36Z', commit=CommitDetails(sha='def376dc6955b339b17f0a4b840e80eb6b9c744b', date_utc='2023-04-17T16:01:01Z', url=''), input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), status='PENDING'),\n", + " ExecutionHistoryItem(version=3510, date_utc='2023-04-24T21:34:36Z', commit=CommitDetails(sha='4486f5c623705c6a14d9eeaba7d155cff30cdb43', date_utc='2023-01-12T14:36:43Z', url=''), 
input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), status='PENDING'),\n", + " ExecutionHistoryItem(version=3500, date_utc='2023-04-24T21:34:36Z', commit=CommitDetails(sha='2d38d3313229fdc5f8aa00052a2db21b35be3d2d', date_utc='2022-10-14T08:46:01Z', url=''), input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), status='PENDING'),\n", + " ExecutionHistoryItem(version=3500, date_utc='2023-04-24T21:34:36Z', commit=CommitDetails(sha='2c631a4b61d529ff1c0635750888f6f6d79c2703', date_utc='2022-10-13T12:58:37Z', url=''), input_parameters=None, status='SKIPPED'),\n", + " ExecutionHistoryItem(version=3490, date_utc='2023-04-24T21:34:36Z', commit=CommitDetails(sha='8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', date_utc='2022-08-25T15:43:12Z', url=''), input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), status='PENDING'),\n", + " ExecutionHistoryItem(version=3490, date_utc='2023-04-24T21:34:36Z', commit=CommitDetails(sha='1a3be9a5d01a414854ff3bfacd5257c14adeefa2', date_utc='2022-07-14T13:40:17Z', url=''), input_parameters=None, status='SKIPPED'),\n", + " ExecutionHistoryItem(version=3480, date_utc='2023-04-24T21:34:36Z', commit=CommitDetails(sha='72a9e28a52c9629dd63dfad5f215cdc562e2fd7e', date_utc='2022-07-14T12:28:41Z', url=''), input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), status='PENDING')]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# sort by commit date descending\n", + "execution_history.sort(key=lambda x: x.commit.date_utc, reverse=True)\n", + "execution_history" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# warning: extend method is not idempotent, so if you run this cell more than once, you will get duplicate commits\n", + "source_config.repositories[GITHUB_REPOSITORY_OWNER + \"/\" + GITHUB_REPOSITORY_NAME].execution_history.extend(execution_history)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# sort by execution date descending\n", + "source_config.repositories[GITHUB_REPOSITORY_OWNER + \"/\" + GITHUB_REPOSITORY_NAME].execution_history.sort(key=lambda x: x.date_utc, reverse=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# print(json.dumps([item.dict() for item in source_config.repositories[GITHUB_REPOSITORY_OWNER + \"/\" + GITHUB_REPOSITORY_NAME].execution_history], indent=4))" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(3480, '72a9e28a52c9629dd63dfad5f215cdc562e2fd7e'),\n", + " (3490, '1a3be9a5d01a414854ff3bfacd5257c14adeefa2'),\n", + " (3490, '8f80f24d49797595d8a18b8d4d1f59846fbf3fe1'),\n", + " (3500, '2c631a4b61d529ff1c0635750888f6f6d79c2703'),\n", + " (3500, '2d38d3313229fdc5f8aa00052a2db21b35be3d2d'),\n", + " (3510, '4486f5c623705c6a14d9eeaba7d155cff30cdb43'),\n", + " (3520, 'def376dc6955b339b17f0a4b840e80eb6b9c744b')]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# send these to the state machine\n", + "unique_release_versions" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# write to json file\n", + "with open(\"source-config.json\", 
\"w\") as f:\n", + " json.dump(source_config.dict(), f, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(3490, '8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', '2022-08-25T15:43:12Z'),\n", + " (3500, '2c631a4b61d529ff1c0635750888f6f6d79c2703', '2022-10-13T12:58:37Z'),\n", + " (3480, '72a9e28a52c9629dd63dfad5f215cdc562e2fd7e', '2022-07-14T12:28:41Z'),\n", + " (3490, '1a3be9a5d01a414854ff3bfacd5257c14adeefa2', '2022-07-14T13:40:17Z'),\n", + " (3510, '4486f5c623705c6a14d9eeaba7d155cff30cdb43', '2023-01-12T14:36:43Z'),\n", + " (3520, 'def376dc6955b339b17f0a4b840e80eb6b9c744b', '2023-04-17T16:01:01Z'),\n", + " (3500, '2d38d3313229fdc5f8aa00052a2db21b35be3d2d', '2022-10-14T08:46:01Z')]" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Testing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "release_version_re = r\"# version: IPD-IMGT/HLA (\\d+\\.\\d+\\.\\d+)\"\n", + "allele_list = get_repo_asset(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME, \"Allelelist.txt\", '06ceff14b2db920d458dc337b1100dced992e627')\n", + "release_version = int(re.search(release_version_re, allele_list).group(1).replace(\".\", \"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3470" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "release_version" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "# sort commits by most recent date on [].commit.author.date\n", + "asset_commits.sort(key=lambda x: x[\"commit\"][\"author\"][\"date\"], reverse=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "all_commits = get_commits(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "select_keys = [\"sha\", \"commit\"]\n", + "\n", + "# filter by select_keys\n", + "all_commits = [{k: v for k, v in x.items() if k in select_keys} for x in all_commits]" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame([flatten_json(commit) for commit in all_commits])[[\"sha\", \"commit.author.date\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
shacommit.author.date
038398a75e9762ff070d8e9bd714d074332646cd72023-04-17T16:03:52Z
1def376dc6955b339b17f0a4b840e80eb6b9c744b2023-04-17T16:01:01Z
25f2c562056f8ffa89aeea0631f2a52300ee0de172023-01-13T10:04:48Z
34b8432c7d56121c84d6ef1d75a1c7185c628c13d2023-01-12T14:47:00Z
44486f5c623705c6a14d9eeaba7d155cff30cdb432023-01-12T14:36:43Z
550b790037030d958b662085c3f4cf34ba72a32ec2022-12-14T10:02:54Z
636220a1c5c2d6954f4873a552544cc0e55b61d0a2022-12-14T10:02:28Z
7e941759874365cb152a3562c22d10847d10db3262022-10-14T08:47:37Z
82d38d3313229fdc5f8aa00052a2db21b35be3d2d2022-10-14T08:46:01Z
91ce31fc9e2805034578eff60a269c02176f032522022-10-13T13:06:12Z
102c631a4b61d529ff1c0635750888f6f6d79c27032022-10-13T12:58:37Z
1112b4b94bb49b4640fa48a1182cc9b1e1fbf7e8162022-08-25T15:44:07Z
128f80f24d49797595d8a18b8d4d1f59846fbf3fe12022-08-25T15:43:12Z
13a9680a9e80e2c119d4aa41f90e5f40cef6e7df022022-08-21T09:31:21Z
141fd937e6c0ff8452f64152aec9632f0586f981d12022-08-21T09:29:01Z
1550e92c677f4cd547c32bb9305269a762a95950742022-07-23T12:54:16Z
168732e7d4739a911e01f69d2c5bda82ca15ca68a92022-07-23T12:53:45Z
17536e8833f3be4bbcffbfba43cd0b3043c5bf40682022-07-15T08:21:31Z
18c90cc62006b35061c8db4b6f8da1b86de7650b232022-07-15T08:20:46Z
191a3be9a5d01a414854ff3bfacd5257c14adeefa22022-07-14T13:40:17Z
204863b8f0a070d70836dfbdc00bdf70aa7bb663452022-07-14T12:31:36Z
2172a9e28a52c9629dd63dfad5f215cdc562e2fd7e2022-07-14T12:28:41Z
22dddc6afc29de895f4131693e17b3cc856ff23f8d2022-07-14T12:27:25Z
23d1dd2ac5e2b6a8abef9e33ed397807ece52a22fe2022-04-20T08:39:00Z
244d33035dd296826bbde200e17e5281910ac8be132022-04-20T08:37:44Z
254a13df461c42f970a099de77377f309995e7995c2022-04-19T08:30:24Z
265e1d9764c8e96749bc11d52807eea1f7cc38ce5c2022-04-14T11:01:59Z
27ecd63776c6225af0cf8bcc9fa9c6998d3129fb142022-04-14T11:00:42Z
28cada41a6bfac5a8bf88ed2107a0b856b9b9785a02022-03-15T14:27:59Z
29db33aee5dc0b44251b64ff4c2e2f05d59e4e3a2d2022-03-15T14:27:31Z
30207cdce7fef5dae54fdad4cc4f933ebd1ab13c5e2022-02-10T15:36:59Z
318ab272288e81fa9a49fd4765579b066c2c03cc102022-02-10T15:26:19Z
32d4c410cd91618a8a74a7763598e499f6a8aa168b2022-01-14T15:25:23Z
33db46d2a0209720c5fa0ab8b03403f7009f69d2d62022-01-14T12:27:52Z
34ebb3d8c6aa383567bb7794cacf745b1c82efbb7f2022-01-14T12:27:35Z
35fc712c5a4dd00f534e845982a29f46a14e22b2922022-01-14T08:05:58Z
36239bf68f403bbdbabb43fa0277040f4dda2b2aff2022-01-13T20:33:38Z
37d86bdd9983424c792691269fa10e6f022ccf21912022-01-13T16:53:03Z
3806ceff14b2db920d458dc337b1100dced992e6272022-01-13T16:52:15Z
393fb27698fe4ebbd5ec4a0e565d97a53fe48c7cdd2022-01-04T09:19:02Z
400a051a92e0baab68b4c77ad13b8cb4e7eaa2f8102022-01-04T09:18:33Z
41d76ca7d75891aac95205b2cc11510ce8ef41a65a2021-11-11T11:49:26Z
42178c68cca2044cbd5032cbff23e4c06b738aed2a2021-11-11T11:48:57Z
432d19daf42ff47c2f2c4e7970dd77ed32768eb38f2021-10-26T11:32:30Z
443f46491c0fe377fd922412c93137efb6a4879b052021-10-26T11:15:21Z
458dd2a04b1d7e559f5124d012f0a729d0180216d22021-10-11T15:33:46Z
46ddda708da9f2f41ae7e7d122973c2eddb141d9492021-10-11T15:29:53Z
478d77b3dd93959663d58ae5b626289d0746edd0e72021-10-11T15:13:57Z
48d53cfe39089bd2ce40cf0fa06167d0ae38e3e4ea2021-10-11T14:48:05Z
49389b55b1c3ef58ea1136fb0c9e6d2ce73038d6552021-10-11T13:16:09Z
505088650c144e6a3ee05ac1e015d487a86095e6652021-10-11T12:53:06Z
517dc4672795eaea35bd8c7e1fa2ec40cf948fc2fd2021-09-17T15:49:56Z
52352813601073bcdc3d5eb08d1be07be904276a9a2021-09-17T15:49:21Z
536c613ce701353f42b6c911801f0aeb78413fac9d2021-08-20T15:25:29Z
548f892b2675dd6464f526d32f1ad1d3fd62fa3b032021-08-20T14:12:35Z
55a231c3b065a7b9c86c92915f3b4bcf012ec361752021-07-20T11:13:25Z
561a3007e433ed7ccffc4a2c7c36ed8013d6105d4d2021-07-20T10:34:46Z
574a8d93e8607e2413edee71da68cade6e11249fea2021-07-19T11:27:37Z
58d3fd19b04d4256600e8c90a27a2b317d61514c862021-07-19T11:23:30Z
59fd63f824a11eead1f7fb39457251c7ac27ad1f932021-07-15T08:07:22Z
6089b57278869c7b46fbbaf94825ed336328f3567a2021-07-15T07:59:10Z
616fdf7add8bb4f416bd8660e6b826d80e8759dfe12021-07-14T06:19:03Z
62b57de92d58da2bf787e5af34ab09e0e8ee6cea6c2021-07-13T16:04:39Z
634cdd02827d4b0d2d8834e45baaea621663e0d6322021-07-13T16:03:03Z
64e6a3f0cc5710495475e7b959ee82c649978843f82021-07-13T16:01:36Z
65487c16cf1fb4ecb7eca25d752cb7178f10232a812021-07-13T15:56:41Z
663d7382f653a9707c3b2d683e7d58b7b9d0b4e7472021-07-13T15:55:05Z
67d155ba8553d2c53580ef66fbf733484dc2857a302021-07-13T15:52:43Z
68af805f6d40beef639fac7bd1c5f7789893c2f6622021-06-15T10:31:53Z
69ee9eefb21f878da18c72bdc04b31d6ec5d8ade062021-06-15T10:26:05Z
70a736a7ce4d98a0fefa3f4c01ab6da4641de742542021-06-11T12:09:09Z
712f6878301950dfea48a8147a6d8bd012be5640632021-06-11T12:09:02Z
72c0510fbaa138ba0029d8aa2949c282fb2dfbf9262021-06-11T10:16:30Z
734871325c65b16d5fc1a0fb406ebf539c3cbcf0542021-05-12T10:36:19Z
74cf8ed2bb499fae21b7ed1544177ef3589a8246b12021-05-12T10:34:43Z
75b9d97ef45c56394528534bd1a2ec7d2d785233542021-04-26T14:25:47Z
768da1eca28db8f6ae0db7e7134d334a10eb96f6a32021-04-21T11:17:00Z
77a2726dfb0cde8a5d567ce9a115d4f254f2af38512021-04-21T11:16:49Z
78410e5b8bd76c20bcde2cd0e92c78713e00160c282021-04-21T10:36:48Z
794240ef4cf17e503c44b177ff0f7b740258ac2e702021-04-12T15:17:44Z
807b746d5303493fb53214ceffbd9b365c8ef4a1262021-04-01T10:04:53Z
8194cfe2ac15af0171f1397984ca0d1968a6931eb32021-03-03T08:53:38Z
82c37267e1ee355c619a3058da5a7135e19e6522de2021-03-03T08:44:58Z
83d531c3422f806634d35d611b0296b6a34815c4e72021-02-26T12:30:20Z
840fa36435691cac5651bee2ffe511f73ccd98c3672021-02-17T15:20:04Z
85a65fcca04e51ba8a9d48d2ee676be1de7ff017462021-02-17T14:31:37Z
863182e0793150de4ffb34da2774991cb24e49a1912021-02-04T13:05:18Z
875df8b7ecec0e249df5485dd622efed8b357585402021-02-04T11:29:37Z
880fe7efb1a6217accb8d53cbb17a633f406459fbb2021-02-03T16:38:34Z
892fdc68adf24ec02e28fba5657c7f31c1373491d22021-02-03T16:11:39Z
90cf7eb4a9781fbdac6dba9ff7b345db2d88748eea2021-02-03T16:10:25Z
916305b6c83d61553d3606905f7b41a0cb99f92f0d2021-01-19T16:57:57Z
9225c04ea1f8bc4a9915417998310a563bdbb481152021-01-19T16:56:43Z
93cee977d8d1ca2d1f03f384e53227d56e5143b6fb2021-01-19T11:56:18Z
9403ded64be9b290287ab0101891ca6f338861c2fe2021-01-19T11:55:23Z
954ec1022d2645f3c2e5018971473710d568078eae2021-01-18T16:31:24Z
967a7d58ec828e8295dd4011f4f460f961a2b4428a2021-01-18T15:04:52Z
9767c8be545d4483ac34f86fdd57f09135a4820be62021-01-18T15:02:25Z
98441eeff3a9fca4934fc9cf54e2e4a914bb43ef102021-01-18T14:51:08Z
99a02389393dbd06e2e936d7d2438ef7e48fb969d92020-12-16T11:39:46Z
\n", + "
" + ], + "text/plain": [ + " sha commit.author.date\n", + "0 38398a75e9762ff070d8e9bd714d074332646cd7 2023-04-17T16:03:52Z\n", + "1 def376dc6955b339b17f0a4b840e80eb6b9c744b 2023-04-17T16:01:01Z\n", + "2 5f2c562056f8ffa89aeea0631f2a52300ee0de17 2023-01-13T10:04:48Z\n", + "3 4b8432c7d56121c84d6ef1d75a1c7185c628c13d 2023-01-12T14:47:00Z\n", + "4 4486f5c623705c6a14d9eeaba7d155cff30cdb43 2023-01-12T14:36:43Z\n", + "5 50b790037030d958b662085c3f4cf34ba72a32ec 2022-12-14T10:02:54Z\n", + "6 36220a1c5c2d6954f4873a552544cc0e55b61d0a 2022-12-14T10:02:28Z\n", + "7 e941759874365cb152a3562c22d10847d10db326 2022-10-14T08:47:37Z\n", + "8 2d38d3313229fdc5f8aa00052a2db21b35be3d2d 2022-10-14T08:46:01Z\n", + "9 1ce31fc9e2805034578eff60a269c02176f03252 2022-10-13T13:06:12Z\n", + "10 2c631a4b61d529ff1c0635750888f6f6d79c2703 2022-10-13T12:58:37Z\n", + "11 12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816 2022-08-25T15:44:07Z\n", + "12 8f80f24d49797595d8a18b8d4d1f59846fbf3fe1 2022-08-25T15:43:12Z\n", + "13 a9680a9e80e2c119d4aa41f90e5f40cef6e7df02 2022-08-21T09:31:21Z\n", + "14 1fd937e6c0ff8452f64152aec9632f0586f981d1 2022-08-21T09:29:01Z\n", + "15 50e92c677f4cd547c32bb9305269a762a9595074 2022-07-23T12:54:16Z\n", + "16 8732e7d4739a911e01f69d2c5bda82ca15ca68a9 2022-07-23T12:53:45Z\n", + "17 536e8833f3be4bbcffbfba43cd0b3043c5bf4068 2022-07-15T08:21:31Z\n", + "18 c90cc62006b35061c8db4b6f8da1b86de7650b23 2022-07-15T08:20:46Z\n", + "19 1a3be9a5d01a414854ff3bfacd5257c14adeefa2 2022-07-14T13:40:17Z\n", + "20 4863b8f0a070d70836dfbdc00bdf70aa7bb66345 2022-07-14T12:31:36Z\n", + "21 72a9e28a52c9629dd63dfad5f215cdc562e2fd7e 2022-07-14T12:28:41Z\n", + "22 dddc6afc29de895f4131693e17b3cc856ff23f8d 2022-07-14T12:27:25Z\n", + "23 d1dd2ac5e2b6a8abef9e33ed397807ece52a22fe 2022-04-20T08:39:00Z\n", + "24 4d33035dd296826bbde200e17e5281910ac8be13 2022-04-20T08:37:44Z\n", + "25 4a13df461c42f970a099de77377f309995e7995c 2022-04-19T08:30:24Z\n", + "26 5e1d9764c8e96749bc11d52807eea1f7cc38ce5c 2022-04-14T11:01:59Z\n", + "27 ecd63776c6225af0cf8bcc9fa9c6998d3129fb14 2022-04-14T11:00:42Z\n", + "28 cada41a6bfac5a8bf88ed2107a0b856b9b9785a0 2022-03-15T14:27:59Z\n", + "29 db33aee5dc0b44251b64ff4c2e2f05d59e4e3a2d 2022-03-15T14:27:31Z\n", + "30 207cdce7fef5dae54fdad4cc4f933ebd1ab13c5e 2022-02-10T15:36:59Z\n", + "31 8ab272288e81fa9a49fd4765579b066c2c03cc10 2022-02-10T15:26:19Z\n", + "32 d4c410cd91618a8a74a7763598e499f6a8aa168b 2022-01-14T15:25:23Z\n", + "33 db46d2a0209720c5fa0ab8b03403f7009f69d2d6 2022-01-14T12:27:52Z\n", + "34 ebb3d8c6aa383567bb7794cacf745b1c82efbb7f 2022-01-14T12:27:35Z\n", + "35 fc712c5a4dd00f534e845982a29f46a14e22b292 2022-01-14T08:05:58Z\n", + "36 239bf68f403bbdbabb43fa0277040f4dda2b2aff 2022-01-13T20:33:38Z\n", + "37 d86bdd9983424c792691269fa10e6f022ccf2191 2022-01-13T16:53:03Z\n", + "38 06ceff14b2db920d458dc337b1100dced992e627 2022-01-13T16:52:15Z\n", + "39 3fb27698fe4ebbd5ec4a0e565d97a53fe48c7cdd 2022-01-04T09:19:02Z\n", + "40 0a051a92e0baab68b4c77ad13b8cb4e7eaa2f810 2022-01-04T09:18:33Z\n", + "41 d76ca7d75891aac95205b2cc11510ce8ef41a65a 2021-11-11T11:49:26Z\n", + "42 178c68cca2044cbd5032cbff23e4c06b738aed2a 2021-11-11T11:48:57Z\n", + "43 2d19daf42ff47c2f2c4e7970dd77ed32768eb38f 2021-10-26T11:32:30Z\n", + "44 3f46491c0fe377fd922412c93137efb6a4879b05 2021-10-26T11:15:21Z\n", + "45 8dd2a04b1d7e559f5124d012f0a729d0180216d2 2021-10-11T15:33:46Z\n", + "46 ddda708da9f2f41ae7e7d122973c2eddb141d949 2021-10-11T15:29:53Z\n", + "47 8d77b3dd93959663d58ae5b626289d0746edd0e7 2021-10-11T15:13:57Z\n", + "48 d53cfe39089bd2ce40cf0fa06167d0ae38e3e4ea 
2021-10-11T14:48:05Z\n", + "49 389b55b1c3ef58ea1136fb0c9e6d2ce73038d655 2021-10-11T13:16:09Z\n", + "50 5088650c144e6a3ee05ac1e015d487a86095e665 2021-10-11T12:53:06Z\n", + "51 7dc4672795eaea35bd8c7e1fa2ec40cf948fc2fd 2021-09-17T15:49:56Z\n", + "52 352813601073bcdc3d5eb08d1be07be904276a9a 2021-09-17T15:49:21Z\n", + "53 6c613ce701353f42b6c911801f0aeb78413fac9d 2021-08-20T15:25:29Z\n", + "54 8f892b2675dd6464f526d32f1ad1d3fd62fa3b03 2021-08-20T14:12:35Z\n", + "55 a231c3b065a7b9c86c92915f3b4bcf012ec36175 2021-07-20T11:13:25Z\n", + "56 1a3007e433ed7ccffc4a2c7c36ed8013d6105d4d 2021-07-20T10:34:46Z\n", + "57 4a8d93e8607e2413edee71da68cade6e11249fea 2021-07-19T11:27:37Z\n", + "58 d3fd19b04d4256600e8c90a27a2b317d61514c86 2021-07-19T11:23:30Z\n", + "59 fd63f824a11eead1f7fb39457251c7ac27ad1f93 2021-07-15T08:07:22Z\n", + "60 89b57278869c7b46fbbaf94825ed336328f3567a 2021-07-15T07:59:10Z\n", + "61 6fdf7add8bb4f416bd8660e6b826d80e8759dfe1 2021-07-14T06:19:03Z\n", + "62 b57de92d58da2bf787e5af34ab09e0e8ee6cea6c 2021-07-13T16:04:39Z\n", + "63 4cdd02827d4b0d2d8834e45baaea621663e0d632 2021-07-13T16:03:03Z\n", + "64 e6a3f0cc5710495475e7b959ee82c649978843f8 2021-07-13T16:01:36Z\n", + "65 487c16cf1fb4ecb7eca25d752cb7178f10232a81 2021-07-13T15:56:41Z\n", + "66 3d7382f653a9707c3b2d683e7d58b7b9d0b4e747 2021-07-13T15:55:05Z\n", + "67 d155ba8553d2c53580ef66fbf733484dc2857a30 2021-07-13T15:52:43Z\n", + "68 af805f6d40beef639fac7bd1c5f7789893c2f662 2021-06-15T10:31:53Z\n", + "69 ee9eefb21f878da18c72bdc04b31d6ec5d8ade06 2021-06-15T10:26:05Z\n", + "70 a736a7ce4d98a0fefa3f4c01ab6da4641de74254 2021-06-11T12:09:09Z\n", + "71 2f6878301950dfea48a8147a6d8bd012be564063 2021-06-11T12:09:02Z\n", + "72 c0510fbaa138ba0029d8aa2949c282fb2dfbf926 2021-06-11T10:16:30Z\n", + "73 4871325c65b16d5fc1a0fb406ebf539c3cbcf054 2021-05-12T10:36:19Z\n", + "74 cf8ed2bb499fae21b7ed1544177ef3589a8246b1 2021-05-12T10:34:43Z\n", + "75 b9d97ef45c56394528534bd1a2ec7d2d78523354 2021-04-26T14:25:47Z\n", + "76 8da1eca28db8f6ae0db7e7134d334a10eb96f6a3 2021-04-21T11:17:00Z\n", + "77 a2726dfb0cde8a5d567ce9a115d4f254f2af3851 2021-04-21T11:16:49Z\n", + "78 410e5b8bd76c20bcde2cd0e92c78713e00160c28 2021-04-21T10:36:48Z\n", + "79 4240ef4cf17e503c44b177ff0f7b740258ac2e70 2021-04-12T15:17:44Z\n", + "80 7b746d5303493fb53214ceffbd9b365c8ef4a126 2021-04-01T10:04:53Z\n", + "81 94cfe2ac15af0171f1397984ca0d1968a6931eb3 2021-03-03T08:53:38Z\n", + "82 c37267e1ee355c619a3058da5a7135e19e6522de 2021-03-03T08:44:58Z\n", + "83 d531c3422f806634d35d611b0296b6a34815c4e7 2021-02-26T12:30:20Z\n", + "84 0fa36435691cac5651bee2ffe511f73ccd98c367 2021-02-17T15:20:04Z\n", + "85 a65fcca04e51ba8a9d48d2ee676be1de7ff01746 2021-02-17T14:31:37Z\n", + "86 3182e0793150de4ffb34da2774991cb24e49a191 2021-02-04T13:05:18Z\n", + "87 5df8b7ecec0e249df5485dd622efed8b35758540 2021-02-04T11:29:37Z\n", + "88 0fe7efb1a6217accb8d53cbb17a633f406459fbb 2021-02-03T16:38:34Z\n", + "89 2fdc68adf24ec02e28fba5657c7f31c1373491d2 2021-02-03T16:11:39Z\n", + "90 cf7eb4a9781fbdac6dba9ff7b345db2d88748eea 2021-02-03T16:10:25Z\n", + "91 6305b6c83d61553d3606905f7b41a0cb99f92f0d 2021-01-19T16:57:57Z\n", + "92 25c04ea1f8bc4a9915417998310a563bdbb48115 2021-01-19T16:56:43Z\n", + "93 cee977d8d1ca2d1f03f384e53227d56e5143b6fb 2021-01-19T11:56:18Z\n", + "94 03ded64be9b290287ab0101891ca6f338861c2fe 2021-01-19T11:55:23Z\n", + "95 4ec1022d2645f3c2e5018971473710d568078eae 2021-01-18T16:31:24Z\n", + "96 7a7d58ec828e8295dd4011f4f460f961a2b4428a 2021-01-18T15:04:52Z\n", + "97 67c8be545d4483ac34f86fdd57f09135a4820be6 2021-01-18T15:02:25Z\n", + 
"98 441eeff3a9fca4934fc9cf54e2e4a914bb43ef10 2021-01-18T14:51:08Z\n", + "99 a02389393dbd06e2e936d7d2438ef7e48fb969d9 2020-12-16T11:39:46Z" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/0.2-github-api-eda.ipynb b/notebooks/0.2-github-api-eda.ipynb new file mode 100644 index 00000000..a77fecef --- /dev/null +++ b/notebooks/0.2-github-api-eda.ipynb @@ -0,0 +1,1432 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GitHub API EDA" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/8s/9sb2nsn913q7b4zz75fd_qf00000gn/T/ipykernel_12407/494911564.py:7: DeprecationWarning: \n", + "Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n", + "(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n", + "but was not found to be installed on your system.\n", + "If this would cause problems for you,\n", + "please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n", + " \n", + " import pandas as pd\n" + ] + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv, find_dotenv\n", + "load_dotenv('/Users/ammon/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/.env.nmdpf');\n", + "from itertools import chain, starmap\n", + "import json\n", + "import requests\n", + "import pandas as pd\n", + "\n", + "# Pandas display options\n", + "pd.set_option('display.max_rows', None)\n", + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.max_colwidth', None)\n", + "pd.set_option('display.width', None)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'/Users/ammon/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/notebooks'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "os.getcwd()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Environment variables\n", + "AWS_REGION = os.environ[\"AWS_REGION\"] \n", + "GITHUB_PERSONAL_ACCESS_TOKEN = os.environ[\"GITHUB_PERSONAL_ACCESS_TOKEN\"]\n", + "GITHUB_REPOSITORY_OWNER = \"ANHIG\" # os.environ[\"GITHUB_REPOSITORY_OWNER\"]\n", + "GITHUB_REPOSITORY_NAME = \"IMGTHLA\" # os.environ[\"GITHUB_REPOSITORY_NAME\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def flatten_json(dictionary, sep='.', skip_fields=[]):\n", + " \"\"\"Flatten a nested json file. 
For a list of dictionaries, use this\n", + " inside a for loop before converting to pandas DataFrame.\"\"\"\n", + "\n", + " def unpack(parent_key, parent_value):\n", + " \"\"\"Unpack one level of nesting in json file\"\"\"\n", + " # Unpack one level only!!!\n", + " \n", + " if isinstance(parent_value, dict):\n", + " for key, value in parent_value.items():\n", + " temp1 = parent_key + sep + key\n", + " yield temp1, value\n", + " elif isinstance(parent_value, list):\n", + " i = 0 \n", + " for value in parent_value:\n", + " temp2 = parent_key + sep +str(i) \n", + " i += 1\n", + " yield temp2, value\n", + " else:\n", + " yield parent_key, parent_value \n", + "\n", + "\n", + " # Keep iterating until the termination condition is satisfied\n", + " while True:\n", + " # Keep unpacking the json file until all values are atomic elements (not dictionary or list)\n", + " dictionary = dict(chain.from_iterable(starmap(unpack, dictionary.items())))\n", + " # Terminate condition: not any value in the json file is dictionary or list\n", + " if not any(isinstance(value, dict) for value in dictionary.values()) and \\\n", + " not any(isinstance(value, list) for value in dictionary.values()):\n", + " break\n", + "\n", + " return dictionary\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def get_commits(owner, repo, per_page=100):\n", + " \"\"\"Return a list of GitHub commits for the specified repository\"\"\"\n", + "\n", + " base_url = 'https://api.github.com'\n", + "\n", + " # Endpoint\n", + " endpoint = f'/repos/{owner}/{repo}/commits?per_page={per_page}'\n", + "\n", + " url = base_url + endpoint\n", + "\n", + " # Headers\n", + " headers = {\n", + " 'Authorization': f'token {GITHUB_PERSONAL_ACCESS_TOKEN}',\n", + " 'Content-Type': 'application/json',\n", + " 'Accept': 'application/vnd.github.v3+json',\n", + " 'X-GitHub-Api-Version': '2022-11-28'\n", + " }\n", + "\n", + " response = requests.get(url, headers=headers)\n", + "\n", + " return response.json()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def get_commit(owner, repo, commit_sha):\n", + " \"\"\"Return the commit for the specified repository and commit SHA\"\"\"\n", + "\n", + " base_url = 'https://api.github.com'\n", + "\n", + " # Endpoint\n", + " endpoint = f'/repos/{owner}/{repo}/commits/{commit_sha}'\n", + " url = base_url + endpoint\n", + "\n", + " # Headers\n", + " headers = {\n", + " 'Authorization': f'token {GITHUB_PERSONAL_ACCESS_TOKEN}',\n", + " 'Content-Type': 'application/json',\n", + " 'Accept': 'application/vnd.github.v3+json',\n", + " 'X-GitHub-Api-Version': '2022-11-28'\n", + " }\n", + "\n", + " response = requests.get(url, headers=headers)\n", + "\n", + " return response.json()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def get_branches(owner, repo):\n", + " \"\"\"Fetch branches for a GitHub repository\"\"\"\n", + "\n", + " base_url = 'https://api.github.com'\n", + "\n", + " # Endpoint\n", + " endpoint = f'/repos/{owner}/{repo}/branches'\n", + " url = base_url + endpoint\n", + "\n", + " # Headers\n", + " headers = {\n", + " 'Authorization': f'token {GITHUB_PERSONAL_ACCESS_TOKEN}',\n", + " 'Content-Type': 'application/json',\n", + " 'Accept': 'application/vnd.github.v3+json',\n", + " 'X-GitHub-Api-Version': '2022-11-28'\n", + " }\n", + "\n", + " response = requests.get(url, headers=headers)\n", + " branches = 
response.json()\n", + "\n", + " return branches" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def get_branch(owner, repo, branch_name):\n", + " \"\"\"Fetch branches for a GitHub repository\"\"\"\n", + "\n", + " base_url = 'https://api.github.com'\n", + "\n", + " # Endpoint\n", + " endpoint = f'/repos/{owner}/{repo}/branches/{branch_name}'\n", + " url = base_url + endpoint\n", + "\n", + " # Headers\n", + " headers = {\n", + " 'Authorization': f'token {GITHUB_PERSONAL_ACCESS_TOKEN}',\n", + " 'Content-Type': 'application/json',\n", + " 'Accept': 'application/vnd.github.v3+json',\n", + " 'X-GitHub-Api-Version': '2022-11-28'\n", + " }\n", + "\n", + " response = requests.get(url, headers=headers)\n", + " branches = response.json()\n", + "\n", + " return branches" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Function to fetch pull requests\n", + "def fetch_pull_requests(owner, repo):\n", + " url = f\"https://api.github.com/repos/{owner}/{repo}/pulls?state=all\"\n", + " \n", + " # Headers\n", + " headers = {\n", + " 'Authorization': f'token {GITHUB_PERSONAL_ACCESS_TOKEN}',\n", + " 'Content-Type': 'application/json',\n", + " 'Accept': 'application/vnd.github.v3+json',\n", + " 'X-GitHub-Api-Version': '2022-11-28'\n", + " }\n", + " response = requests.get(url, headers=headers)\n", + "\n", + " if response.status_code == 200:\n", + " return response.json()\n", + " else:\n", + " print(f\"Error: {response.status_code}\")\n", + " return []" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Branches" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from pygethub import list_branches, GitHubPaginator" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "paginator = GitHubPaginator(GITHUB_PERSONAL_ACCESS_TOKEN)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# Get all branches\n", + "paginator = GitHubPaginator(GITHUB_PERSONAL_ACCESS_TOKEN)\n", + "pages = paginator.get_paginator(list_branches, owner=GITHUB_REPOSITORY_OWNER, repo=GITHUB_REPOSITORY_NAME)\n", + "all_branches = list(pages)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'pygethub' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[22], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mpygethub\u001b[49m\u001b[38;5;241m.\u001b[39m__version__\n", + "\u001b[0;31mNameError\u001b[0m: name 'pygethub' is not defined" + ] + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(all_branches)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "branches_df = pd.DataFrame([flatten_json(branch) for branch in all_branches])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + 
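The `get_commits` helper above returns only the first page of results (at most `per_page` commits). Below is a minimal sketch of page-based pagination against the same `GET /repos/{owner}/{repo}/commits` endpoint, reusing the notebook's token; the helper name `get_all_commits` and the `max_pages` guard are illustrative assumptions rather than part of the notebook.

```python
import requests

def get_all_commits(owner, repo, token, per_page=100, max_pages=10):
    """Collect commits across pages of GET /repos/{owner}/{repo}/commits.

    max_pages is an illustrative safety cap; the loop stops early when a page
    comes back empty, i.e. the commit history has been exhausted.
    """
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3+json",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    commits = []
    for page in range(1, max_pages + 1):
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}/commits",
            headers=headers,
            params={"per_page": per_page, "page": page},
        )
        response.raise_for_status()
        batch = response.json()
        if not batch:
            break
        commits.extend(batch)
    return commits

# Usage (assumes the environment variables loaded earlier in the notebook):
# all_commits = get_all_commits(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME,
#                               GITHUB_PERSONAL_ACCESS_TOKEN)
```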
"data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "branches_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "releases = list(branches_df['name'].unique())[:-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "300,310,320,330,340,350,360,370,380,390,3100,3110,3120,3130,3140,3150,3160,3170,3180,3190,3200,3210,3220,3230,3240,3250,3260,3270,3280,3290,3300,3310,3320,3330,3340,3350,3360,3370,3380,3390,3400,3410,3420,3430,3440,3450,3460,3470,3480,3490,3500,3510,3520,3530\n" + ] + } + ], + "source": [ + "print(\",\".join(releases))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pull Requests" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "# # Fetch pull requests and load them into a DataFrame\n", + "# pull_requests = fetch_pull_requests(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME, GITHUB_PERSONAL_ACCESS_TOKEN)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "# # pull_requests_df = pd.DataFrame(pull_requests)\n", + "# pull_requests_df = pd.DataFrame([flatten_json(pull_request) for pull_request in pull_requests])" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "# pull_requests_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "# pull_requests_df[\"title\"].unique()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Commits" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "commits = get_commits(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "100" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(commits)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'38398a75e9762ff070d8e9bd714d074332646cd7'" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commits[0]['sha']" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'sha': '38398a75e9762ff070d8e9bd714d074332646cd7',\n", + " 'node_id': 'C_kwDOAXZz6NoAKDM4Mzk4YTc1ZTk3NjJmZjA3MGQ4ZTliZDcxNGQwNzQzMzI2NDZjZDc',\n", + " 'commit': {'author': {'name': 'xeniageorgiouAN',\n", + " 'email': 'xenia.georgiou@anthonynolan.org',\n", + " 'date': '2023-04-17T16:03:52Z'},\n", + " 'committer': {'name': 'GitHub',\n", + " 'email': 'noreply@github.com',\n", + " 'date': '2023-04-17T16:03:52Z'},\n", + " 'message': 'Merge pull request #334 from ANHIG/3520\\n\\nIPD-IMGT/HLA Release 3.52.0',\n", + " 'tree': {'sha': 'e9ffca9666e355b1285a0c6a42951f6a28ea7f90',\n", + " 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/git/trees/e9ffca9666e355b1285a0c6a42951f6a28ea7f90'},\n", + " 'url': 
'https://api.github.com/repos/ANHIG/IMGTHLA/git/commits/38398a75e9762ff070d8e9bd714d074332646cd7',\n", + " 'comment_count': 0,\n", + " 'verification': {'verified': True,\n", + " 'reason': 'valid',\n", + " 'signature': '-----BEGIN PGP SIGNATURE-----\\n\\nwsBcBAABCAAQBQJkPW3oCRBK7hj4Ov3rIwAAkxkIAI1/KmiZW1yRfwe8e++ROHQo\\ncPbR+1PipOfVXcgaJVyK5vfiR1d+1DsLph0k+VurKcVIn5As5Pc+XSi7ImeeNFzV\\nrk51rZXuSwzznIuuRptXJsnhcrfnsg61FII47Qqhh2YcMkFFg7TMDlRZca0qWCJB\\nHZ2+MbVkXZjJf1ZmBTq6z/gHVRcApwfGlD6JVwNkppGC0EgXAJm+5/qguB7CNGgZ\\nDXK1PLm9p5qhiC2zdg+ploGe53NKYEFgib1N3qhOc7hAi6p60bWoFw2gdFzMjmSu\\nqdtdSdIvbm0aobJDjiabTHLq85ojQ40RiPR4WBc65BXhXsmYSsTwnQvgdiDkXKY=\\n=ajY2\\n-----END PGP SIGNATURE-----\\n',\n", + " 'payload': 'tree e9ffca9666e355b1285a0c6a42951f6a28ea7f90\\nparent 5f2c562056f8ffa89aeea0631f2a52300ee0de17\\nparent def376dc6955b339b17f0a4b840e80eb6b9c744b\\nauthor xeniageorgiouAN 1681747432 +0100\\ncommitter GitHub 1681747432 +0100\\n\\nMerge pull request #334 from ANHIG/3520\\n\\nIPD-IMGT/HLA Release 3.52.0'}},\n", + " 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/38398a75e9762ff070d8e9bd714d074332646cd7',\n", + " 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/38398a75e9762ff070d8e9bd714d074332646cd7',\n", + " 'comments_url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/38398a75e9762ff070d8e9bd714d074332646cd7/comments',\n", + " 'author': {'login': 'xeniageorgiouAN',\n", + " 'id': 55495460,\n", + " 'node_id': 'MDQ6VXNlcjU1NDk1NDYw',\n", + " 'avatar_url': 'https://avatars.githubusercontent.com/u/55495460?v=4',\n", + " 'gravatar_id': '',\n", + " 'url': 'https://api.github.com/users/xeniageorgiouAN',\n", + " 'html_url': 'https://github.com/xeniageorgiouAN',\n", + " 'followers_url': 'https://api.github.com/users/xeniageorgiouAN/followers',\n", + " 'following_url': 'https://api.github.com/users/xeniageorgiouAN/following{/other_user}',\n", + " 'gists_url': 'https://api.github.com/users/xeniageorgiouAN/gists{/gist_id}',\n", + " 'starred_url': 'https://api.github.com/users/xeniageorgiouAN/starred{/owner}{/repo}',\n", + " 'subscriptions_url': 'https://api.github.com/users/xeniageorgiouAN/subscriptions',\n", + " 'organizations_url': 'https://api.github.com/users/xeniageorgiouAN/orgs',\n", + " 'repos_url': 'https://api.github.com/users/xeniageorgiouAN/repos',\n", + " 'events_url': 'https://api.github.com/users/xeniageorgiouAN/events{/privacy}',\n", + " 'received_events_url': 'https://api.github.com/users/xeniageorgiouAN/received_events',\n", + " 'type': 'User',\n", + " 'site_admin': False},\n", + " 'committer': {'login': 'web-flow',\n", + " 'id': 19864447,\n", + " 'node_id': 'MDQ6VXNlcjE5ODY0NDQ3',\n", + " 'avatar_url': 'https://avatars.githubusercontent.com/u/19864447?v=4',\n", + " 'gravatar_id': '',\n", + " 'url': 'https://api.github.com/users/web-flow',\n", + " 'html_url': 'https://github.com/web-flow',\n", + " 'followers_url': 'https://api.github.com/users/web-flow/followers',\n", + " 'following_url': 'https://api.github.com/users/web-flow/following{/other_user}',\n", + " 'gists_url': 'https://api.github.com/users/web-flow/gists{/gist_id}',\n", + " 'starred_url': 'https://api.github.com/users/web-flow/starred{/owner}{/repo}',\n", + " 'subscriptions_url': 'https://api.github.com/users/web-flow/subscriptions',\n", + " 'organizations_url': 'https://api.github.com/users/web-flow/orgs',\n", + " 'repos_url': 'https://api.github.com/users/web-flow/repos',\n", + " 'events_url': 'https://api.github.com/users/web-flow/events{/privacy}',\n", + " 'received_events_url': 
'https://api.github.com/users/web-flow/received_events',\n", + " 'type': 'User',\n", + " 'site_admin': False},\n", + " 'parents': [{'sha': '5f2c562056f8ffa89aeea0631f2a52300ee0de17',\n", + " 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/5f2c562056f8ffa89aeea0631f2a52300ee0de17',\n", + " 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/5f2c562056f8ffa89aeea0631f2a52300ee0de17'},\n", + " {'sha': 'def376dc6955b339b17f0a4b840e80eb6b9c744b',\n", + " 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/def376dc6955b339b17f0a4b840e80eb6b9c744b',\n", + " 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/def376dc6955b339b17f0a4b840e80eb6b9c744b'}],\n", + " 'stats': {'total': 0, 'additions': 0, 'deletions': 0},\n", + " 'files': []}" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_commit(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME, \"38398a75e9762ff070d8e9bd714d074332646cd7\")" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [], + "source": [ + "# load dataframe from json and drop columns that are all null\n", + "commits_df = (\n", + " pd.DataFrame([flatten_json(commit) for commit in commits])\n", + " # pd.DataFrame(commits)\n", + " .dropna(axis=1, how='all')\n", + " # .sort_values(by='commit.committer.date', ascending=False)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['sha', 'node_id', 'commit.author.name', 'commit.author.email',\n", + " 'commit.author.date', 'commit.committer.name', 'commit.committer.email',\n", + " 'commit.committer.date', 'commit.message', 'commit.tree.sha',\n", + " 'commit.tree.url', 'commit.url', 'commit.comment_count',\n", + " 'commit.verification.verified', 'commit.verification.reason',\n", + " 'commit.verification.signature', 'commit.verification.payload', 'url',\n", + " 'html_url', 'comments_url', 'author.login', 'author.id',\n", + " 'author.node_id', 'author.avatar_url', 'author.gravatar_id',\n", + " 'author.url', 'author.html_url', 'author.followers_url',\n", + " 'author.following_url', 'author.gists_url', 'author.starred_url',\n", + " 'author.subscriptions_url', 'author.organizations_url',\n", + " 'author.repos_url', 'author.events_url', 'author.received_events_url',\n", + " 'author.type', 'author.site_admin', 'committer.login', 'committer.id',\n", + " 'committer.node_id', 'committer.avatar_url', 'committer.gravatar_id',\n", + " 'committer.url', 'committer.html_url', 'committer.followers_url',\n", + " 'committer.following_url', 'committer.gists_url',\n", + " 'committer.starred_url', 'committer.subscriptions_url',\n", + " 'committer.organizations_url', 'committer.repos_url',\n", + " 'committer.events_url', 'committer.received_events_url',\n", + " 'committer.type', 'committer.site_admin', 'parents.0.sha',\n", + " 'parents.0.url', 'parents.0.html_url', 'parents.1.sha', 'parents.1.url',\n", + " 'parents.1.html_url'],\n", + " dtype='object')" + ] + }, + "execution_count": 129, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commits_df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": {}, + "outputs": [], + "source": [ + "cols = [\n", + " \"sha\",\n", + " \"commit.author.date\",\n", + " \"commit.message\"\n", + "]\n", + "commits_df = commits_df[cols]\n", + "\n", + "# convert date column to datetime\n", + "commits_df[\"date\"] = 
pd.to_datetime(commits_df[\"commit.author.date\"])\n", + "commits_df.drop(columns=[\"commit.author.date\"], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [], + "source": [ + "# # Commit SHAs do not correspond to branch SHAs, so this approach doesn't work\n", + "# # join the branches df to the commits df on branches['commit.sha']\n", + "# commits_df = commits_df.merge(branches_df, left_on='sha', right_on='commit.sha', how='outer')" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": {}, + "outputs": [], + "source": [ + "# commits_df.iloc[:4, :]" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "38398a75e9762ff070d8e9bd714d074332646cd7\n", + "def376dc6955b339b17f0a4b840e80eb6b9c744b\n", + "5f2c562056f8ffa89aeea0631f2a52300ee0de17\n", + "4b8432c7d56121c84d6ef1d75a1c7185c628c13d\n", + "4486f5c623705c6a14d9eeaba7d155cff30cdb43\n", + "50b790037030d958b662085c3f4cf34ba72a32ec\n", + "36220a1c5c2d6954f4873a552544cc0e55b61d0a\n", + "e941759874365cb152a3562c22d10847d10db326\n", + "2d38d3313229fdc5f8aa00052a2db21b35be3d2d\n", + "1ce31fc9e2805034578eff60a269c02176f03252\n", + "2c631a4b61d529ff1c0635750888f6f6d79c2703\n", + "12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816\n", + "8f80f24d49797595d8a18b8d4d1f59846fbf3fe1\n", + "a9680a9e80e2c119d4aa41f90e5f40cef6e7df02\n", + "1fd937e6c0ff8452f64152aec9632f0586f981d1\n", + "50e92c677f4cd547c32bb9305269a762a9595074\n", + "8732e7d4739a911e01f69d2c5bda82ca15ca68a9\n", + "536e8833f3be4bbcffbfba43cd0b3043c5bf4068\n", + "c90cc62006b35061c8db4b6f8da1b86de7650b23\n", + "1a3be9a5d01a414854ff3bfacd5257c14adeefa2\n" + ] + } + ], + "source": [ + "commit_refs = []\n", + "limit = 20\n", + "for idx, sha in enumerate(commits_df.iloc[:20, :][\"sha\"].to_list()):\n", + " # fetch commit\n", + " print(sha)\n", + " commit = get_commit(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME, sha)\n", + " commit_refs.append(commit)\n", + "\n", + " if idx+1 == limit:\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": {}, + "outputs": [], + "source": [ + "# commit_refs_df = pd.DataFrame([flatten_json(commit) for commit in commit_refs])\n", + "commit_refs_df = pd.DataFrame(commit_refs)[[\"sha\", \"files\", \"parents\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 [{'sha': '5f2c562056f8ffa89aeea0631f2a52300ee0de17', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/5f2c562056f8ffa89aeea0631f2a52300ee0de17', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/5f2c562056f8ffa89aeea0631f2a52300ee0de17'}, {'sha': 'def376dc6955b339b17f0a4b840e80eb6b9c744b', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/def376dc6955b339b17f0a4b840e80eb6b9c744b', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/def376dc6955b339b17f0a4b840e80eb6b9c744b'}]\n", + "1 [{'sha': '5f2c562056f8ffa89aeea0631f2a52300ee0de17', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/5f2c562056f8ffa89aeea0631f2a52300ee0de17', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/5f2c562056f8ffa89aeea0631f2a52300ee0de17'}]\n", + "2 [{'sha': '50b790037030d958b662085c3f4cf34ba72a32ec', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/50b790037030d958b662085c3f4cf34ba72a32ec', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/50b790037030d958b662085c3f4cf34ba72a32ec'}, {'sha': 
'4b8432c7d56121c84d6ef1d75a1c7185c628c13d', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/4b8432c7d56121c84d6ef1d75a1c7185c628c13d', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/4b8432c7d56121c84d6ef1d75a1c7185c628c13d'}]\n", + "3 [{'sha': '4486f5c623705c6a14d9eeaba7d155cff30cdb43', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/4486f5c623705c6a14d9eeaba7d155cff30cdb43', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/4486f5c623705c6a14d9eeaba7d155cff30cdb43'}]\n", + "4 [{'sha': '50b790037030d958b662085c3f4cf34ba72a32ec', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/50b790037030d958b662085c3f4cf34ba72a32ec', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/50b790037030d958b662085c3f4cf34ba72a32ec'}]\n", + "5 [{'sha': 'e941759874365cb152a3562c22d10847d10db326', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/e941759874365cb152a3562c22d10847d10db326', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/e941759874365cb152a3562c22d10847d10db326'}, {'sha': '36220a1c5c2d6954f4873a552544cc0e55b61d0a', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/36220a1c5c2d6954f4873a552544cc0e55b61d0a', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/36220a1c5c2d6954f4873a552544cc0e55b61d0a'}]\n", + "6 [{'sha': '2d38d3313229fdc5f8aa00052a2db21b35be3d2d', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/2d38d3313229fdc5f8aa00052a2db21b35be3d2d', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/2d38d3313229fdc5f8aa00052a2db21b35be3d2d'}]\n", + "7 [{'sha': '1ce31fc9e2805034578eff60a269c02176f03252', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/1ce31fc9e2805034578eff60a269c02176f03252', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/1ce31fc9e2805034578eff60a269c02176f03252'}, {'sha': '2d38d3313229fdc5f8aa00052a2db21b35be3d2d', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/2d38d3313229fdc5f8aa00052a2db21b35be3d2d', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/2d38d3313229fdc5f8aa00052a2db21b35be3d2d'}]\n", + "8 [{'sha': '2c631a4b61d529ff1c0635750888f6f6d79c2703', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/2c631a4b61d529ff1c0635750888f6f6d79c2703', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/2c631a4b61d529ff1c0635750888f6f6d79c2703'}]\n", + "9 [{'sha': '12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816'}, {'sha': '2c631a4b61d529ff1c0635750888f6f6d79c2703', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/2c631a4b61d529ff1c0635750888f6f6d79c2703', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/2c631a4b61d529ff1c0635750888f6f6d79c2703'}]\n", + "10 [{'sha': '12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816'}]\n", + "11 [{'sha': 'a9680a9e80e2c119d4aa41f90e5f40cef6e7df02', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/a9680a9e80e2c119d4aa41f90e5f40cef6e7df02', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/a9680a9e80e2c119d4aa41f90e5f40cef6e7df02'}, {'sha': '8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', 'html_url': 
'https://github.com/ANHIG/IMGTHLA/commit/8f80f24d49797595d8a18b8d4d1f59846fbf3fe1'}]\n", + "12 [{'sha': '1fd937e6c0ff8452f64152aec9632f0586f981d1', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/1fd937e6c0ff8452f64152aec9632f0586f981d1', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/1fd937e6c0ff8452f64152aec9632f0586f981d1'}]\n", + "13 [{'sha': '50e92c677f4cd547c32bb9305269a762a9595074', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/50e92c677f4cd547c32bb9305269a762a9595074', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/50e92c677f4cd547c32bb9305269a762a9595074'}, {'sha': '1fd937e6c0ff8452f64152aec9632f0586f981d1', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/1fd937e6c0ff8452f64152aec9632f0586f981d1', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/1fd937e6c0ff8452f64152aec9632f0586f981d1'}]\n", + "14 [{'sha': '8732e7d4739a911e01f69d2c5bda82ca15ca68a9', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/8732e7d4739a911e01f69d2c5bda82ca15ca68a9', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/8732e7d4739a911e01f69d2c5bda82ca15ca68a9'}]\n", + "15 [{'sha': '536e8833f3be4bbcffbfba43cd0b3043c5bf4068', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/536e8833f3be4bbcffbfba43cd0b3043c5bf4068', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/536e8833f3be4bbcffbfba43cd0b3043c5bf4068'}, {'sha': '8732e7d4739a911e01f69d2c5bda82ca15ca68a9', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/8732e7d4739a911e01f69d2c5bda82ca15ca68a9', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/8732e7d4739a911e01f69d2c5bda82ca15ca68a9'}]\n", + "16 [{'sha': 'c90cc62006b35061c8db4b6f8da1b86de7650b23', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/c90cc62006b35061c8db4b6f8da1b86de7650b23', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/c90cc62006b35061c8db4b6f8da1b86de7650b23'}]\n", + "17 [{'sha': 'd1dd2ac5e2b6a8abef9e33ed397807ece52a22fe', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/d1dd2ac5e2b6a8abef9e33ed397807ece52a22fe', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/d1dd2ac5e2b6a8abef9e33ed397807ece52a22fe'}, {'sha': 'c90cc62006b35061c8db4b6f8da1b86de7650b23', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/c90cc62006b35061c8db4b6f8da1b86de7650b23', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/c90cc62006b35061c8db4b6f8da1b86de7650b23'}]\n", + "18 [{'sha': '1a3be9a5d01a414854ff3bfacd5257c14adeefa2', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/1a3be9a5d01a414854ff3bfacd5257c14adeefa2', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/1a3be9a5d01a414854ff3bfacd5257c14adeefa2'}, {'sha': 'd1dd2ac5e2b6a8abef9e33ed397807ece52a22fe', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/d1dd2ac5e2b6a8abef9e33ed397807ece52a22fe', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/d1dd2ac5e2b6a8abef9e33ed397807ece52a22fe'}]\n", + "19 [{'sha': '4863b8f0a070d70836dfbdc00bdf70aa7bb66345', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/4863b8f0a070d70836dfbdc00bdf70aa7bb66345', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/4863b8f0a070d70836dfbdc00bdf70aa7bb66345'}]\n", + "Name: parents, dtype: object" + ] + }, + "execution_count": 141, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commit_refs_df.loc[:20, :]['parents']" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 [{'sha': 
'5f2c562056f8ffa89aeea0631f2a52300ee0de17', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/5f2c562056f8ffa89aeea0631f2a52300ee0de17', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/5f2c562056f8ffa89aeea0631f2a52300ee0de17'}, {'sha': 'def376dc6955b339b17f0a4b840e80eb6b9c744b', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/def376dc6955b339b17f0a4b840e80eb6b9c744b', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/def376dc6955b339b17f0a4b840e80eb6b9c744b'}]\n", + "1 [{'sha': '5f2c562056f8ffa89aeea0631f2a52300ee0de17', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/5f2c562056f8ffa89aeea0631f2a52300ee0de17', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/5f2c562056f8ffa89aeea0631f2a52300ee0de17'}]\n", + "2 [{'sha': '50b790037030d958b662085c3f4cf34ba72a32ec', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/50b790037030d958b662085c3f4cf34ba72a32ec', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/50b790037030d958b662085c3f4cf34ba72a32ec'}, {'sha': '4b8432c7d56121c84d6ef1d75a1c7185c628c13d', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/4b8432c7d56121c84d6ef1d75a1c7185c628c13d', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/4b8432c7d56121c84d6ef1d75a1c7185c628c13d'}]\n", + "3 [{'sha': '4486f5c623705c6a14d9eeaba7d155cff30cdb43', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/4486f5c623705c6a14d9eeaba7d155cff30cdb43', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/4486f5c623705c6a14d9eeaba7d155cff30cdb43'}]\n", + "4 [{'sha': '50b790037030d958b662085c3f4cf34ba72a32ec', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/50b790037030d958b662085c3f4cf34ba72a32ec', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/50b790037030d958b662085c3f4cf34ba72a32ec'}]\n", + "5 [{'sha': 'e941759874365cb152a3562c22d10847d10db326', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/e941759874365cb152a3562c22d10847d10db326', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/e941759874365cb152a3562c22d10847d10db326'}, {'sha': '36220a1c5c2d6954f4873a552544cc0e55b61d0a', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/36220a1c5c2d6954f4873a552544cc0e55b61d0a', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/36220a1c5c2d6954f4873a552544cc0e55b61d0a'}]\n", + "6 [{'sha': '2d38d3313229fdc5f8aa00052a2db21b35be3d2d', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/2d38d3313229fdc5f8aa00052a2db21b35be3d2d', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/2d38d3313229fdc5f8aa00052a2db21b35be3d2d'}]\n", + "7 [{'sha': '1ce31fc9e2805034578eff60a269c02176f03252', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/1ce31fc9e2805034578eff60a269c02176f03252', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/1ce31fc9e2805034578eff60a269c02176f03252'}, {'sha': '2d38d3313229fdc5f8aa00052a2db21b35be3d2d', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/2d38d3313229fdc5f8aa00052a2db21b35be3d2d', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/2d38d3313229fdc5f8aa00052a2db21b35be3d2d'}]\n", + "8 [{'sha': '2c631a4b61d529ff1c0635750888f6f6d79c2703', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/2c631a4b61d529ff1c0635750888f6f6d79c2703', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/2c631a4b61d529ff1c0635750888f6f6d79c2703'}]\n", + "9 [{'sha': '12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816', 'html_url': 
'https://github.com/ANHIG/IMGTHLA/commit/12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816'}, {'sha': '2c631a4b61d529ff1c0635750888f6f6d79c2703', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/2c631a4b61d529ff1c0635750888f6f6d79c2703', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/2c631a4b61d529ff1c0635750888f6f6d79c2703'}]\n", + "10 [{'sha': '12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816'}]\n", + "11 [{'sha': 'a9680a9e80e2c119d4aa41f90e5f40cef6e7df02', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/a9680a9e80e2c119d4aa41f90e5f40cef6e7df02', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/a9680a9e80e2c119d4aa41f90e5f40cef6e7df02'}, {'sha': '8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/8f80f24d49797595d8a18b8d4d1f59846fbf3fe1'}]\n", + "12 [{'sha': '1fd937e6c0ff8452f64152aec9632f0586f981d1', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/1fd937e6c0ff8452f64152aec9632f0586f981d1', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/1fd937e6c0ff8452f64152aec9632f0586f981d1'}]\n", + "13 [{'sha': '50e92c677f4cd547c32bb9305269a762a9595074', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/50e92c677f4cd547c32bb9305269a762a9595074', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/50e92c677f4cd547c32bb9305269a762a9595074'}, {'sha': '1fd937e6c0ff8452f64152aec9632f0586f981d1', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/1fd937e6c0ff8452f64152aec9632f0586f981d1', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/1fd937e6c0ff8452f64152aec9632f0586f981d1'}]\n", + "14 [{'sha': '8732e7d4739a911e01f69d2c5bda82ca15ca68a9', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/8732e7d4739a911e01f69d2c5bda82ca15ca68a9', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/8732e7d4739a911e01f69d2c5bda82ca15ca68a9'}]\n", + "15 [{'sha': '536e8833f3be4bbcffbfba43cd0b3043c5bf4068', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/536e8833f3be4bbcffbfba43cd0b3043c5bf4068', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/536e8833f3be4bbcffbfba43cd0b3043c5bf4068'}, {'sha': '8732e7d4739a911e01f69d2c5bda82ca15ca68a9', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/8732e7d4739a911e01f69d2c5bda82ca15ca68a9', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/8732e7d4739a911e01f69d2c5bda82ca15ca68a9'}]\n", + "16 [{'sha': 'c90cc62006b35061c8db4b6f8da1b86de7650b23', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/c90cc62006b35061c8db4b6f8da1b86de7650b23', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/c90cc62006b35061c8db4b6f8da1b86de7650b23'}]\n", + "17 [{'sha': 'd1dd2ac5e2b6a8abef9e33ed397807ece52a22fe', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/d1dd2ac5e2b6a8abef9e33ed397807ece52a22fe', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/d1dd2ac5e2b6a8abef9e33ed397807ece52a22fe'}, {'sha': 'c90cc62006b35061c8db4b6f8da1b86de7650b23', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/c90cc62006b35061c8db4b6f8da1b86de7650b23', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/c90cc62006b35061c8db4b6f8da1b86de7650b23'}]\n", + "18 [{'sha': '1a3be9a5d01a414854ff3bfacd5257c14adeefa2', 'url': 
'https://api.github.com/repos/ANHIG/IMGTHLA/commits/1a3be9a5d01a414854ff3bfacd5257c14adeefa2', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/1a3be9a5d01a414854ff3bfacd5257c14adeefa2'}, {'sha': 'd1dd2ac5e2b6a8abef9e33ed397807ece52a22fe', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/d1dd2ac5e2b6a8abef9e33ed397807ece52a22fe', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/d1dd2ac5e2b6a8abef9e33ed397807ece52a22fe'}]\n", + "19 [{'sha': '4863b8f0a070d70836dfbdc00bdf70aa7bb66345', 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/4863b8f0a070d70836dfbdc00bdf70aa7bb66345', 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/4863b8f0a070d70836dfbdc00bdf70aa7bb66345'}]\n", + "Name: parents, dtype: object" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commit_refs_df['parents']" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 100 entries, 0 to 99\n", + "Data columns (total 3 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 sha 100 non-null object \n", + " 1 commit.message 100 non-null object \n", + " 2 date 100 non-null datetime64[ns, UTC]\n", + "dtypes: datetime64[ns, UTC](1), object(2)\n", + "memory usage: 2.5+ KB\n" + ] + } + ], + "source": [ + "commits_df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 20 entries, 0 to 19\n", + "Data columns (total 3 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 sha 20 non-null object\n", + " 1 files 20 non-null object\n", + " 2 parents 20 non-null object\n", + "dtypes: object(3)\n", + "memory usage: 608.0+ bytes\n" + ] + } + ], + "source": [ + "commit_refs_df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [], + "source": [ + "# join commit dfs on sha\n", + "commits_joined_df = commits_df.merge(commit_refs_df, on='sha', how='inner')" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['sha', 'commit.message', 'date', 'files', 'parents'], dtype='object')" + ] + }, + "execution_count": 145, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commits_joined_df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": {}, + "outputs": [], + "source": [ + "commits_joined_df.loc[:10, ['sha', 'commit.message', 'date', 'parents', 'files']].to_csv(\"commits.csv\", index=False, header=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'sha': 'e941759874365cb152a3562c22d10847d10db326',\n", + " 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/e941759874365cb152a3562c22d10847d10db326',\n", + " 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/e941759874365cb152a3562c22d10847d10db326'},\n", + " {'sha': '36220a1c5c2d6954f4873a552544cc0e55b61d0a',\n", + " 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/36220a1c5c2d6954f4873a552544cc0e55b61d0a',\n", + " 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/36220a1c5c2d6954f4873a552544cc0e55b61d0a'}]" + ] + }, + "execution_count": 152, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "commits_joined_df['parents'].iloc[5]" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "# # # group commits by day\n", + "# # commit_refs_df = pd.DataFrame(commit_refs)\n", + "# # commit_refs_df[\"date\"] = pd.to_datetime(commit_refs_df[\"commit\"].apply(lambda x: x[\"author\"][\"date\"]))\n", + "\n", + "# # # extract message field\n", + "# # commit_refs_df[\"message\"] = commit_refs_df[\"commit\"].apply(lambda x: x[\"message\"])\n", + "\n", + "# # convert commit.author.date to datetime and sort by date descending\n", + "# commit_refs_df[\"date\"] = pd.to_datetime(commit_refs_df[\"commit.author.date\"])\n", + "\n", + "# # extract filename from files field\n", + "# commit_refs_df[\"files\"] = commit_refs_df[\"files\"].apply(lambda x: [file[\"filename\"] for file in x])\n", + "\n", + "# # select columns\n", + "# cols = [\n", + "# \"sha\",\n", + "# \"date\",\n", + "# \"files\",\n", + "# ]\n", + "\n", + "# commit_refs_df = commit_refs_df[cols].explode(\"files\").dropna().sort_values(\"date\", ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [], + "source": [ + "# commit_refs_df.groupby(pd.Grouper(key=\"date\", freq=\"D\"))[[\"sha\", \"files\"]].count()" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "# flatten_json(commit_refs_df.loc[0, \"commit\"])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## GitHub Graph Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import networkx as nx" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 7 entries, 0 to 6\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 sha 7 non-null object \n", + " 1 commit.message 7 non-null object \n", + " 2 date 7 non-null datetime64[ns, UTC]\n", + " 3 files 7 non-null object \n", + " 4 parents 7 non-null object \n", + "dtypes: datetime64[ns, UTC](1), object(4)\n", + "memory usage: 408.0+ bytes\n" + ] + } + ], + "source": [ + "commits_joined_df.iloc[:7, :].info()" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": {}, + "outputs": [], + "source": [ + "# commits_joined_df.loc[:7, ['sha', 'commit', 'parents', 'files']].to_csv(\"commits_joined.csv\", index=False, header=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 188, + "metadata": {}, + "outputs": [], + "source": [ + "commits_exp_df = commits_joined_df.explode(\"files\").explode(\"parents\")\n", + "commits_exp_df.columns = ['sha', 'commit.message', 'date', 'file', 'parent']" + ] + }, + { + "cell_type": "code", + "execution_count": 189, + "metadata": {}, + "outputs": [], + "source": [ + "# unnest the fields in the file column as their own columns with prefix\n", + "commits_exp_df = pd.json_normalize(commits_exp_df['file'], sep='.')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 190, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
shafilenamestatusadditionsdeletionschangesblob_urlraw_urlcontents_urlprevious_filenamepatch
5df1a5591eff92be3a94c065884d8264d308513c6ihiw/hml/3.51.0_ReferenceSequences.fastarenamed0.00.00.0https://github.com/ANHIG/IMGTHLA/blob/4b8432c7d56121c84d6ef1d75a1c7185c628c13d/ihiw%2Fhml%2F3.51.0_ReferenceSequences.fastahttps://github.com/ANHIG/IMGTHLA/raw/4b8432c7d56121c84d6ef1d75a1c7185c628c13d/ihiw%2Fhml%2F3.51.0_ReferenceSequences.fastahttps://api.github.com/repos/ANHIG/IMGTHLA/contents/ihiw%2Fhml%2F3.51.0_ReferenceSequences.fasta?ref=4b8432c7d56121c84d6ef1d75a1c7185c628c13dihiw/hml/3.50.0_ReferenceSequences.fastaNaN
6e23440821dc326f42b5444dbb1eac323650fae1eihiw/hml/3.51.0_Reference_Alleles.txtrenamed3.03.06.0https://github.com/ANHIG/IMGTHLA/blob/4b8432c7d56121c84d6ef1d75a1c7185c628c13d/ihiw%2Fhml%2F3.51.0_Reference_Alleles.txthttps://github.com/ANHIG/IMGTHLA/raw/4b8432c7d56121c84d6ef1d75a1c7185c628c13d/ihiw%2Fhml%2F3.51.0_Reference_Alleles.txthttps://api.github.com/repos/ANHIG/IMGTHLA/contents/ihiw%2Fhml%2F3.51.0_Reference_Alleles.txt?ref=4b8432c7d56121c84d6ef1d75a1c7185c628c13dihiw/hml/3.50.0_Reference_Alleles.txt@@ -1,8 +1,8 @@\\n-# filename: 3.50.0_Reference_Alleles.txt\\n-# date: 2022-10-12\\n+# filename: 3.51.0_Reference_Alleles.txt\\n+# date: 2023-01-12\\n # version: 1.0\\n # author: Ben Matern <B.M.Matern@umcutrecht.nl>\\n-IPD-IMGT/HLA Database 3.50.0 Accession Number\\tLocus\\tIPD-IMGT/HLA Database 3.50.0 Allele Name\\tDescription\\n+IPD-IMGT/HLA Database 3.51.0 Accession Number\\tLocus\\tIPD-IMGT/HLA Database 3.51.0 Allele Name\\tDescription\\n HLA00001\\tHLA-A\\tHLA-A*01:01:01:01\\tHLA-A Locus Reference;A*01 Reference\\n HLA00005\\tHLA-A\\tHLA-A*02:01:01:01\\tA*02 Reference\\n HLA00037\\tHLA-A\\tHLA-A*03:01:01:01\\tA*03 Reference
7NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
809190c395f26163a25fa85d0666b35f9f8846f49Allelelist_history.txtmodified2.02.04.0https://github.com/ANHIG/IMGTHLA/blob/50b790037030d958b662085c3f4cf34ba72a32ec/Allelelist_history.txthttps://github.com/ANHIG/IMGTHLA/raw/50b790037030d958b662085c3f4cf34ba72a32ec/Allelelist_history.txthttps://api.github.com/repos/ANHIG/IMGTHLA/contents/Allelelist_history.txt?ref=50b790037030d958b662085c3f4cf34ba72a32ecNaN@@ -1,6 +1,6 @@\\n # file: Allelelist_history.txt\\n-# date: \\n-# version: IPD-IMGT/HLA \\n+# date: 2022-10-12\\n+# version: IPD-IMGT/HLA 3.50.0\\n # origin: https://github.com/ANHIG/IMGTHLA/Allelelist_history.txt\\n # repository: https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/Allelelist_history.txt\\n # author: WHO, Steven G. E. Marsh (steven.marsh@ucl.ac.uk)
909190c395f26163a25fa85d0666b35f9f8846f49Allelelist_history.txtmodified2.02.04.0https://github.com/ANHIG/IMGTHLA/blob/50b790037030d958b662085c3f4cf34ba72a32ec/Allelelist_history.txthttps://github.com/ANHIG/IMGTHLA/raw/50b790037030d958b662085c3f4cf34ba72a32ec/Allelelist_history.txthttps://api.github.com/repos/ANHIG/IMGTHLA/contents/Allelelist_history.txt?ref=50b790037030d958b662085c3f4cf34ba72a32ecNaN@@ -1,6 +1,6 @@\\n # file: Allelelist_history.txt\\n-# date: \\n-# version: IPD-IMGT/HLA \\n+# date: 2022-10-12\\n+# version: IPD-IMGT/HLA 3.50.0\\n # origin: https://github.com/ANHIG/IMGTHLA/Allelelist_history.txt\\n # repository: https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/Allelelist_history.txt\\n # author: WHO, Steven G. E. Marsh (steven.marsh@ucl.ac.uk)
\n", + "
" + ], + "text/plain": [ + " sha \n", + "5 df1a5591eff92be3a94c065884d8264d308513c6 \\\n", + "6 e23440821dc326f42b5444dbb1eac323650fae1e \n", + "7 NaN \n", + "8 09190c395f26163a25fa85d0666b35f9f8846f49 \n", + "9 09190c395f26163a25fa85d0666b35f9f8846f49 \n", + "\n", + " filename status additions deletions \n", + "5 ihiw/hml/3.51.0_ReferenceSequences.fasta renamed 0.0 0.0 \\\n", + "6 ihiw/hml/3.51.0_Reference_Alleles.txt renamed 3.0 3.0 \n", + "7 NaN NaN NaN NaN \n", + "8 Allelelist_history.txt modified 2.0 2.0 \n", + "9 Allelelist_history.txt modified 2.0 2.0 \n", + "\n", + " changes \n", + "5 0.0 \\\n", + "6 6.0 \n", + "7 NaN \n", + "8 4.0 \n", + "9 4.0 \n", + "\n", + " blob_url \n", + "5 https://github.com/ANHIG/IMGTHLA/blob/4b8432c7d56121c84d6ef1d75a1c7185c628c13d/ihiw%2Fhml%2F3.51.0_ReferenceSequences.fasta \\\n", + "6 https://github.com/ANHIG/IMGTHLA/blob/4b8432c7d56121c84d6ef1d75a1c7185c628c13d/ihiw%2Fhml%2F3.51.0_Reference_Alleles.txt \n", + "7 NaN \n", + "8 https://github.com/ANHIG/IMGTHLA/blob/50b790037030d958b662085c3f4cf34ba72a32ec/Allelelist_history.txt \n", + "9 https://github.com/ANHIG/IMGTHLA/blob/50b790037030d958b662085c3f4cf34ba72a32ec/Allelelist_history.txt \n", + "\n", + " raw_url \n", + "5 https://github.com/ANHIG/IMGTHLA/raw/4b8432c7d56121c84d6ef1d75a1c7185c628c13d/ihiw%2Fhml%2F3.51.0_ReferenceSequences.fasta \\\n", + "6 https://github.com/ANHIG/IMGTHLA/raw/4b8432c7d56121c84d6ef1d75a1c7185c628c13d/ihiw%2Fhml%2F3.51.0_Reference_Alleles.txt \n", + "7 NaN \n", + "8 https://github.com/ANHIG/IMGTHLA/raw/50b790037030d958b662085c3f4cf34ba72a32ec/Allelelist_history.txt \n", + "9 https://github.com/ANHIG/IMGTHLA/raw/50b790037030d958b662085c3f4cf34ba72a32ec/Allelelist_history.txt \n", + "\n", + " contents_url \n", + "5 https://api.github.com/repos/ANHIG/IMGTHLA/contents/ihiw%2Fhml%2F3.51.0_ReferenceSequences.fasta?ref=4b8432c7d56121c84d6ef1d75a1c7185c628c13d \\\n", + "6 https://api.github.com/repos/ANHIG/IMGTHLA/contents/ihiw%2Fhml%2F3.51.0_Reference_Alleles.txt?ref=4b8432c7d56121c84d6ef1d75a1c7185c628c13d \n", + "7 NaN \n", + "8 https://api.github.com/repos/ANHIG/IMGTHLA/contents/Allelelist_history.txt?ref=50b790037030d958b662085c3f4cf34ba72a32ec \n", + "9 https://api.github.com/repos/ANHIG/IMGTHLA/contents/Allelelist_history.txt?ref=50b790037030d958b662085c3f4cf34ba72a32ec \n", + "\n", + " previous_filename \n", + "5 ihiw/hml/3.50.0_ReferenceSequences.fasta \\\n", + "6 ihiw/hml/3.50.0_Reference_Alleles.txt \n", + "7 NaN \n", + "8 NaN \n", + "9 NaN \n", + "\n", + " patch \n", + "5 NaN \n", + "6 @@ -1,8 +1,8 @@\\n-# filename: 3.50.0_Reference_Alleles.txt\\n-# date: 2022-10-12\\n+# filename: 3.51.0_Reference_Alleles.txt\\n+# date: 2023-01-12\\n # version: 1.0\\n # author: Ben Matern \\n-IPD-IMGT/HLA Database 3.50.0 Accession Number\\tLocus\\tIPD-IMGT/HLA Database 3.50.0 Allele Name\\tDescription\\n+IPD-IMGT/HLA Database 3.51.0 Accession Number\\tLocus\\tIPD-IMGT/HLA Database 3.51.0 Allele Name\\tDescription\\n HLA00001\\tHLA-A\\tHLA-A*01:01:01:01\\tHLA-A Locus Reference;A*01 Reference\\n HLA00005\\tHLA-A\\tHLA-A*02:01:01:01\\tA*02 Reference\\n HLA00037\\tHLA-A\\tHLA-A*03:01:01:01\\tA*03 Reference \n", + "7 NaN \n", + "8 @@ -1,6 +1,6 @@\\n # file: Allelelist_history.txt\\n-# date: \\n-# version: IPD-IMGT/HLA \\n+# date: 2022-10-12\\n+# version: IPD-IMGT/HLA 3.50.0\\n # origin: https://github.com/ANHIG/IMGTHLA/Allelelist_history.txt\\n # repository: https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/Allelelist_history.txt\\n # author: WHO, Steven G. E. 
Marsh (steven.marsh@ucl.ac.uk) \n", + "9 @@ -1,6 +1,6 @@\\n # file: Allelelist_history.txt\\n-# date: \\n-# version: IPD-IMGT/HLA \\n+# date: 2022-10-12\\n+# version: IPD-IMGT/HLA 3.50.0\\n # origin: https://github.com/ANHIG/IMGTHLA/Allelelist_history.txt\\n # repository: https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/Allelelist_history.txt\\n # author: WHO, Steven G. E. Marsh (steven.marsh@ucl.ac.uk) " + ] + }, + "execution_count": 190, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commits_exp_df.iloc[5:, :].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['sha',\n", + " 'commit.message',\n", + " 'date',\n", + " 'file',\n", + " 'parent',\n", + " 0,\n", + " 'sha',\n", + " 'filename',\n", + " 'status',\n", + " 'additions',\n", + " 'deletions',\n", + " 'changes',\n", + " 'blob_url',\n", + " 'raw_url',\n", + " 'contents_url',\n", + " 'previous_filename',\n", + " 'patch']" + ] + }, + "execution_count": 184, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(commits_exp_df.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "G = nx.from_pandas_edgelist(commits_exp_df, source='sha', target='file', edge_attr=[])" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'graph': {},\n", + " '_node': {'38398a75e9762ff070d8e9bd714d074332646cd7': {'label': 'Merge pull request #334 from ANHIG/3520\\n\\nIPD-IMGT/HLA Release 3.52.0'},\n", + " 'def376dc6955b339b17f0a4b840e80eb6b9c744b': {'label': 'IPD-IMGT/HLA Release 3.52.0'},\n", + " '5f2c562056f8ffa89aeea0631f2a52300ee0de17': {'label': 'Merge pull request #325 from ANHIG/3510\\n\\n3510'},\n", + " '4b8432c7d56121c84d6ef1d75a1c7185c628c13d': {'label': 'IPD-IMGT/HLA Database Release 3.51.0\\n\\nUpdated the ihiw folder for 3.51.0'},\n", + " '4486f5c623705c6a14d9eeaba7d155cff30cdb43': {'label': 'IPD-IMGT/HLA Release 3.51.0\\n\\nUpdates for 3.51.0'},\n", + " '50b790037030d958b662085c3f4cf34ba72a32ec': {'label': 'Merge pull request #323 from ANHIG/3500\\n\\nCorrected missing date and version field in Allelelist_history.txt'},\n", + " '36220a1c5c2d6954f4873a552544cc0e55b61d0a': {'label': 'Corrected missing date and version field in Allelelist_history.txt'},\n", + " 'e941759874365cb152a3562c22d10847d10db326': {'label': 'Merge pull request #318 from ANHIG/3500\\n\\nhla.dat file for IPD-IMGT/HLA Release 3.50.0'},\n", + " '2d38d3313229fdc5f8aa00052a2db21b35be3d2d': {'label': 'hla.dat file for IPD-IMGT/HLA Release 3.50.0'},\n", + " '1ce31fc9e2805034578eff60a269c02176f03252': {'label': 'Merge pull request #317 from ANHIG/3500\\n\\nIPD-IMGT/HLA Release 3.50.0'},\n", + " '2c631a4b61d529ff1c0635750888f6f6d79c2703': {'label': 'IPD-IMGT/HLA Release 3.50.0'},\n", + " '12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816': {'label': 'Merge pull request #315 from ANHIG/3490\\n\\nRemoval of duplicate \"KW Human MHC;\" in hla.dat; Removal of empty 3…'},\n", + " '8f80f24d49797595d8a18b8d4d1f59846fbf3fe1': {'label': 'Removal of duplicate \"KW Human MHC;\" in hla.dat; Removal of empty 3\\'UTR in DQB1*03:480Q'},\n", + " 'a9680a9e80e2c119d4aa41f90e5f40cef6e7df02': {'label': 'Merge pull request #314 from ANHIG/3490\\n\\nCorrected DRA*01:01:01:01 P group (DRA*01:01P)'},\n", + " '1fd937e6c0ff8452f64152aec9632f0586f981d1': {'label': 'Corrected DRA*01:01:01:01 P group (DRA*01:01P)'},\n", + 
" '50e92c677f4cd547c32bb9305269a762a9595074': {'label': 'Merge pull request #310 from ANHIG/3490\\n\\nAdded additional space in DRB_prot.txt, DRB_nuc.txt and DRB1_gen.txt …'},\n", + " '8732e7d4739a911e01f69d2c5bda82ca15ca68a9': {'label': 'Added additional space in DRB_prot.txt, DRB_nuc.txt and DRB1_gen.txt due to DRB1*15:200:01:01N and DRB1*15:200:01:02N'},\n", + " '536e8833f3be4bbcffbfba43cd0b3043c5bf4068': {'label': 'Merge pull request #308 from ANHIG/3490\\n\\n3490'},\n", + " 'c90cc62006b35061c8db4b6f8da1b86de7650b23': {'label': \"Merge branch 'Latest' into 3490\"},\n", + " '1a3be9a5d01a414854ff3bfacd5257c14adeefa2': {'label': 'IPD-IMGT/HLA Release 3.49.0'}},\n", + " '_adj': {'38398a75e9762ff070d8e9bd714d074332646cd7': {},\n", + " 'def376dc6955b339b17f0a4b840e80eb6b9c744b': {},\n", + " '5f2c562056f8ffa89aeea0631f2a52300ee0de17': {},\n", + " '4b8432c7d56121c84d6ef1d75a1c7185c628c13d': {},\n", + " '4486f5c623705c6a14d9eeaba7d155cff30cdb43': {},\n", + " '50b790037030d958b662085c3f4cf34ba72a32ec': {},\n", + " '36220a1c5c2d6954f4873a552544cc0e55b61d0a': {},\n", + " 'e941759874365cb152a3562c22d10847d10db326': {},\n", + " '2d38d3313229fdc5f8aa00052a2db21b35be3d2d': {},\n", + " '1ce31fc9e2805034578eff60a269c02176f03252': {},\n", + " '2c631a4b61d529ff1c0635750888f6f6d79c2703': {},\n", + " '12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816': {},\n", + " '8f80f24d49797595d8a18b8d4d1f59846fbf3fe1': {},\n", + " 'a9680a9e80e2c119d4aa41f90e5f40cef6e7df02': {},\n", + " '1fd937e6c0ff8452f64152aec9632f0586f981d1': {},\n", + " '50e92c677f4cd547c32bb9305269a762a9595074': {},\n", + " '8732e7d4739a911e01f69d2c5bda82ca15ca68a9': {},\n", + " '536e8833f3be4bbcffbfba43cd0b3043c5bf4068': {},\n", + " 'c90cc62006b35061c8db4b6f8da1b86de7650b23': {},\n", + " '1a3be9a5d01a414854ff3bfacd5257c14adeefa2': {}},\n", + " '_succ': {'38398a75e9762ff070d8e9bd714d074332646cd7': {},\n", + " 'def376dc6955b339b17f0a4b840e80eb6b9c744b': {},\n", + " '5f2c562056f8ffa89aeea0631f2a52300ee0de17': {},\n", + " '4b8432c7d56121c84d6ef1d75a1c7185c628c13d': {},\n", + " '4486f5c623705c6a14d9eeaba7d155cff30cdb43': {},\n", + " '50b790037030d958b662085c3f4cf34ba72a32ec': {},\n", + " '36220a1c5c2d6954f4873a552544cc0e55b61d0a': {},\n", + " 'e941759874365cb152a3562c22d10847d10db326': {},\n", + " '2d38d3313229fdc5f8aa00052a2db21b35be3d2d': {},\n", + " '1ce31fc9e2805034578eff60a269c02176f03252': {},\n", + " '2c631a4b61d529ff1c0635750888f6f6d79c2703': {},\n", + " '12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816': {},\n", + " '8f80f24d49797595d8a18b8d4d1f59846fbf3fe1': {},\n", + " 'a9680a9e80e2c119d4aa41f90e5f40cef6e7df02': {},\n", + " '1fd937e6c0ff8452f64152aec9632f0586f981d1': {},\n", + " '50e92c677f4cd547c32bb9305269a762a9595074': {},\n", + " '8732e7d4739a911e01f69d2c5bda82ca15ca68a9': {},\n", + " '536e8833f3be4bbcffbfba43cd0b3043c5bf4068': {},\n", + " 'c90cc62006b35061c8db4b6f8da1b86de7650b23': {},\n", + " '1a3be9a5d01a414854ff3bfacd5257c14adeefa2': {}},\n", + " '_pred': {'38398a75e9762ff070d8e9bd714d074332646cd7': {},\n", + " 'def376dc6955b339b17f0a4b840e80eb6b9c744b': {},\n", + " '5f2c562056f8ffa89aeea0631f2a52300ee0de17': {},\n", + " '4b8432c7d56121c84d6ef1d75a1c7185c628c13d': {},\n", + " '4486f5c623705c6a14d9eeaba7d155cff30cdb43': {},\n", + " '50b790037030d958b662085c3f4cf34ba72a32ec': {},\n", + " '36220a1c5c2d6954f4873a552544cc0e55b61d0a': {},\n", + " 'e941759874365cb152a3562c22d10847d10db326': {},\n", + " '2d38d3313229fdc5f8aa00052a2db21b35be3d2d': {},\n", + " '1ce31fc9e2805034578eff60a269c02176f03252': {},\n", + " '2c631a4b61d529ff1c0635750888f6f6d79c2703': 
{},\n", + " '12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816': {},\n", + " '8f80f24d49797595d8a18b8d4d1f59846fbf3fe1': {},\n", + " 'a9680a9e80e2c119d4aa41f90e5f40cef6e7df02': {},\n", + " '1fd937e6c0ff8452f64152aec9632f0586f981d1': {},\n", + " '50e92c677f4cd547c32bb9305269a762a9595074': {},\n", + " '8732e7d4739a911e01f69d2c5bda82ca15ca68a9': {},\n", + " '536e8833f3be4bbcffbfba43cd0b3043c5bf4068': {},\n", + " 'c90cc62006b35061c8db4b6f8da1b86de7650b23': {},\n", + " '1a3be9a5d01a414854ff3bfacd5257c14adeefa2': {}}}" + ] + }, + "execution_count": 158, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vars(G)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/requirements.txt b/notebooks/requirements.txt new file mode 100644 index 00000000..a94cf69e --- /dev/null +++ b/notebooks/requirements.txt @@ -0,0 +1,2 @@ +requests +pandas \ No newline at end of file diff --git a/notebooks/track_releases/0.0-track-releases-by-commit.ipynb b/notebooks/track_releases/0.0-track-releases-by-commit.ipynb new file mode 100644 index 00000000..4a6bb51c --- /dev/null +++ b/notebooks/track_releases/0.0-track-releases-by-commit.ipynb @@ -0,0 +1,1443 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Release Tracking\n", + "Track releases by commit.\n", + "\n", + "## Sequence\n", + "- Create source config file if it is not available ==> CreateSourceConfig\n", + " - Get all commits and store shas, dates, urls\n", + " - Enrich commits with release version\n", + "- GetNewCommits\n", + " - Get all commits since last release\n", + " - Enrich commits with release version\n", + " - Update source config\n", + "- FindUpdatedReleases\n", + " - Parse new commits where at least one of the tracked files has been changed\n", + " - output the release version and the commit sha\n", + "- Update the source config state with the new commits\n", + "\n", + "### Notes\n", + "- All logic should rely on the source config file which is updated using the GitHub API in GetNewCommits" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "ImportError", + "evalue": "cannot import name 'NoSuchKey' from 'botocore.exceptions' (/Users/ammon/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/.venv-dev/lib/python3.9/site-packages/botocore/exceptions.py)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 9\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mre\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mjson\u001b[39;00m\n\u001b[0;32m----> 9\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mbotocore\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mexceptions\u001b[39;00m \u001b[39mimport\u001b[39;00m NoSuchKey\n\u001b[1;32m 10\u001b[0m 
\u001b[39mfrom\u001b[39;00m \u001b[39msrc\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mutils\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtypes\u001b[39;00m \u001b[39mimport\u001b[39;00m (\n\u001b[1;32m 11\u001b[0m ExecutionHistoryItem\n\u001b[1;32m 12\u001b[0m )\n\u001b[1;32m 13\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msrc\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mutils\u001b[39;00m \u001b[39mimport\u001b[39;00m (\n\u001b[1;32m 14\u001b[0m read_source_config,\n\u001b[1;32m 15\u001b[0m get_commits_for_asset,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 18\u001b[0m flatten_json,\n\u001b[1;32m 19\u001b[0m )\n", + "\u001b[0;31mImportError\u001b[0m: cannot import name 'NoSuchKey' from 'botocore.exceptions' (/Users/ammon/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/.venv-dev/lib/python3.9/site-packages/botocore/exceptions.py)" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "sys.path.append('/Users/ammon/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/gfe-db/pipeline/functions/get_repo_updates/')\n", + "import logging\n", + "from datetime import datetime\n", + "utc_now = datetime.utcnow().strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n", + "import re\n", + "import json\n", + "from botocore.exceptions import ClientError\n", + "from src.utils.types import (\n", + " ExecutionHistoryItem\n", + ")\n", + "from src.utils import (\n", + " read_source_config,\n", + " get_commits_for_asset,\n", + " get_repo_asset,\n", + " get_commits,\n", + " flatten_json,\n", + ")\n", + "\n", + "# logging\n", + "logging.basicConfig(level=logging.INFO)\n", + "logger = logging.getLogger(__name__)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "GITHUB_REPOSITORY_OWNER = \"ANHIG\" # os.environ[\"GITHUB_REPOSITORY_OWNER\"]\n", + "GITHUB_REPOSITORY_NAME = \"IMGTHLA\" # os.environ[\"GITHUB_REPOSITORY_NAME\"]\n", + "AWS_REGION = os.environ[\"AWS_REGION\"]\n", + "DATA_BUCKET_NAME = os.environ[\"DATA_BUCKET_NAME\"]\n", + "PIPELINE_CONFIG_S3_PATH = os.environ[\"PIPELINE_CONFIG_S3_PATH\"]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### App State" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ERROR:src.utils.utils:Failed to read config file to s3://dev-gfe-db-531868584498-us-east-1/config/pipeline/source-config.json\n" + ] + }, + { + "ename": "NameError", + "evalue": "name 'ClientError' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNoSuchKey\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[5], line 3\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m----> 3\u001b[0m source_config \u001b[39m=\u001b[39m read_source_config(DATA_BUCKET_NAME, PIPELINE_CONFIG_S3_PATH)\n\u001b[1;32m 4\u001b[0m \u001b[39mexcept\u001b[39;00m ClientError \u001b[39mas\u001b[39;00m e:\n", + "File \u001b[0;32m~/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/gfe-db/pipeline/functions/get_repo_updates/src/utils/utils.py:83\u001b[0m, in \u001b[0;36mread_source_config\u001b[0;34m(bucket, key)\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mread_source_config\u001b[39m(bucket, key):\n\u001b[0;32m---> 83\u001b[0m data \u001b[39m=\u001b[39m read_s3_json(bucket, key)\n\u001b[1;32m 84\u001b[0m 
\u001b[39mreturn\u001b[39;00m SourceConfig(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdata)\n", + "File \u001b[0;32m~/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/gfe-db/pipeline/functions/get_repo_updates/src/utils/utils.py:64\u001b[0m, in \u001b[0;36mread_s3_json\u001b[0;34m(bucket, key)\u001b[0m\n\u001b[1;32m 63\u001b[0m logger\u001b[39m.\u001b[39merror(\u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39mFailed to read config file to s3://\u001b[39m\u001b[39m{\u001b[39;00mbucket\u001b[39m}\u001b[39;00m\u001b[39m/\u001b[39m\u001b[39m{\u001b[39;00mkey\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m)\n\u001b[0;32m---> 64\u001b[0m \u001b[39mraise\u001b[39;00m err\n", + "File \u001b[0;32m~/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/gfe-db/pipeline/functions/get_repo_updates/src/utils/utils.py:57\u001b[0m, in \u001b[0;36mread_s3_json\u001b[0;34m(bucket, key)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m---> 57\u001b[0m response \u001b[39m=\u001b[39m s3\u001b[39m.\u001b[39;49mget_object(\n\u001b[1;32m 58\u001b[0m Bucket\u001b[39m=\u001b[39;49mbucket, \n\u001b[1;32m 59\u001b[0m Key\u001b[39m=\u001b[39;49mkey)\n\u001b[1;32m 60\u001b[0m \u001b[39mreturn\u001b[39;00m json\u001b[39m.\u001b[39mloads(response[\u001b[39m\"\u001b[39m\u001b[39mBody\u001b[39m\u001b[39m\"\u001b[39m]\u001b[39m.\u001b[39mread()\u001b[39m.\u001b[39mdecode())\n", + "File \u001b[0;32m~/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/.venv-dev/lib/python3.9/site-packages/botocore/client.py:357\u001b[0m, in \u001b[0;36mClientCreator._create_api_method.._api_call\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 356\u001b[0m \u001b[39m# The \"self\" in this scope is referring to the BaseClient.\u001b[39;00m\n\u001b[0;32m--> 357\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_api_call(operation_name, kwargs)\n", + "File \u001b[0;32m~/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/.venv-dev/lib/python3.9/site-packages/botocore/client.py:676\u001b[0m, in \u001b[0;36mBaseClient._make_api_call\u001b[0;34m(self, operation_name, api_params)\u001b[0m\n\u001b[1;32m 675\u001b[0m error_class \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mexceptions\u001b[39m.\u001b[39mfrom_code(error_code)\n\u001b[0;32m--> 676\u001b[0m \u001b[39mraise\u001b[39;00m error_class(parsed_response, operation_name)\n\u001b[1;32m 677\u001b[0m \u001b[39melse\u001b[39;00m:\n", + "\u001b[0;31mNoSuchKey\u001b[0m: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[5], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m 3\u001b[0m source_config \u001b[39m=\u001b[39m read_source_config(DATA_BUCKET_NAME, PIPELINE_CONFIG_S3_PATH)\n\u001b[0;32m----> 4\u001b[0m \u001b[39mexcept\u001b[39;00m ClientError \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 5\u001b[0m logger\u001b[39m.\u001b[39merror(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mCould not read source config file from S3: \u001b[39m\u001b[39m{\u001b[39;00me\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 6\u001b[0m \u001b[39mraise\u001b[39;00m e\n", + "\u001b[0;31mNameError\u001b[0m: name 'ClientError' is not defined" + ] + } + ], + "source": [ + "# source config file in S3 must be up to date\n", + "try:\n", + " source_config = 
read_source_config(DATA_BUCKET_NAME, PIPELINE_CONFIG_S3_PATH)\n", + "except ClientError as e:\n", + " logger.error(f\"Could not read source config file from S3: {e}\")\n", + " raise e" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Asset Processing" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RepositoryConfig(owner='ANHIG', name='IMGTHLA', url='https://github.com/ANHIG/IMGTHLA', tracked_assets=['hla.dat', 'msf/'], default_input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), execution_history=[ExecutionHistoryItem(version=3480, date_utc='2022-04-14T11:00:42Z', commit=CommitDetails(sha='ecd63776c6225af0cf8bcc9fa9c6998d3129fb14', date_utc='2022-04-14T11:00:42Z', url='url'), input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), status='SUCCESS'), ExecutionHistoryItem(version=3470, date_utc='2022-01-13T16:52:15Z', commit=CommitDetails(sha='06ceff14b2db920d458dc337b1100dced992e627', date_utc='2022-01-13T16:52:15Z', url='url'), input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), status='SUCCESS')])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "source_repo = source_config.repositories[GITHUB_REPOSITORY_OWNER + \"/\" + GITHUB_REPOSITORY_NAME]\n", + "source_repo" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ExecutionHistoryItem(version=3520, date_utc='2023-04-24T21:34:36Z', commit=CommitDetails(sha='def376dc6955b339b17f0a4b840e80eb6b9c744b', date_utc='2023-04-17T16:01:01Z', url=''), input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), status='PENDING')" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get most recent item in source_repo.execution_history by source_repo.execution_history[].date_utc \n", + "last_processed_commit = max(source_repo.execution_history, key=lambda x: x.date_utc)\n", + "last_processed_commit" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "asset_commits = []\n", + "for asset in source_repo.tracked_assets:\n", + " commits = get_commits_for_asset(\n", + " owner=GITHUB_REPOSITORY_OWNER,\n", + " repo=GITHUB_REPOSITORY_NAME,\n", + " path=asset,\n", + " since=last_processed_commit.date_utc\n", + " )\n", + "\n", + " if not commits:\n", + " logger.warning(f\"No commits found for asset {asset} with sha {last_processed_commit.commit_sha} since {last_processed_commit.date}\")\n", + " else: \n", + " asset_commits.extend(commits)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'sha': 'def376dc6955b339b17f0a4b840e80eb6b9c744b',\n", + " 'node_id': 'C_kwDOAXZz6NoAKGRlZjM3NmRjNjk1NWIzMzliMTdmMGE0Yjg0MGU4MGViNmI5Yzc0NGI',\n", + " 'commit': {'author': {'name': 'xeniageorgiouAN',\n", + " 'email': 'xenia.georgiou@anthonynolan.org',\n", + " 'date': '2023-04-17T16:01:01Z'},\n", + " 'committer': {'name': 'xeniageorgiouAN',\n", + " 'email': 'xenia.georgiou@anthonynolan.org',\n", + " 'date': '2023-04-17T16:01:01Z'},\n", + " 'message': 'IPD-IMGT/HLA Release 3.52.0',\n", + " 'tree': {'sha': 'e9ffca9666e355b1285a0c6a42951f6a28ea7f90',\n", + " 'url': 
'https://api.github.com/repos/ANHIG/IMGTHLA/git/trees/e9ffca9666e355b1285a0c6a42951f6a28ea7f90'},\n", + " 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/git/commits/def376dc6955b339b17f0a4b840e80eb6b9c744b',\n", + " 'comment_count': 0,\n", + " 'verification': {'verified': False,\n", + " 'reason': 'unsigned',\n", + " 'signature': None,\n", + " 'payload': None}},\n", + " 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/def376dc6955b339b17f0a4b840e80eb6b9c744b',\n", + " 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/def376dc6955b339b17f0a4b840e80eb6b9c744b',\n", + " 'comments_url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/def376dc6955b339b17f0a4b840e80eb6b9c744b/comments',\n", + " 'author': {'login': 'xeniageorgiouAN',\n", + " 'id': 55495460,\n", + " 'node_id': 'MDQ6VXNlcjU1NDk1NDYw',\n", + " 'avatar_url': 'https://avatars.githubusercontent.com/u/55495460?v=4',\n", + " 'gravatar_id': '',\n", + " 'url': 'https://api.github.com/users/xeniageorgiouAN',\n", + " 'html_url': 'https://github.com/xeniageorgiouAN',\n", + " 'followers_url': 'https://api.github.com/users/xeniageorgiouAN/followers',\n", + " 'following_url': 'https://api.github.com/users/xeniageorgiouAN/following{/other_user}',\n", + " 'gists_url': 'https://api.github.com/users/xeniageorgiouAN/gists{/gist_id}',\n", + " 'starred_url': 'https://api.github.com/users/xeniageorgiouAN/starred{/owner}{/repo}',\n", + " 'subscriptions_url': 'https://api.github.com/users/xeniageorgiouAN/subscriptions',\n", + " 'organizations_url': 'https://api.github.com/users/xeniageorgiouAN/orgs',\n", + " 'repos_url': 'https://api.github.com/users/xeniageorgiouAN/repos',\n", + " 'events_url': 'https://api.github.com/users/xeniageorgiouAN/events{/privacy}',\n", + " 'received_events_url': 'https://api.github.com/users/xeniageorgiouAN/received_events',\n", + " 'type': 'User',\n", + " 'site_admin': False},\n", + " 'committer': {'login': 'xeniageorgiouAN',\n", + " 'id': 55495460,\n", + " 'node_id': 'MDQ6VXNlcjU1NDk1NDYw',\n", + " 'avatar_url': 'https://avatars.githubusercontent.com/u/55495460?v=4',\n", + " 'gravatar_id': '',\n", + " 'url': 'https://api.github.com/users/xeniageorgiouAN',\n", + " 'html_url': 'https://github.com/xeniageorgiouAN',\n", + " 'followers_url': 'https://api.github.com/users/xeniageorgiouAN/followers',\n", + " 'following_url': 'https://api.github.com/users/xeniageorgiouAN/following{/other_user}',\n", + " 'gists_url': 'https://api.github.com/users/xeniageorgiouAN/gists{/gist_id}',\n", + " 'starred_url': 'https://api.github.com/users/xeniageorgiouAN/starred{/owner}{/repo}',\n", + " 'subscriptions_url': 'https://api.github.com/users/xeniageorgiouAN/subscriptions',\n", + " 'organizations_url': 'https://api.github.com/users/xeniageorgiouAN/orgs',\n", + " 'repos_url': 'https://api.github.com/users/xeniageorgiouAN/repos',\n", + " 'events_url': 'https://api.github.com/users/xeniageorgiouAN/events{/privacy}',\n", + " 'received_events_url': 'https://api.github.com/users/xeniageorgiouAN/received_events',\n", + " 'type': 'User',\n", + " 'site_admin': False},\n", + " 'parents': [{'sha': '5f2c562056f8ffa89aeea0631f2a52300ee0de17',\n", + " 'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/5f2c562056f8ffa89aeea0631f2a52300ee0de17',\n", + " 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/5f2c562056f8ffa89aeea0631f2a52300ee0de17'}]}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "asset_commits[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, 
+ "metadata": {}, + "outputs": [], + "source": [ + "# Filter out commits before last processed commit\n", + "unique_shas = list(set([(item[\"sha\"], item[\"commit\"][\"author\"][\"date\"]) for item in asset_commits \\\n", + " if datetime.strptime(item[\"commit\"][\"author\"][\"date\"], \"%Y-%m-%dT%H:%M:%SZ\") > datetime.strptime(last_processed_commit.date_utc, \"%Y-%m-%dT%H:%M:%SZ\")]))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('def376dc6955b339b17f0a4b840e80eb6b9c744b', '2023-04-17T16:01:01Z'),\n", + " ('4486f5c623705c6a14d9eeaba7d155cff30cdb43', '2023-01-12T14:36:43Z'),\n", + " ('2d38d3313229fdc5f8aa00052a2db21b35be3d2d', '2022-10-14T08:46:01Z'),\n", + " ('2c631a4b61d529ff1c0635750888f6f6d79c2703', '2022-10-13T12:58:37Z'),\n", + " ('8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', '2022-08-25T15:43:12Z'),\n", + " ('1a3be9a5d01a414854ff3bfacd5257c14adeefa2', '2022-07-14T13:40:17Z'),\n", + " ('72a9e28a52c9629dd63dfad5f215cdc562e2fd7e', '2022-07-14T12:28:41Z')]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unique_shas.sort(key=lambda x: datetime.strptime(x[1], \"%Y-%m-%dT%H:%M:%SZ\"), reverse=True)\n", + "unique_shas" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Getting release version for sha def376dc6955b339b17f0a4b840e80eb6b9c744b and date 2023-04-17T16:01:01Z\n", + "INFO:__main__:Getting release version for sha 4486f5c623705c6a14d9eeaba7d155cff30cdb43 and date 2023-01-12T14:36:43Z\n", + "INFO:__main__:Getting release version for sha 2d38d3313229fdc5f8aa00052a2db21b35be3d2d and date 2022-10-14T08:46:01Z\n", + "INFO:__main__:Getting release version for sha 2c631a4b61d529ff1c0635750888f6f6d79c2703 and date 2022-10-13T12:58:37Z\n", + "INFO:__main__:Getting release version for sha 8f80f24d49797595d8a18b8d4d1f59846fbf3fe1 and date 2022-08-25T15:43:12Z\n", + "INFO:__main__:Getting release version for sha 1a3be9a5d01a414854ff3bfacd5257c14adeefa2 and date 2022-07-14T13:40:17Z\n", + "INFO:__main__:Getting release version for sha 72a9e28a52c9629dd63dfad5f215cdc562e2fd7e and date 2022-07-14T12:28:41Z\n" + ] + } + ], + "source": [ + "# get the releases for each unique commit from Allelelist.txt\n", + "# can produce duplicate release versions if the same release is updated more than once\n", + "# makes the assumption that the release version branch is up to date for that release, since the build process targets the release version branch and not the specific commit sha\n", + "release_version_re = r\"# version: IPD-IMGT/HLA (\\d+\\.\\d+\\.\\d+)\"\n", + "release_versions = []\n", + "release_versions_dicts = []\n", + "for sha, date in unique_shas:\n", + " logger.info(f\"Getting release version for sha {sha} and date {date}\")\n", + " allele_list = get_repo_asset(\n", + " GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME, \"Allelelist.txt\", sha\n", + " )\n", + " release_version = int(\n", + " re.search(release_version_re, allele_list).group(1).replace(\".\", \"\")\n", + " )\n", + "\n", + " # TODO use a dict instead of tuple and append directly to source_config\n", + " release_versions.append((release_version, sha, date))\n", + " release_versions_dicts.append({\n", + " release_version, \n", + " sha, \n", + " date\n", + " })\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + 
"name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Release versions by sha:\n", + "[[3520, \"def376dc6955b339b17f0a4b840e80eb6b9c744b\", \"2023-04-17T16:01:01Z\"], [3510, \"4486f5c623705c6a14d9eeaba7d155cff30cdb43\", \"2023-01-12T14:36:43Z\"], [3500, \"2d38d3313229fdc5f8aa00052a2db21b35be3d2d\", \"2022-10-14T08:46:01Z\"], [3500, \"2c631a4b61d529ff1c0635750888f6f6d79c2703\", \"2022-10-13T12:58:37Z\"], [3490, \"8f80f24d49797595d8a18b8d4d1f59846fbf3fe1\", \"2022-08-25T15:43:12Z\"], [3490, \"1a3be9a5d01a414854ff3bfacd5257c14adeefa2\", \"2022-07-14T13:40:17Z\"], [3480, \"72a9e28a52c9629dd63dfad5f215cdc562e2fd7e\", \"2022-07-14T12:28:41Z\"]]\n" + ] + } + ], + "source": [ + "# write this to source config file (must be sorted by data descending), but the status should be labeled firsts\n", + "release_versions.sort(key=lambda x: x[2], reverse=True)\n", + "logger.info(f\"Release versions by sha:\\n{json.dumps(release_versions)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Found unique releases [[3480, \"72a9e28a52c9629dd63dfad5f215cdc562e2fd7e\"], [3490, \"1a3be9a5d01a414854ff3bfacd5257c14adeefa2\"], [3490, \"8f80f24d49797595d8a18b8d4d1f59846fbf3fe1\"], [3500, \"2c631a4b61d529ff1c0635750888f6f6d79c2703\"], [3500, \"2d38d3313229fdc5f8aa00052a2db21b35be3d2d\"], [3510, \"4486f5c623705c6a14d9eeaba7d155cff30cdb43\"], [3520, \"def376dc6955b339b17f0a4b840e80eb6b9c744b\"]]\n" + ] + } + ], + "source": [ + "# send this array to state machine\n", + "unique_release_versions = list(set([(version[0], version[1]) for version in release_versions]))\n", + "unique_release_versions.sort(reverse=False)\n", + "logger.info(f\"Found unique releases {json.dumps(unique_release_versions)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "# take the most recent release version and get the commit sha for that release version\n", + "# this is the commit sha that will be used to build the release version\n", + "commits_pending_build = []\n", + "for version, sha in unique_release_versions:\n", + " items_for_version = [item for item in release_versions if item[0] == version]\n", + " most_recent_item = max(items_for_version, key=lambda x: x[2])\n", + " commits_pending_build.append(most_recent_item)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(3480, '72a9e28a52c9629dd63dfad5f215cdc562e2fd7e', '2022-07-14T12:28:41Z'),\n", + " (3490, '8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', '2022-08-25T15:43:12Z'),\n", + " (3490, '8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', '2022-08-25T15:43:12Z'),\n", + " (3500, '2d38d3313229fdc5f8aa00052a2db21b35be3d2d', '2022-10-14T08:46:01Z'),\n", + " (3500, '2d38d3313229fdc5f8aa00052a2db21b35be3d2d', '2022-10-14T08:46:01Z'),\n", + " (3510, '4486f5c623705c6a14d9eeaba7d155cff30cdb43', '2023-01-12T14:36:43Z'),\n", + " (3520, 'def376dc6955b339b17f0a4b840e80eb6b9c744b', '2023-04-17T16:01:01Z')]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commits_pending_build" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(3480, '72a9e28a52c9629dd63dfad5f215cdc562e2fd7e', '2022-07-14T12:28:41Z'),\n", + " (3490, '8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', 
'2022-08-25T15:43:12Z'),\n", + " (3490, '8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', '2022-08-25T15:43:12Z'),\n", + " (3500, '2d38d3313229fdc5f8aa00052a2db21b35be3d2d', '2022-10-14T08:46:01Z'),\n", + " (3500, '2d38d3313229fdc5f8aa00052a2db21b35be3d2d', '2022-10-14T08:46:01Z'),\n", + " (3510, '4486f5c623705c6a14d9eeaba7d155cff30cdb43', '2023-01-12T14:36:43Z'),\n", + " (3520, 'def376dc6955b339b17f0a4b840e80eb6b9c744b', '2023-04-17T16:01:01Z')]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commits_pending_build" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "# the commits that aren't in commits_pending are labeled as skipped\n", + "commits_skipped = set(release_versions).difference(set(commits_pending_build))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{(3490, '1a3be9a5d01a414854ff3bfacd5257c14adeefa2', '2022-07-14T13:40:17Z'),\n", + " (3500, '2c631a4b61d529ff1c0635750888f6f6d79c2703', '2022-10-13T12:58:37Z')}" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commits_skipped" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "execution_history_pending = [\n", + " ExecutionHistoryItem(\n", + " **{\n", + " \"version\": item[0],\n", + " \"date_utc\": utc_now, # TODO switch to Lambda context variable\n", + " \"commit\": {\n", + " \"sha\": item[1],\n", + " \"date_utc\": item[2],\n", + " \"url\": \"\"\n", + " },\n", + " \"status\": \"PENDING\",\n", + " \"input_parameters\": source_repo.default_input_parameters.dict(),\n", + " }\n", + " )\n", + " for item in commits_pending_build\n", + "]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "execution_history_skipped = [\n", + " ExecutionHistoryItem(\n", + " **{\n", + " \"version\": item[0],\n", + " \"date_utc\": utc_now, # TODO switch to Lambda context variable\n", + " \"commit\": {\n", + " \"sha\": item[1],\n", + " \"date_utc\": item[2],\n", + " \"url\": \"\"\n", + " },\n", + " \"status\": \"SKIPPED\"\n", + " }\n", + " )\n", + " for item in commits_skipped\n", + "]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "execution_history = execution_history_pending + execution_history_skipped" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[ExecutionHistoryItem(version=3520, date_utc='2023-04-24T21:34:36Z', commit=CommitDetails(sha='def376dc6955b339b17f0a4b840e80eb6b9c744b', date_utc='2023-04-17T16:01:01Z', url=''), input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), status='PENDING'),\n", + " ExecutionHistoryItem(version=3510, date_utc='2023-04-24T21:34:36Z', commit=CommitDetails(sha='4486f5c623705c6a14d9eeaba7d155cff30cdb43', date_utc='2023-01-12T14:36:43Z', url=''), input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), status='PENDING'),\n", + " ExecutionHistoryItem(version=3500, date_utc='2023-04-24T21:34:36Z', commit=CommitDetails(sha='2d38d3313229fdc5f8aa00052a2db21b35be3d2d', date_utc='2022-10-14T08:46:01Z', url=''), input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), 
status='PENDING'),\n", + " ExecutionHistoryItem(version=3500, date_utc='2023-04-24T21:34:36Z', commit=CommitDetails(sha='2c631a4b61d529ff1c0635750888f6f6d79c2703', date_utc='2022-10-13T12:58:37Z', url=''), input_parameters=None, status='SKIPPED'),\n", + " ExecutionHistoryItem(version=3490, date_utc='2023-04-24T21:34:36Z', commit=CommitDetails(sha='8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', date_utc='2022-08-25T15:43:12Z', url=''), input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), status='PENDING'),\n", + " ExecutionHistoryItem(version=3490, date_utc='2023-04-24T21:34:36Z', commit=CommitDetails(sha='1a3be9a5d01a414854ff3bfacd5257c14adeefa2', date_utc='2022-07-14T13:40:17Z', url=''), input_parameters=None, status='SKIPPED'),\n", + " ExecutionHistoryItem(version=3480, date_utc='2023-04-24T21:34:36Z', commit=CommitDetails(sha='72a9e28a52c9629dd63dfad5f215cdc562e2fd7e', date_utc='2022-07-14T12:28:41Z', url=''), input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), status='PENDING')]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# sort by commit date descending\n", + "execution_history.sort(key=lambda x: x.commit.date_utc, reverse=True)\n", + "execution_history" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# warning: extend method is not idempotent, so if you run this cell more than once, you will get duplicate commits\n", + "source_config.repositories[GITHUB_REPOSITORY_OWNER + \"/\" + GITHUB_REPOSITORY_NAME].execution_history.extend(execution_history)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# sort by execution date descending\n", + "source_config.repositories[GITHUB_REPOSITORY_OWNER + \"/\" + GITHUB_REPOSITORY_NAME].execution_history.sort(key=lambda x: x.date_utc, reverse=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# print(json.dumps([item.dict() for item in source_config.repositories[GITHUB_REPOSITORY_OWNER + \"/\" + GITHUB_REPOSITORY_NAME].execution_history], indent=4))" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(3480, '72a9e28a52c9629dd63dfad5f215cdc562e2fd7e'),\n", + " (3490, '1a3be9a5d01a414854ff3bfacd5257c14adeefa2'),\n", + " (3490, '8f80f24d49797595d8a18b8d4d1f59846fbf3fe1'),\n", + " (3500, '2c631a4b61d529ff1c0635750888f6f6d79c2703'),\n", + " (3500, '2d38d3313229fdc5f8aa00052a2db21b35be3d2d'),\n", + " (3510, '4486f5c623705c6a14d9eeaba7d155cff30cdb43'),\n", + " (3520, 'def376dc6955b339b17f0a4b840e80eb6b9c744b')]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# send these to the state machine\n", + "unique_release_versions" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# write to json file\n", + "with open(\"source-config.json\", \"w\") as f:\n", + " json.dump(source_config.dict(), f, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(3490, '8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', '2022-08-25T15:43:12Z'),\n", + " (3500, '2c631a4b61d529ff1c0635750888f6f6d79c2703', '2022-10-13T12:58:37Z'),\n", + " (3480, 
'72a9e28a52c9629dd63dfad5f215cdc562e2fd7e', '2022-07-14T12:28:41Z'),\n", + " (3490, '1a3be9a5d01a414854ff3bfacd5257c14adeefa2', '2022-07-14T13:40:17Z'),\n", + " (3510, '4486f5c623705c6a14d9eeaba7d155cff30cdb43', '2023-01-12T14:36:43Z'),\n", + " (3520, 'def376dc6955b339b17f0a4b840e80eb6b9c744b', '2023-04-17T16:01:01Z'),\n", + " (3500, '2d38d3313229fdc5f8aa00052a2db21b35be3d2d', '2022-10-14T08:46:01Z')]" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Testing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "release_version_re = r\"# version: IPD-IMGT/HLA (\\d+\\.\\d+\\.\\d+)\"\n", + "allele_list = get_repo_asset(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME, \"Allelelist.txt\", '06ceff14b2db920d458dc337b1100dced992e627')\n", + "release_version = int(re.search(release_version_re, allele_list).group(1).replace(\".\", \"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3470" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "release_version" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "# sort commits by most recent date on [].commit.author.date\n", + "asset_commits.sort(key=lambda x: x[\"commit\"][\"author\"][\"date\"], reverse=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "all_commits = get_commits(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "select_keys = [\"sha\", \"commit\"]\n", + "\n", + "# filter by select_keys\n", + "all_commits = [{k: v for k, v in x.items() if k in select_keys} for x in all_commits]" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame([flatten_json(commit) for commit in all_commits])[[\"sha\", \"commit.author.date\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[HTML rendering of the dataframe was stripped of its markup during extraction; the same sha / commit.author.date table is preserved intact in the text/plain output below]
" + ], + "text/plain": [ + " sha commit.author.date\n", + "0 38398a75e9762ff070d8e9bd714d074332646cd7 2023-04-17T16:03:52Z\n", + "1 def376dc6955b339b17f0a4b840e80eb6b9c744b 2023-04-17T16:01:01Z\n", + "2 5f2c562056f8ffa89aeea0631f2a52300ee0de17 2023-01-13T10:04:48Z\n", + "3 4b8432c7d56121c84d6ef1d75a1c7185c628c13d 2023-01-12T14:47:00Z\n", + "4 4486f5c623705c6a14d9eeaba7d155cff30cdb43 2023-01-12T14:36:43Z\n", + "5 50b790037030d958b662085c3f4cf34ba72a32ec 2022-12-14T10:02:54Z\n", + "6 36220a1c5c2d6954f4873a552544cc0e55b61d0a 2022-12-14T10:02:28Z\n", + "7 e941759874365cb152a3562c22d10847d10db326 2022-10-14T08:47:37Z\n", + "8 2d38d3313229fdc5f8aa00052a2db21b35be3d2d 2022-10-14T08:46:01Z\n", + "9 1ce31fc9e2805034578eff60a269c02176f03252 2022-10-13T13:06:12Z\n", + "10 2c631a4b61d529ff1c0635750888f6f6d79c2703 2022-10-13T12:58:37Z\n", + "11 12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816 2022-08-25T15:44:07Z\n", + "12 8f80f24d49797595d8a18b8d4d1f59846fbf3fe1 2022-08-25T15:43:12Z\n", + "13 a9680a9e80e2c119d4aa41f90e5f40cef6e7df02 2022-08-21T09:31:21Z\n", + "14 1fd937e6c0ff8452f64152aec9632f0586f981d1 2022-08-21T09:29:01Z\n", + "15 50e92c677f4cd547c32bb9305269a762a9595074 2022-07-23T12:54:16Z\n", + "16 8732e7d4739a911e01f69d2c5bda82ca15ca68a9 2022-07-23T12:53:45Z\n", + "17 536e8833f3be4bbcffbfba43cd0b3043c5bf4068 2022-07-15T08:21:31Z\n", + "18 c90cc62006b35061c8db4b6f8da1b86de7650b23 2022-07-15T08:20:46Z\n", + "19 1a3be9a5d01a414854ff3bfacd5257c14adeefa2 2022-07-14T13:40:17Z\n", + "20 4863b8f0a070d70836dfbdc00bdf70aa7bb66345 2022-07-14T12:31:36Z\n", + "21 72a9e28a52c9629dd63dfad5f215cdc562e2fd7e 2022-07-14T12:28:41Z\n", + "22 dddc6afc29de895f4131693e17b3cc856ff23f8d 2022-07-14T12:27:25Z\n", + "23 d1dd2ac5e2b6a8abef9e33ed397807ece52a22fe 2022-04-20T08:39:00Z\n", + "24 4d33035dd296826bbde200e17e5281910ac8be13 2022-04-20T08:37:44Z\n", + "25 4a13df461c42f970a099de77377f309995e7995c 2022-04-19T08:30:24Z\n", + "26 5e1d9764c8e96749bc11d52807eea1f7cc38ce5c 2022-04-14T11:01:59Z\n", + "27 ecd63776c6225af0cf8bcc9fa9c6998d3129fb14 2022-04-14T11:00:42Z\n", + "28 cada41a6bfac5a8bf88ed2107a0b856b9b9785a0 2022-03-15T14:27:59Z\n", + "29 db33aee5dc0b44251b64ff4c2e2f05d59e4e3a2d 2022-03-15T14:27:31Z\n", + "30 207cdce7fef5dae54fdad4cc4f933ebd1ab13c5e 2022-02-10T15:36:59Z\n", + "31 8ab272288e81fa9a49fd4765579b066c2c03cc10 2022-02-10T15:26:19Z\n", + "32 d4c410cd91618a8a74a7763598e499f6a8aa168b 2022-01-14T15:25:23Z\n", + "33 db46d2a0209720c5fa0ab8b03403f7009f69d2d6 2022-01-14T12:27:52Z\n", + "34 ebb3d8c6aa383567bb7794cacf745b1c82efbb7f 2022-01-14T12:27:35Z\n", + "35 fc712c5a4dd00f534e845982a29f46a14e22b292 2022-01-14T08:05:58Z\n", + "36 239bf68f403bbdbabb43fa0277040f4dda2b2aff 2022-01-13T20:33:38Z\n", + "37 d86bdd9983424c792691269fa10e6f022ccf2191 2022-01-13T16:53:03Z\n", + "38 06ceff14b2db920d458dc337b1100dced992e627 2022-01-13T16:52:15Z\n", + "39 3fb27698fe4ebbd5ec4a0e565d97a53fe48c7cdd 2022-01-04T09:19:02Z\n", + "40 0a051a92e0baab68b4c77ad13b8cb4e7eaa2f810 2022-01-04T09:18:33Z\n", + "41 d76ca7d75891aac95205b2cc11510ce8ef41a65a 2021-11-11T11:49:26Z\n", + "42 178c68cca2044cbd5032cbff23e4c06b738aed2a 2021-11-11T11:48:57Z\n", + "43 2d19daf42ff47c2f2c4e7970dd77ed32768eb38f 2021-10-26T11:32:30Z\n", + "44 3f46491c0fe377fd922412c93137efb6a4879b05 2021-10-26T11:15:21Z\n", + "45 8dd2a04b1d7e559f5124d012f0a729d0180216d2 2021-10-11T15:33:46Z\n", + "46 ddda708da9f2f41ae7e7d122973c2eddb141d949 2021-10-11T15:29:53Z\n", + "47 8d77b3dd93959663d58ae5b626289d0746edd0e7 2021-10-11T15:13:57Z\n", + "48 d53cfe39089bd2ce40cf0fa06167d0ae38e3e4ea 
2021-10-11T14:48:05Z\n", + "49 389b55b1c3ef58ea1136fb0c9e6d2ce73038d655 2021-10-11T13:16:09Z\n", + "50 5088650c144e6a3ee05ac1e015d487a86095e665 2021-10-11T12:53:06Z\n", + "51 7dc4672795eaea35bd8c7e1fa2ec40cf948fc2fd 2021-09-17T15:49:56Z\n", + "52 352813601073bcdc3d5eb08d1be07be904276a9a 2021-09-17T15:49:21Z\n", + "53 6c613ce701353f42b6c911801f0aeb78413fac9d 2021-08-20T15:25:29Z\n", + "54 8f892b2675dd6464f526d32f1ad1d3fd62fa3b03 2021-08-20T14:12:35Z\n", + "55 a231c3b065a7b9c86c92915f3b4bcf012ec36175 2021-07-20T11:13:25Z\n", + "56 1a3007e433ed7ccffc4a2c7c36ed8013d6105d4d 2021-07-20T10:34:46Z\n", + "57 4a8d93e8607e2413edee71da68cade6e11249fea 2021-07-19T11:27:37Z\n", + "58 d3fd19b04d4256600e8c90a27a2b317d61514c86 2021-07-19T11:23:30Z\n", + "59 fd63f824a11eead1f7fb39457251c7ac27ad1f93 2021-07-15T08:07:22Z\n", + "60 89b57278869c7b46fbbaf94825ed336328f3567a 2021-07-15T07:59:10Z\n", + "61 6fdf7add8bb4f416bd8660e6b826d80e8759dfe1 2021-07-14T06:19:03Z\n", + "62 b57de92d58da2bf787e5af34ab09e0e8ee6cea6c 2021-07-13T16:04:39Z\n", + "63 4cdd02827d4b0d2d8834e45baaea621663e0d632 2021-07-13T16:03:03Z\n", + "64 e6a3f0cc5710495475e7b959ee82c649978843f8 2021-07-13T16:01:36Z\n", + "65 487c16cf1fb4ecb7eca25d752cb7178f10232a81 2021-07-13T15:56:41Z\n", + "66 3d7382f653a9707c3b2d683e7d58b7b9d0b4e747 2021-07-13T15:55:05Z\n", + "67 d155ba8553d2c53580ef66fbf733484dc2857a30 2021-07-13T15:52:43Z\n", + "68 af805f6d40beef639fac7bd1c5f7789893c2f662 2021-06-15T10:31:53Z\n", + "69 ee9eefb21f878da18c72bdc04b31d6ec5d8ade06 2021-06-15T10:26:05Z\n", + "70 a736a7ce4d98a0fefa3f4c01ab6da4641de74254 2021-06-11T12:09:09Z\n", + "71 2f6878301950dfea48a8147a6d8bd012be564063 2021-06-11T12:09:02Z\n", + "72 c0510fbaa138ba0029d8aa2949c282fb2dfbf926 2021-06-11T10:16:30Z\n", + "73 4871325c65b16d5fc1a0fb406ebf539c3cbcf054 2021-05-12T10:36:19Z\n", + "74 cf8ed2bb499fae21b7ed1544177ef3589a8246b1 2021-05-12T10:34:43Z\n", + "75 b9d97ef45c56394528534bd1a2ec7d2d78523354 2021-04-26T14:25:47Z\n", + "76 8da1eca28db8f6ae0db7e7134d334a10eb96f6a3 2021-04-21T11:17:00Z\n", + "77 a2726dfb0cde8a5d567ce9a115d4f254f2af3851 2021-04-21T11:16:49Z\n", + "78 410e5b8bd76c20bcde2cd0e92c78713e00160c28 2021-04-21T10:36:48Z\n", + "79 4240ef4cf17e503c44b177ff0f7b740258ac2e70 2021-04-12T15:17:44Z\n", + "80 7b746d5303493fb53214ceffbd9b365c8ef4a126 2021-04-01T10:04:53Z\n", + "81 94cfe2ac15af0171f1397984ca0d1968a6931eb3 2021-03-03T08:53:38Z\n", + "82 c37267e1ee355c619a3058da5a7135e19e6522de 2021-03-03T08:44:58Z\n", + "83 d531c3422f806634d35d611b0296b6a34815c4e7 2021-02-26T12:30:20Z\n", + "84 0fa36435691cac5651bee2ffe511f73ccd98c367 2021-02-17T15:20:04Z\n", + "85 a65fcca04e51ba8a9d48d2ee676be1de7ff01746 2021-02-17T14:31:37Z\n", + "86 3182e0793150de4ffb34da2774991cb24e49a191 2021-02-04T13:05:18Z\n", + "87 5df8b7ecec0e249df5485dd622efed8b35758540 2021-02-04T11:29:37Z\n", + "88 0fe7efb1a6217accb8d53cbb17a633f406459fbb 2021-02-03T16:38:34Z\n", + "89 2fdc68adf24ec02e28fba5657c7f31c1373491d2 2021-02-03T16:11:39Z\n", + "90 cf7eb4a9781fbdac6dba9ff7b345db2d88748eea 2021-02-03T16:10:25Z\n", + "91 6305b6c83d61553d3606905f7b41a0cb99f92f0d 2021-01-19T16:57:57Z\n", + "92 25c04ea1f8bc4a9915417998310a563bdbb48115 2021-01-19T16:56:43Z\n", + "93 cee977d8d1ca2d1f03f384e53227d56e5143b6fb 2021-01-19T11:56:18Z\n", + "94 03ded64be9b290287ab0101891ca6f338861c2fe 2021-01-19T11:55:23Z\n", + "95 4ec1022d2645f3c2e5018971473710d568078eae 2021-01-18T16:31:24Z\n", + "96 7a7d58ec828e8295dd4011f4f460f961a2b4428a 2021-01-18T15:04:52Z\n", + "97 67c8be545d4483ac34f86fdd57f09135a4820be6 2021-01-18T15:02:25Z\n", + 
"98 441eeff3a9fca4934fc9cf54e2e4a914bb43ef10 2021-01-18T14:51:08Z\n", + "99 a02389393dbd06e2e936d7d2438ef7e48fb969d9 2020-12-16T11:39:46Z" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/track_releases/1.0-build-source-config.ipynb b/notebooks/track_releases/1.0-build-source-config.ipynb new file mode 100644 index 00000000..aad09700 --- /dev/null +++ b/notebooks/track_releases/1.0-build-source-config.ipynb @@ -0,0 +1,564 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Build Source Config\n", + "```json\n", + "{\n", + " \"created_at_utc\": \"2022-04-14T11:00:42Z\",\n", + " \"updated_at_utc\": \"2023-04-25T01:50:12Z\",\n", + " \"repositories\": {\n", + " \"ANHIG/IMGTHLA\": {\n", + " \"owner\": \"ANHIG\",\n", + " \"name\": \"IMGTHLA\",\n", + " \"url\": \"https://github.com/ANHIG/IMGTHLA\",\n", + " \"tracked_assets\": [\n", + " \"hla.dat\",\n", + " \"msf/\"\n", + " ],\n", + " \"default_input_parameters\": {\n", + " \"align\": \"False\",\n", + " \"kir\": \"False\",\n", + " \"mem_profile\": \"False\",\n", + " \"limit\": \"1000\"\n", + " },\n", + " \"execution_history\": [\n", + " {\n", + " \"version\": 3480,\n", + " \"execution_date_utc\": \"2022-04-14T11:00:42Z\",\n", + " \"commit\": {\n", + " \"sha\": \"ecd63776c6225af0cf8bcc9fa9c6998d3129fb14\",\n", + " \"message\": \"\",\n", + " \"date_utc\": \"2022-04-14T11:00:42Z\",\n", + " \"html_url\": \"url\"\n", + " },\n", + " \"input_parameters\": {\n", + " \"align\": \"False\",\n", + " \"kir\": \"False\",\n", + " \"mem_profile\": \"False\",\n", + " \"limit\": \"1000\"\n", + " },\n", + " \"status\": \"SUCCESS\" \n", + " },\n", + " {\n", + " \"version\": 3470,\n", + " \"execution_date_utc\": \"2022-01-13T16:52:15Z\",\n", + " \"commit\": {\n", + " \"sha\": \"06ceff14b2db920d458dc337b1100dced992e627\",\n", + " \"message\": \"\",\n", + " \"date_utc\": \"2022-01-13T16:52:15Z\",\n", + " \"html_url\": \"url\"\n", + " },\n", + " \"input_parameters\": {\n", + " \"align\": \"False\",\n", + " \"kir\": \"False\",\n", + " \"mem_profile\": \"False\",\n", + " \"limit\": \"1000\"\n", + " },\n", + " \"status\": \"SUCCESS\"\n", + " }\n", + " ]\n", + " }\n", + " }\n", + "}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'src'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/Users/ammon/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/notebooks/track_releases/1.0-build-source-config.ipynb Cell 2\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 9\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpandas\u001b[39;00m \u001b[39mas\u001b[39;00m 
\u001b[39mpd\u001b[39;00m\n\u001b[1;32m 10\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mbotocore\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mexceptions\u001b[39;00m \u001b[39mimport\u001b[39;00m ClientError\n\u001b[0;32m---> 11\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msrc\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mutils\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtypes\u001b[39;00m \u001b[39mimport\u001b[39;00m (\n\u001b[1;32m 12\u001b[0m Commit,\n\u001b[1;32m 13\u001b[0m InputParameters,\n\u001b[1;32m 14\u001b[0m ExecutionHistoryItem,\n\u001b[1;32m 15\u001b[0m RepositoryConfig,\n\u001b[1;32m 16\u001b[0m SourceConfig\n\u001b[1;32m 17\u001b[0m )\n\u001b[1;32m 18\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msrc\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mutils\u001b[39;00m \u001b[39mimport\u001b[39;00m (\n\u001b[1;32m 19\u001b[0m read_source_config,\n\u001b[1;32m 20\u001b[0m paginate_commits,\n\u001b[1;32m 21\u001b[0m flatten_json,\n\u001b[1;32m 22\u001b[0m get_repo_asset\n\u001b[1;32m 23\u001b[0m )\n\u001b[1;32m 25\u001b[0m \u001b[39m# logging\u001b[39;00m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'src'" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "sys.path.append('/Users/ammon/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/gfe-db/pipeline/functions/get_repo_updates/')\n", + "import logging\n", + "from datetime import datetime\n", + "utc_now = datetime.utcnow().strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n", + "import re\n", + "import json\n", + "import pandas as pd\n", + "from botocore.exceptions import ClientError\n", + "from src.utils.types import (\n", + " Commit,\n", + " InputParameters,\n", + " ExecutionHistoryItem,\n", + " RepositoryConfig,\n", + " SourceConfig\n", + ")\n", + "from src.utils import (\n", + " read_source_config,\n", + " paginate_commits,\n", + " flatten_json,\n", + " get_repo_asset\n", + ")\n", + "\n", + "# logging\n", + "logging.basicConfig(level=logging.INFO)\n", + "logger = logging.getLogger(__name__)\n", + "\n", + "# Pandas display options\n", + "pd.set_option('display.max_rows', None)\n", + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.max_colwidth', None)\n", + "pd.set_option('display.width', None)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2023-04-25T02:35:07Z'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "utc_now" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "GITHUB_REPOSITORY_OWNER = \"ANHIG\" # os.environ[\"GITHUB_REPOSITORY_OWNER\"]\n", + "GITHUB_REPOSITORY_NAME = \"IMGTHLA\" # os.environ[\"GITHUB_REPOSITORY_NAME\"]\n", + "AWS_REGION = os.environ[\"AWS_REGION\"]\n", + "DATA_BUCKET_NAME = os.environ[\"DATA_BUCKET_NAME\"]\n", + "PIPELINE_CONFIG_S3_PATH = os.environ[\"PIPELINE_CONFIG_S3_PATH\"]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### App State" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ERROR:src.utils.utils:Failed to read config file to s3://dev-gfe-db-531868584498-us-east-1/config/pipeline/source-config.json\n", + "INFO:__main__:'source-config.json' not found. 
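The `src.utils.types` models imported above are not included in this diff; a rough Pydantic sketch of the shapes implied by the JSON example and the `SourceConfig` repr later in this notebook (field names inferred, optionality guessed) might look like:

```python
# Illustrative only: approximate model shapes inferred from this notebook's
# outputs. The real definitions live in src/utils/types.py and may differ.
from typing import Dict, List, Optional
from pydantic import BaseModel

class InputParameters(BaseModel):
    align: bool
    kir: bool
    mem_profile: bool
    limit: Optional[str] = None

class Commit(BaseModel):
    sha: str
    date_utc: str
    message: Optional[str] = None
    html_url: Optional[str] = None

class ExecutionHistoryItem(BaseModel):
    version: int
    execution_date_utc: Optional[str] = None
    commit: Commit
    input_parameters: Optional[InputParameters] = None
    status: Optional[str] = None

class RepositoryConfig(BaseModel):
    owner: str
    name: str
    url: str
    tracked_assets: List[str]
    default_input_parameters: InputParameters
    execution_history: List[ExecutionHistoryItem] = []

class SourceConfig(BaseModel):
    created_at_utc: str
    updated_at_utc: str
    repositories: Dict[str, RepositoryConfig]
```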
Building source config from repository.\n" + ] + } + ], + "source": [ + "# source config file in S3 must be up to date\n", + "try:\n", + " source_config = read_source_config(DATA_BUCKET_NAME, PIPELINE_CONFIG_S3_PATH)\n", + "except ClientError as e:\n", + " logger.info(\"'source-config.json' not found. Building source config from repository.\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Commits" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Fetch all commits from repo using GitHub API\n", + "all_commits = paginate_commits(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "select_keys = [\"sha\", \"commit\", \"html_url\"]\n", + "\n", + "# filter by select_keys\n", + "all_commits = [{k: v for k, v in x.items() if k in select_keys} for x in all_commits]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# all_commits[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# # For reference only, not using pandas in deployment\n", + "# select_cols = [\"sha\", \"commit.author.date\", \"commit.message\", \"html_url\"]\n", + "# df = pd.DataFrame([flatten_json(commit) for commit in all_commits])[select_cols]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "all_commits_flat = [flatten_json(commit) for commit in all_commits]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "select_keys = [\"sha\", \"commit.author.date\", \"commit.message\", \"html_url\"]\n", + "all_commits_flat = [{k: v for k, v in x.items() if k in select_keys} for x in all_commits_flat]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# rename keys\n", + "rename_keys = {\n", + " \"sha\": \"sha\", \n", + " \"commit.author.date\": \"date_utc\", \n", + " \"commit.message\": \"message\", \n", + " \"html_url\": \"html_url\"\n", + "}\n", + "\n", + "# Rename\n", + "commits = [{rename_keys[k]: v for k, v in x.items()} for x in all_commits_flat]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'sha': '38398a75e9762ff070d8e9bd714d074332646cd7',\n", + " 'date_utc': '2023-04-17T16:03:52Z',\n", + " 'message': 'Merge pull request #334 from ANHIG/3520\\n\\nIPD-IMGT/HLA Release 3.52.0',\n", + " 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/38398a75e9762ff070d8e9bd714d074332646cd7'}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commits[0]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Execution History Items" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + 
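`flatten_json` is likewise not shown in this diff; a generic sketch of the dotted-key flattening it appears to perform (yielding keys like `commit.author.date`, as used above) could be:

```python
# Hypothetical stand-in for src.utils.flatten_json, shown only to illustrate
# how nested commit dicts become dotted keys such as "commit.author.date".
def flatten_dict(d, parent_key="", sep="."):
    flat = {}
    for k, v in d.items():
        key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            flat.update(flatten_dict(v, key, sep=sep))
        else:
            flat[key] = v
    return flat

# flatten_dict({"sha": "abc", "commit": {"author": {"date": "2023-04-17T16:03:52Z"}}})
# -> {"sha": "abc", "commit.author.date": "2023-04-17T16:03:52Z"}
```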
"Cell \u001b[0;32mIn[14], line 12\u001b[0m\n\u001b[1;32m 10\u001b[0m date \u001b[39m=\u001b[39m commit[\u001b[39m'\u001b[39m\u001b[39mdate_utc\u001b[39m\u001b[39m'\u001b[39m]\n\u001b[1;32m 11\u001b[0m logger\u001b[39m.\u001b[39mdebug(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mGetting release version for sha \u001b[39m\u001b[39m{\u001b[39;00msha\u001b[39m}\u001b[39;00m\u001b[39m and date \u001b[39m\u001b[39m{\u001b[39;00mdate\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m)\n\u001b[0;32m---> 12\u001b[0m allele_list \u001b[39m=\u001b[39m get_repo_asset(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME, \u001b[39m\"\u001b[39;49m\u001b[39mAllelelist.txt\u001b[39;49m\u001b[39m\"\u001b[39;49m, sha)\n\u001b[1;32m 13\u001b[0m release_version \u001b[39m=\u001b[39m \u001b[39mint\u001b[39m(re\u001b[39m.\u001b[39msearch(release_version_re, allele_list)\u001b[39m.\u001b[39mgroup(\u001b[39m1\u001b[39m)\u001b[39m.\u001b[39mreplace(\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m))\n\u001b[1;32m 15\u001b[0m execution_history_items\u001b[39m.\u001b[39mappend({\n\u001b[1;32m 16\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mversion\u001b[39m\u001b[39m\"\u001b[39m: release_version,\n\u001b[1;32m 17\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mexecution_date_utc\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mNone\u001b[39;00m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mstatus\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 21\u001b[0m })\n", + "File \u001b[0;32m~/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/gfe-db/pipeline/functions/get_repo_updates/src/utils/utils.py:235\u001b[0m, in \u001b[0;36mget_repo_asset\u001b[0;34m(owner, repo, path, commit_sha)\u001b[0m\n\u001b[1;32m 233\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mget_repo_asset\u001b[39m(owner, repo, path, commit_sha\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m):\n\u001b[1;32m 234\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Download a file from a GitHub repository\"\"\"\u001b[39;00m\n\u001b[0;32m--> 235\u001b[0m repo_contents \u001b[39m=\u001b[39m get_repo_contents(owner, repo, path, commit_sha)\n\u001b[1;32m 237\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m 238\u001b[0m res \u001b[39m=\u001b[39m requests\u001b[39m.\u001b[39mget(repo_contents[\u001b[39m'\u001b[39m\u001b[39mdownload_url\u001b[39m\u001b[39m'\u001b[39m])\n", + "File \u001b[0;32m~/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/gfe-db/pipeline/functions/get_repo_updates/src/utils/utils.py:228\u001b[0m, in \u001b[0;36mget_repo_contents\u001b[0;34m(owner, repo, path, commit_sha)\u001b[0m\n\u001b[1;32m 217\u001b[0m headers \u001b[39m=\u001b[39m {\n\u001b[1;32m 218\u001b[0m \u001b[39m'\u001b[39m\u001b[39mAuthorization\u001b[39m\u001b[39m'\u001b[39m: \u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39mtoken \u001b[39m\u001b[39m{\u001b[39;00mGITHUB_PERSONAL_ACCESS_TOKEN\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m,\n\u001b[1;32m 219\u001b[0m \u001b[39m'\u001b[39m\u001b[39mContent-Type\u001b[39m\u001b[39m'\u001b[39m: \u001b[39m'\u001b[39m\u001b[39mapplication/json\u001b[39m\u001b[39m'\u001b[39m,\n\u001b[1;32m 220\u001b[0m \u001b[39m'\u001b[39m\u001b[39mAccept\u001b[39m\u001b[39m'\u001b[39m: \u001b[39m'\u001b[39m\u001b[39mapplication/vnd.github.v3+json\u001b[39m\u001b[39m'\u001b[39m,\n\u001b[1;32m 221\u001b[0m \u001b[39m'\u001b[39m\u001b[39mX-GitHub-Api-Version\u001b[39m\u001b[39m'\u001b[39m: 
\u001b[39m'\u001b[39m\u001b[39m2022-11-28\u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m 222\u001b[0m }\n\u001b[1;32m 224\u001b[0m params \u001b[39m=\u001b[39m {\n\u001b[1;32m 225\u001b[0m \u001b[39m'\u001b[39m\u001b[39mref\u001b[39m\u001b[39m'\u001b[39m: commit_sha\n\u001b[1;32m 226\u001b[0m }\n\u001b[0;32m--> 228\u001b[0m response \u001b[39m=\u001b[39m requests\u001b[39m.\u001b[39;49mget(url, headers\u001b[39m=\u001b[39;49mheaders, params\u001b[39m=\u001b[39;49mparams)\n\u001b[1;32m 230\u001b[0m \u001b[39mreturn\u001b[39;00m response\u001b[39m.\u001b[39mjson()\n", + "File \u001b[0;32m~/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/.venv-dev/lib/python3.9/site-packages/requests/api.py:75\u001b[0m, in \u001b[0;36mget\u001b[0;34m(url, params, **kwargs)\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[39m\u001b[39m\u001b[39mr\u001b[39m\u001b[39m\"\"\"Sends a GET request.\u001b[39;00m\n\u001b[1;32m 65\u001b[0m \n\u001b[1;32m 66\u001b[0m \u001b[39m:param url: URL for the new :class:`Request` object.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[39m:rtype: requests.Response\u001b[39;00m\n\u001b[1;32m 72\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 74\u001b[0m kwargs\u001b[39m.\u001b[39msetdefault(\u001b[39m'\u001b[39m\u001b[39mallow_redirects\u001b[39m\u001b[39m'\u001b[39m, \u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m---> 75\u001b[0m \u001b[39mreturn\u001b[39;00m request(\u001b[39m'\u001b[39;49m\u001b[39mget\u001b[39;49m\u001b[39m'\u001b[39;49m, url, params\u001b[39m=\u001b[39;49mparams, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n", + "File \u001b[0;32m~/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/.venv-dev/lib/python3.9/site-packages/requests/api.py:60\u001b[0m, in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[39m# By using the 'with' statement we are sure the session is closed, thus we\u001b[39;00m\n\u001b[1;32m 57\u001b[0m \u001b[39m# avoid leaving sockets open which can trigger a ResourceWarning in some\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[39m# cases, and look like a memory leak in others.\u001b[39;00m\n\u001b[1;32m 59\u001b[0m \u001b[39mwith\u001b[39;00m sessions\u001b[39m.\u001b[39mSession() \u001b[39mas\u001b[39;00m session:\n\u001b[0;32m---> 60\u001b[0m \u001b[39mreturn\u001b[39;00m session\u001b[39m.\u001b[39;49mrequest(method\u001b[39m=\u001b[39;49mmethod, url\u001b[39m=\u001b[39;49murl, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n", + "File \u001b[0;32m~/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/.venv-dev/lib/python3.9/site-packages/requests/sessions.py:533\u001b[0m, in \u001b[0;36mSession.request\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 528\u001b[0m send_kwargs \u001b[39m=\u001b[39m {\n\u001b[1;32m 529\u001b[0m \u001b[39m'\u001b[39m\u001b[39mtimeout\u001b[39m\u001b[39m'\u001b[39m: timeout,\n\u001b[1;32m 530\u001b[0m \u001b[39m'\u001b[39m\u001b[39mallow_redirects\u001b[39m\u001b[39m'\u001b[39m: allow_redirects,\n\u001b[1;32m 531\u001b[0m }\n\u001b[1;32m 532\u001b[0m send_kwargs\u001b[39m.\u001b[39mupdate(settings)\n\u001b[0;32m--> 533\u001b[0m resp \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msend(prep, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49msend_kwargs)\n\u001b[1;32m 535\u001b[0m \u001b[39mreturn\u001b[39;00m resp\n", + "File 
\u001b[0;32m~/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/.venv-dev/lib/python3.9/site-packages/requests/sessions.py:686\u001b[0m, in \u001b[0;36mSession.send\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 683\u001b[0m \u001b[39mpass\u001b[39;00m\n\u001b[1;32m 685\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m stream:\n\u001b[0;32m--> 686\u001b[0m r\u001b[39m.\u001b[39;49mcontent\n\u001b[1;32m 688\u001b[0m \u001b[39mreturn\u001b[39;00m r\n", + "File \u001b[0;32m~/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/.venv-dev/lib/python3.9/site-packages/requests/models.py:828\u001b[0m, in \u001b[0;36mResponse.content\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 826\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_content \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 827\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 828\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_content \u001b[39m=\u001b[39m \u001b[39mb\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39m.\u001b[39;49mjoin(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49miter_content(CONTENT_CHUNK_SIZE)) \u001b[39mor\u001b[39;00m \u001b[39mb\u001b[39m\u001b[39m'\u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m 830\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_content_consumed \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[1;32m 831\u001b[0m \u001b[39m# don't need to release the connection; that's been handled by urllib3\u001b[39;00m\n\u001b[1;32m 832\u001b[0m \u001b[39m# since we exhausted the data.\u001b[39;00m\n", + "File \u001b[0;32m~/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/.venv-dev/lib/python3.9/site-packages/requests/models.py:750\u001b[0m, in \u001b[0;36mResponse.iter_content..generate\u001b[0;34m()\u001b[0m\n\u001b[1;32m 748\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mhasattr\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mraw, \u001b[39m'\u001b[39m\u001b[39mstream\u001b[39m\u001b[39m'\u001b[39m):\n\u001b[1;32m 749\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 750\u001b[0m \u001b[39mfor\u001b[39;00m chunk \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mraw\u001b[39m.\u001b[39mstream(chunk_size, decode_content\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m):\n\u001b[1;32m 751\u001b[0m \u001b[39myield\u001b[39;00m chunk\n\u001b[1;32m 752\u001b[0m \u001b[39mexcept\u001b[39;00m ProtocolError \u001b[39mas\u001b[39;00m e:\n", + "File \u001b[0;32m~/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/.venv-dev/lib/python3.9/site-packages/urllib3/response.py:492\u001b[0m, in \u001b[0;36mHTTPResponse.stream\u001b[0;34m(self, amt, decode_content)\u001b[0m\n\u001b[1;32m 476\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 477\u001b[0m \u001b[39mA generator wrapper for the read() method. 
A call will block until\u001b[39;00m\n\u001b[1;32m 478\u001b[0m \u001b[39m``amt`` bytes have been read from the connection or until the\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 489\u001b[0m \u001b[39m 'content-encoding' header.\u001b[39;00m\n\u001b[1;32m 490\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 491\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mchunked \u001b[39mand\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msupports_chunked_reads():\n\u001b[0;32m--> 492\u001b[0m \u001b[39mfor\u001b[39;00m line \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mread_chunked(amt, decode_content\u001b[39m=\u001b[39mdecode_content):\n\u001b[1;32m 493\u001b[0m \u001b[39myield\u001b[39;00m line\n\u001b[1;32m 494\u001b[0m \u001b[39melse\u001b[39;00m:\n", + "File \u001b[0;32m~/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/.venv-dev/lib/python3.9/site-packages/urllib3/response.py:671\u001b[0m, in \u001b[0;36mHTTPResponse.read_chunked\u001b[0;34m(self, amt, decode_content)\u001b[0m\n\u001b[1;32m 669\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mchunk_left \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[1;32m 670\u001b[0m \u001b[39mbreak\u001b[39;00m\n\u001b[0;32m--> 671\u001b[0m chunk \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_handle_chunk(amt)\n\u001b[1;32m 672\u001b[0m decoded \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_decode(chunk, decode_content\u001b[39m=\u001b[39mdecode_content,\n\u001b[1;32m 673\u001b[0m flush_decoder\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 674\u001b[0m \u001b[39mif\u001b[39;00m decoded:\n", + "File \u001b[0;32m~/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/.venv-dev/lib/python3.9/site-packages/urllib3/response.py:617\u001b[0m, in \u001b[0;36mHTTPResponse._handle_chunk\u001b[0;34m(self, amt)\u001b[0m\n\u001b[1;32m 615\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mchunk_left \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 616\u001b[0m \u001b[39melif\u001b[39;00m amt \u001b[39m<\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mchunk_left:\n\u001b[0;32m--> 617\u001b[0m value \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_fp\u001b[39m.\u001b[39;49m_safe_read(amt)\n\u001b[1;32m 618\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mchunk_left \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mchunk_left \u001b[39m-\u001b[39m amt\n\u001b[1;32m 619\u001b[0m returned_chunk \u001b[39m=\u001b[39m value\n", + "File \u001b[0;32m/opt/homebrew/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/http/client.py:626\u001b[0m, in \u001b[0;36mHTTPResponse._safe_read\u001b[0;34m(self, amt)\u001b[0m\n\u001b[1;32m 624\u001b[0m s \u001b[39m=\u001b[39m []\n\u001b[1;32m 625\u001b[0m \u001b[39mwhile\u001b[39;00m amt \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[0;32m--> 626\u001b[0m chunk \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfp\u001b[39m.\u001b[39;49mread(\u001b[39mmin\u001b[39;49m(amt, MAXAMOUNT))\n\u001b[1;32m 627\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m chunk:\n\u001b[1;32m 628\u001b[0m \u001b[39mraise\u001b[39;00m IncompleteRead(\u001b[39mb\u001b[39m\u001b[39m'\u001b[39m\u001b[39m'\u001b[39m\u001b[39m.\u001b[39mjoin(s), amt)\n", + "File 
\u001b[0;32m/opt/homebrew/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/socket.py:704\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 702\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m 703\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 704\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sock\u001b[39m.\u001b[39;49mrecv_into(b)\n\u001b[1;32m 705\u001b[0m \u001b[39mexcept\u001b[39;00m timeout:\n\u001b[1;32m 706\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_timeout_occurred \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n", + "File \u001b[0;32m/opt/homebrew/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/ssl.py:1242\u001b[0m, in \u001b[0;36mSSLSocket.recv_into\u001b[0;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[1;32m 1238\u001b[0m \u001b[39mif\u001b[39;00m flags \u001b[39m!=\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[1;32m 1239\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 1240\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mnon-zero flags not allowed in calls to recv_into() on \u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m\"\u001b[39m \u001b[39m%\u001b[39m\n\u001b[1;32m 1241\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m)\n\u001b[0;32m-> 1242\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mread(nbytes, buffer)\n\u001b[1;32m 1243\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 1244\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39mrecv_into(buffer, nbytes, flags)\n", + "File \u001b[0;32m/opt/homebrew/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/ssl.py:1100\u001b[0m, in \u001b[0;36mSSLSocket.read\u001b[0;34m(self, len, buffer)\u001b[0m\n\u001b[1;32m 1098\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m 1099\u001b[0m \u001b[39mif\u001b[39;00m buffer \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m-> 1100\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sslobj\u001b[39m.\u001b[39;49mread(\u001b[39mlen\u001b[39;49m, buffer)\n\u001b[1;32m 1101\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 1102\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_sslobj\u001b[39m.\u001b[39mread(\u001b[39mlen\u001b[39m)\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "# TODO use multithreading to speed up\n", + "# next we get the release version for each commit\n", + "release_version_re = r\"# version: IPD-IMGT/HLA (\\d+\\.\\d+\\.\\d+)\"\n", + "execution_history_items = []\n", + "errors = 0\n", + "max_errors = 5\n", + "limit = None\n", + "for idx, commit in enumerate(commits):\n", + " try:\n", + " sha = commit['sha']\n", + " date = commit['date_utc']\n", + " logger.debug(f\"Getting release version for sha {sha} and date {date}\")\n", + " allele_list = get_repo_asset(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME, \"Allelelist.txt\", sha)\n", + " release_version = int(re.search(release_version_re, allele_list).group(1).replace(\".\", \"\"))\n", + "\n", + " execution_history_items.append({\n", + " \"version\": release_version,\n", + " \"execution_date_utc\": None,\n", + " \"commit\": commit,\n", + " \"input_parameters\": None,\n", + " \"status\": None\n", + " })\n", + " 
except Exception as e:\n", + " errors += 1\n", + " logger.error(f\"Error processing commit {commit['sha']}: {e}\")\n", + " if errors >= max_errors:\n", + " logger.error(f\"Max errors reached. Exiting loop.\")\n", + " break\n", + "\n", + " if limit is not None:\n", + " if idx+1 == limit:\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "39" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(execution_history_items)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## RepositoryConfig" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'ANHIG/IMGTHLA'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "repository_path = f\"{GITHUB_REPOSITORY_OWNER}/{GITHUB_REPOSITORY_NAME}\"\n", + "tracked_assets = [\"hla.dat\", \"msf/\"]\n", + "repository_path\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "base_source_config = {\n", + " \"created_at_utc\": utc_now,\n", + " \"updated_at_utc\": utc_now,\n", + " \"repositories\": {\n", + " repository_path: {\n", + " \"owner\": GITHUB_REPOSITORY_OWNER,\n", + " \"name\": GITHUB_REPOSITORY_NAME,\n", + " \"url\": f\"https://github.com/{repository_path}\",\n", + " \"tracked_assets\": tracked_assets,\n", + " \"default_input_parameters\": {\n", + " \"align\": \"False\",\n", + " \"kir\": \"False\",\n", + " \"mem_profile\": \"False\",\n", + " \"limit\": \"1000\"\n", + " }\n", + " }\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'created_at_utc': '2023-04-25T02:35:07Z',\n", + " 'updated_at_utc': '2023-04-25T02:35:07Z',\n", + " 'repositories': {'ANHIG/IMGTHLA': {'owner': 'ANHIG',\n", + " 'name': 'IMGTHLA',\n", + " 'url': 'https://github.com/ANHIG/IMGTHLA',\n", + " 'tracked_assets': ['hla.dat', 'msf/'],\n", + " 'default_input_parameters': {'align': 'False',\n", + " 'kir': 'False',\n", + " 'mem_profile': 'False',\n", + " 'limit': '1000'}}}}" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "base_source_config" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "base_source_config[\"repositories\"][repository_path][\"execution_history\"] = execution_history_items" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "source_config = SourceConfig(**base_source_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "SourceConfig(created_at_utc='2023-04-25T02:35:07Z', updated_at_utc='2023-04-25T02:35:07Z', repositories={'ANHIG/IMGTHLA': RepositoryConfig(owner='ANHIG', name='IMGTHLA', url='https://github.com/ANHIG/IMGTHLA', tracked_assets=['hla.dat', 'msf/'], default_input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), execution_history=[ExecutionHistoryItem(version=3520, execution_date_utc=None, commit=Commit(sha='38398a75e9762ff070d8e9bd714d074332646cd7', date_utc='2023-04-17T16:03:52Z', message='Merge pull request #334 from ANHIG/3520\\n\\nIPD-IMGT/HLA Release 
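On the multithreading TODO in the release-version loop above, one hedged option is a small thread pool around `get_repo_asset`; this is a sketch reusing the notebook's own names, not tested code, and GitHub API rate limits may still be the bottleneck:

```python
# Sketch: fetch Allelelist.txt for each commit concurrently and parse the release
# version with the same regex as the loop above. Error handling is simplified.
from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_release_version(commit):
    allele_list = get_repo_asset(
        GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME, "Allelelist.txt", commit["sha"]
    )
    version = int(re.search(release_version_re, allele_list).group(1).replace(".", ""))
    return {
        "version": version,
        "execution_date_utc": None,
        "commit": commit,
        "input_parameters": None,
        "status": None,
    }

execution_history_items = []
with ThreadPoolExecutor(max_workers=4) as pool:
    futures = {pool.submit(fetch_release_version, c): c for c in commits}
    for future in as_completed(futures):
        try:
            execution_history_items.append(future.result())
        except Exception as e:
            logger.error(f"Error processing commit {futures[future]['sha']}: {e}")
```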
3.52.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/38398a75e9762ff070d8e9bd714d074332646cd7'), input_parameters=None, status=None), ExecutionHistoryItem(version=3520, execution_date_utc=None, commit=Commit(sha='def376dc6955b339b17f0a4b840e80eb6b9c744b', date_utc='2023-04-17T16:01:01Z', message='IPD-IMGT/HLA Release 3.52.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/def376dc6955b339b17f0a4b840e80eb6b9c744b'), input_parameters=None, status=None), ExecutionHistoryItem(version=3510, execution_date_utc=None, commit=Commit(sha='5f2c562056f8ffa89aeea0631f2a52300ee0de17', date_utc='2023-01-13T10:04:48Z', message='Merge pull request #325 from ANHIG/3510\\n\\n3510', html_url='https://github.com/ANHIG/IMGTHLA/commit/5f2c562056f8ffa89aeea0631f2a52300ee0de17'), input_parameters=None, status=None), ExecutionHistoryItem(version=3510, execution_date_utc=None, commit=Commit(sha='4b8432c7d56121c84d6ef1d75a1c7185c628c13d', date_utc='2023-01-12T14:47:00Z', message='IPD-IMGT/HLA Database Release 3.51.0\\n\\nUpdated the ihiw folder for 3.51.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/4b8432c7d56121c84d6ef1d75a1c7185c628c13d'), input_parameters=None, status=None), ExecutionHistoryItem(version=3510, execution_date_utc=None, commit=Commit(sha='4486f5c623705c6a14d9eeaba7d155cff30cdb43', date_utc='2023-01-12T14:36:43Z', message='IPD-IMGT/HLA Release 3.51.0\\n\\nUpdates for 3.51.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/4486f5c623705c6a14d9eeaba7d155cff30cdb43'), input_parameters=None, status=None), ExecutionHistoryItem(version=3500, execution_date_utc=None, commit=Commit(sha='50b790037030d958b662085c3f4cf34ba72a32ec', date_utc='2022-12-14T10:02:54Z', message='Merge pull request #323 from ANHIG/3500\\n\\nCorrected missing date and version field in Allelelist_history.txt', html_url='https://github.com/ANHIG/IMGTHLA/commit/50b790037030d958b662085c3f4cf34ba72a32ec'), input_parameters=None, status=None), ExecutionHistoryItem(version=3500, execution_date_utc=None, commit=Commit(sha='36220a1c5c2d6954f4873a552544cc0e55b61d0a', date_utc='2022-12-14T10:02:28Z', message='Corrected missing date and version field in Allelelist_history.txt', html_url='https://github.com/ANHIG/IMGTHLA/commit/36220a1c5c2d6954f4873a552544cc0e55b61d0a'), input_parameters=None, status=None), ExecutionHistoryItem(version=3500, execution_date_utc=None, commit=Commit(sha='e941759874365cb152a3562c22d10847d10db326', date_utc='2022-10-14T08:47:37Z', message='Merge pull request #318 from ANHIG/3500\\n\\nhla.dat file for IPD-IMGT/HLA Release 3.50.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/e941759874365cb152a3562c22d10847d10db326'), input_parameters=None, status=None), ExecutionHistoryItem(version=3500, execution_date_utc=None, commit=Commit(sha='2d38d3313229fdc5f8aa00052a2db21b35be3d2d', date_utc='2022-10-14T08:46:01Z', message='hla.dat file for IPD-IMGT/HLA Release 3.50.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/2d38d3313229fdc5f8aa00052a2db21b35be3d2d'), input_parameters=None, status=None), ExecutionHistoryItem(version=3500, execution_date_utc=None, commit=Commit(sha='1ce31fc9e2805034578eff60a269c02176f03252', date_utc='2022-10-13T13:06:12Z', message='Merge pull request #317 from ANHIG/3500\\n\\nIPD-IMGT/HLA Release 3.50.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/1ce31fc9e2805034578eff60a269c02176f03252'), input_parameters=None, status=None), ExecutionHistoryItem(version=3500, execution_date_utc=None, commit=Commit(sha='2c631a4b61d529ff1c0635750888f6f6d79c2703', 
date_utc='2022-10-13T12:58:37Z', message='IPD-IMGT/HLA Release 3.50.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/2c631a4b61d529ff1c0635750888f6f6d79c2703'), input_parameters=None, status=None), ExecutionHistoryItem(version=3490, execution_date_utc=None, commit=Commit(sha='12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816', date_utc='2022-08-25T15:44:07Z', message='Merge pull request #315 from ANHIG/3490\\n\\nRemoval of duplicate \"KW Human MHC;\" in hla.dat; Removal of empty 3…', html_url='https://github.com/ANHIG/IMGTHLA/commit/12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816'), input_parameters=None, status=None), ExecutionHistoryItem(version=3490, execution_date_utc=None, commit=Commit(sha='8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', date_utc='2022-08-25T15:43:12Z', message='Removal of duplicate \"KW Human MHC;\" in hla.dat; Removal of empty 3\\'UTR in DQB1*03:480Q', html_url='https://github.com/ANHIG/IMGTHLA/commit/8f80f24d49797595d8a18b8d4d1f59846fbf3fe1'), input_parameters=None, status=None), ExecutionHistoryItem(version=3490, execution_date_utc=None, commit=Commit(sha='a9680a9e80e2c119d4aa41f90e5f40cef6e7df02', date_utc='2022-08-21T09:31:21Z', message='Merge pull request #314 from ANHIG/3490\\n\\nCorrected DRA*01:01:01:01 P group (DRA*01:01P)', html_url='https://github.com/ANHIG/IMGTHLA/commit/a9680a9e80e2c119d4aa41f90e5f40cef6e7df02'), input_parameters=None, status=None), ExecutionHistoryItem(version=3490, execution_date_utc=None, commit=Commit(sha='1fd937e6c0ff8452f64152aec9632f0586f981d1', date_utc='2022-08-21T09:29:01Z', message='Corrected DRA*01:01:01:01 P group (DRA*01:01P)', html_url='https://github.com/ANHIG/IMGTHLA/commit/1fd937e6c0ff8452f64152aec9632f0586f981d1'), input_parameters=None, status=None), ExecutionHistoryItem(version=3490, execution_date_utc=None, commit=Commit(sha='50e92c677f4cd547c32bb9305269a762a9595074', date_utc='2022-07-23T12:54:16Z', message='Merge pull request #310 from ANHIG/3490\\n\\nAdded additional space in DRB_prot.txt, DRB_nuc.txt and DRB1_gen.txt …', html_url='https://github.com/ANHIG/IMGTHLA/commit/50e92c677f4cd547c32bb9305269a762a9595074'), input_parameters=None, status=None), ExecutionHistoryItem(version=3490, execution_date_utc=None, commit=Commit(sha='8732e7d4739a911e01f69d2c5bda82ca15ca68a9', date_utc='2022-07-23T12:53:45Z', message='Added additional space in DRB_prot.txt, DRB_nuc.txt and DRB1_gen.txt due to DRB1*15:200:01:01N and DRB1*15:200:01:02N', html_url='https://github.com/ANHIG/IMGTHLA/commit/8732e7d4739a911e01f69d2c5bda82ca15ca68a9'), input_parameters=None, status=None), ExecutionHistoryItem(version=3490, execution_date_utc=None, commit=Commit(sha='536e8833f3be4bbcffbfba43cd0b3043c5bf4068', date_utc='2022-07-15T08:21:31Z', message='Merge pull request #308 from ANHIG/3490\\n\\n3490', html_url='https://github.com/ANHIG/IMGTHLA/commit/536e8833f3be4bbcffbfba43cd0b3043c5bf4068'), input_parameters=None, status=None), ExecutionHistoryItem(version=3490, execution_date_utc=None, commit=Commit(sha='c90cc62006b35061c8db4b6f8da1b86de7650b23', date_utc='2022-07-15T08:20:46Z', message=\"Merge branch 'Latest' into 3490\", html_url='https://github.com/ANHIG/IMGTHLA/commit/c90cc62006b35061c8db4b6f8da1b86de7650b23'), input_parameters=None, status=None), ExecutionHistoryItem(version=3490, execution_date_utc=None, commit=Commit(sha='1a3be9a5d01a414854ff3bfacd5257c14adeefa2', date_utc='2022-07-14T13:40:17Z', message='IPD-IMGT/HLA Release 3.49.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/1a3be9a5d01a414854ff3bfacd5257c14adeefa2'), input_parameters=None, 
status=None), ExecutionHistoryItem(version=3490, execution_date_utc=None, commit=Commit(sha='4863b8f0a070d70836dfbdc00bdf70aa7bb66345', date_utc='2022-07-14T12:31:36Z', message='IPD-IMGT/HLA Release 3.49.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/4863b8f0a070d70836dfbdc00bdf70aa7bb66345'), input_parameters=None, status=None), ExecutionHistoryItem(version=3480, execution_date_utc=None, commit=Commit(sha='72a9e28a52c9629dd63dfad5f215cdc562e2fd7e', date_utc='2022-07-14T12:28:41Z', message='IPD-IMGT/HLA Release 3.49.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/72a9e28a52c9629dd63dfad5f215cdc562e2fd7e'), input_parameters=None, status=None), ExecutionHistoryItem(version=3480, execution_date_utc=None, commit=Commit(sha='dddc6afc29de895f4131693e17b3cc856ff23f8d', date_utc='2022-07-14T12:27:25Z', message='IPD-IMGT/HLA Release 3.49.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/dddc6afc29de895f4131693e17b3cc856ff23f8d'), input_parameters=None, status=None), ExecutionHistoryItem(version=3480, execution_date_utc=None, commit=Commit(sha='d1dd2ac5e2b6a8abef9e33ed397807ece52a22fe', date_utc='2022-04-20T08:39:00Z', message='Merge pull request #305 from ANHIG/3480\\n\\nUpload of Allelelist_history.txt for IPD-IMGT/HLA Release 3.48.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/d1dd2ac5e2b6a8abef9e33ed397807ece52a22fe'), input_parameters=None, status=None), ExecutionHistoryItem(version=3480, execution_date_utc=None, commit=Commit(sha='4d33035dd296826bbde200e17e5281910ac8be13', date_utc='2022-04-20T08:37:44Z', message='Upload of Allelelist_history.txt for IPD-IMGT/HLA Release 3.48.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/4d33035dd296826bbde200e17e5281910ac8be13'), input_parameters=None, status=None), ExecutionHistoryItem(version=3480, execution_date_utc=None, commit=Commit(sha='4a13df461c42f970a099de77377f309995e7995c', date_utc='2022-04-19T08:30:24Z', message=\"Merge branch '3480' into Latest\", html_url='https://github.com/ANHIG/IMGTHLA/commit/4a13df461c42f970a099de77377f309995e7995c'), input_parameters=None, status=None), ExecutionHistoryItem(version=3480, execution_date_utc=None, commit=Commit(sha='5e1d9764c8e96749bc11d52807eea1f7cc38ce5c', date_utc='2022-04-14T11:01:59Z', message='Updated md5checksums', html_url='https://github.com/ANHIG/IMGTHLA/commit/5e1d9764c8e96749bc11d52807eea1f7cc38ce5c'), input_parameters=None, status=None), ExecutionHistoryItem(version=3480, execution_date_utc=None, commit=Commit(sha='ecd63776c6225af0cf8bcc9fa9c6998d3129fb14', date_utc='2022-04-14T11:00:42Z', message='IPD-IMGT/HLA Release 3.48.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/ecd63776c6225af0cf8bcc9fa9c6998d3129fb14'), input_parameters=None, status=None), ExecutionHistoryItem(version=3470, execution_date_utc=None, commit=Commit(sha='cada41a6bfac5a8bf88ed2107a0b856b9b9785a0', date_utc='2022-03-15T14:27:59Z', message='Merge pull request #299 from ANHIG/3470\\n\\nUpdate to xml/hla.xml.zip correcting meta data', html_url='https://github.com/ANHIG/IMGTHLA/commit/cada41a6bfac5a8bf88ed2107a0b856b9b9785a0'), input_parameters=None, status=None), ExecutionHistoryItem(version=3470, execution_date_utc=None, commit=Commit(sha='db33aee5dc0b44251b64ff4c2e2f05d59e4e3a2d', date_utc='2022-03-15T14:27:31Z', message='Update to xml/hla.xml.zip correcting meta data', html_url='https://github.com/ANHIG/IMGTHLA/commit/db33aee5dc0b44251b64ff4c2e2f05d59e4e3a2d'), input_parameters=None, status=None), ExecutionHistoryItem(version=3470, execution_date_utc=None, 
commit=Commit(sha='207cdce7fef5dae54fdad4cc4f933ebd1ab13c5e', date_utc='2022-02-10T15:36:59Z', message='Merge pull request #296 from ANHIG/3470\\n\\nCorrection of HLA P Groups in hla.xml', html_url='https://github.com/ANHIG/IMGTHLA/commit/207cdce7fef5dae54fdad4cc4f933ebd1ab13c5e'), input_parameters=None, status=None), ExecutionHistoryItem(version=3470, execution_date_utc=None, commit=Commit(sha='8ab272288e81fa9a49fd4765579b066c2c03cc10', date_utc='2022-02-10T15:26:19Z', message='Correction of HLA P Groups in hla.xml', html_url='https://github.com/ANHIG/IMGTHLA/commit/8ab272288e81fa9a49fd4765579b066c2c03cc10'), input_parameters=None, status=None), ExecutionHistoryItem(version=3470, execution_date_utc=None, commit=Commit(sha='d4c410cd91618a8a74a7763598e499f6a8aa168b', date_utc='2022-01-14T15:25:23Z', message='Merge pull request #292 from ANHIG/3470\\n\\n3470', html_url='https://github.com/ANHIG/IMGTHLA/commit/d4c410cd91618a8a74a7763598e499f6a8aa168b'), input_parameters=None, status=None), ExecutionHistoryItem(version=3470, execution_date_utc=None, commit=Commit(sha='db46d2a0209720c5fa0ab8b03403f7009f69d2d6', date_utc='2022-01-14T12:27:52Z', message=\"Merge branch '3470' of github.com:ANHIG/IMGTHLA into 3470\", html_url='https://github.com/ANHIG/IMGTHLA/commit/db46d2a0209720c5fa0ab8b03403f7009f69d2d6'), input_parameters=None, status=None), ExecutionHistoryItem(version=3470, execution_date_utc=None, commit=Commit(sha='ebb3d8c6aa383567bb7794cacf745b1c82efbb7f', date_utc='2022-01-14T12:27:35Z', message='IPD-IMGT/HLA Release 3.47.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/ebb3d8c6aa383567bb7794cacf745b1c82efbb7f'), input_parameters=None, status=None), ExecutionHistoryItem(version=3470, execution_date_utc=None, commit=Commit(sha='fc712c5a4dd00f534e845982a29f46a14e22b292', date_utc='2022-01-14T08:05:58Z', message='Update README.md\\n\\nUpdate sponsorship contact and malformed section header', html_url='https://github.com/ANHIG/IMGTHLA/commit/fc712c5a4dd00f534e845982a29f46a14e22b292'), input_parameters=None, status=None), ExecutionHistoryItem(version=3470, execution_date_utc=None, commit=Commit(sha='239bf68f403bbdbabb43fa0277040f4dda2b2aff', date_utc='2022-01-13T20:33:38Z', message='Merge pull request #291 from ANHIG/3470\\n\\n3470', html_url='https://github.com/ANHIG/IMGTHLA/commit/239bf68f403bbdbabb43fa0277040f4dda2b2aff'), input_parameters=None, status=None), ExecutionHistoryItem(version=3470, execution_date_utc=None, commit=Commit(sha='d86bdd9983424c792691269fa10e6f022ccf2191', date_utc='2022-01-13T16:53:03Z', message='IPD-IMGT/HLA Release 3.47.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/d86bdd9983424c792691269fa10e6f022ccf2191'), input_parameters=None, status=None), ExecutionHistoryItem(version=3470, execution_date_utc=None, commit=Commit(sha='06ceff14b2db920d458dc337b1100dced992e627', date_utc='2022-01-13T16:52:15Z', message='IPD-IMGT/HLA Release 3.47.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/06ceff14b2db920d458dc337b1100dced992e627'), input_parameters=None, status=None)])})" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "source_config" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# write config locally\n", + "with open(\"source-config.json\", \"w\") as f:\n", + " json.dump(source_config.dict(), f, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + 
"metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/track_releases/source-config.json b/notebooks/track_releases/source-config.json new file mode 100644 index 00000000..9142b299 --- /dev/null +++ b/notebooks/track_releases/source-config.json @@ -0,0 +1,491 @@ +{ + "created_at_utc": "2023-04-25T02:35:07Z", + "updated_at_utc": "2023-04-25T02:35:07Z", + "repositories": { + "ANHIG/IMGTHLA": { + "owner": "ANHIG", + "name": "IMGTHLA", + "url": "https://github.com/ANHIG/IMGTHLA", + "tracked_assets": [ + "hla.dat", + "msf/" + ], + "default_input_parameters": { + "align": false, + "kir": false, + "mem_profile": false, + "limit": "1000" + }, + "execution_history": [ + { + "version": 3520, + "execution_date_utc": null, + "commit": { + "sha": "38398a75e9762ff070d8e9bd714d074332646cd7", + "date_utc": "2023-04-17T16:03:52Z", + "message": "Merge pull request #334 from ANHIG/3520\n\nIPD-IMGT/HLA Release 3.52.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/38398a75e9762ff070d8e9bd714d074332646cd7" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3520, + "execution_date_utc": null, + "commit": { + "sha": "def376dc6955b339b17f0a4b840e80eb6b9c744b", + "date_utc": "2023-04-17T16:01:01Z", + "message": "IPD-IMGT/HLA Release 3.52.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/def376dc6955b339b17f0a4b840e80eb6b9c744b" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3510, + "execution_date_utc": null, + "commit": { + "sha": "5f2c562056f8ffa89aeea0631f2a52300ee0de17", + "date_utc": "2023-01-13T10:04:48Z", + "message": "Merge pull request #325 from ANHIG/3510\n\n3510", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/5f2c562056f8ffa89aeea0631f2a52300ee0de17" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3510, + "execution_date_utc": null, + "commit": { + "sha": "4b8432c7d56121c84d6ef1d75a1c7185c628c13d", + "date_utc": "2023-01-12T14:47:00Z", + "message": "IPD-IMGT/HLA Database Release 3.51.0\n\nUpdated the ihiw folder for 3.51.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/4b8432c7d56121c84d6ef1d75a1c7185c628c13d" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3510, + "execution_date_utc": null, + "commit": { + "sha": "4486f5c623705c6a14d9eeaba7d155cff30cdb43", + "date_utc": "2023-01-12T14:36:43Z", + "message": "IPD-IMGT/HLA Release 3.51.0\n\nUpdates for 3.51.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/4486f5c623705c6a14d9eeaba7d155cff30cdb43" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3500, + "execution_date_utc": null, + "commit": { + "sha": "50b790037030d958b662085c3f4cf34ba72a32ec", + "date_utc": "2022-12-14T10:02:54Z", + "message": "Merge pull request #323 from ANHIG/3500\n\nCorrected missing date and version field in Allelelist_history.txt", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/50b790037030d958b662085c3f4cf34ba72a32ec" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3500, + "execution_date_utc": null, + "commit": { + "sha": "36220a1c5c2d6954f4873a552544cc0e55b61d0a", + "date_utc": 
"2022-12-14T10:02:28Z", + "message": "Corrected missing date and version field in Allelelist_history.txt", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/36220a1c5c2d6954f4873a552544cc0e55b61d0a" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3500, + "execution_date_utc": null, + "commit": { + "sha": "e941759874365cb152a3562c22d10847d10db326", + "date_utc": "2022-10-14T08:47:37Z", + "message": "Merge pull request #318 from ANHIG/3500\n\nhla.dat file for IPD-IMGT/HLA Release 3.50.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/e941759874365cb152a3562c22d10847d10db326" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3500, + "execution_date_utc": null, + "commit": { + "sha": "2d38d3313229fdc5f8aa00052a2db21b35be3d2d", + "date_utc": "2022-10-14T08:46:01Z", + "message": "hla.dat file for IPD-IMGT/HLA Release 3.50.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/2d38d3313229fdc5f8aa00052a2db21b35be3d2d" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3500, + "execution_date_utc": null, + "commit": { + "sha": "1ce31fc9e2805034578eff60a269c02176f03252", + "date_utc": "2022-10-13T13:06:12Z", + "message": "Merge pull request #317 from ANHIG/3500\n\nIPD-IMGT/HLA Release 3.50.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/1ce31fc9e2805034578eff60a269c02176f03252" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3500, + "execution_date_utc": null, + "commit": { + "sha": "2c631a4b61d529ff1c0635750888f6f6d79c2703", + "date_utc": "2022-10-13T12:58:37Z", + "message": "IPD-IMGT/HLA Release 3.50.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/2c631a4b61d529ff1c0635750888f6f6d79c2703" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3490, + "execution_date_utc": null, + "commit": { + "sha": "12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816", + "date_utc": "2022-08-25T15:44:07Z", + "message": "Merge pull request #315 from ANHIG/3490\n\nRemoval of duplicate \"KW Human MHC;\" in hla.dat; Removal of empty 3\u2026", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/12b4b94bb49b4640fa48a1182cc9b1e1fbf7e816" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3490, + "execution_date_utc": null, + "commit": { + "sha": "8f80f24d49797595d8a18b8d4d1f59846fbf3fe1", + "date_utc": "2022-08-25T15:43:12Z", + "message": "Removal of duplicate \"KW Human MHC;\" in hla.dat; Removal of empty 3'UTR in DQB1*03:480Q", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/8f80f24d49797595d8a18b8d4d1f59846fbf3fe1" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3490, + "execution_date_utc": null, + "commit": { + "sha": "a9680a9e80e2c119d4aa41f90e5f40cef6e7df02", + "date_utc": "2022-08-21T09:31:21Z", + "message": "Merge pull request #314 from ANHIG/3490\n\nCorrected DRA*01:01:01:01 P group (DRA*01:01P)", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/a9680a9e80e2c119d4aa41f90e5f40cef6e7df02" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3490, + "execution_date_utc": null, + "commit": { + "sha": "1fd937e6c0ff8452f64152aec9632f0586f981d1", + "date_utc": "2022-08-21T09:29:01Z", + "message": "Corrected DRA*01:01:01:01 P group (DRA*01:01P)", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/1fd937e6c0ff8452f64152aec9632f0586f981d1" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3490, + "execution_date_utc": null, + "commit": { + "sha": 
"50e92c677f4cd547c32bb9305269a762a9595074", + "date_utc": "2022-07-23T12:54:16Z", + "message": "Merge pull request #310 from ANHIG/3490\n\nAdded additional space in DRB_prot.txt, DRB_nuc.txt and DRB1_gen.txt \u2026", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/50e92c677f4cd547c32bb9305269a762a9595074" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3490, + "execution_date_utc": null, + "commit": { + "sha": "8732e7d4739a911e01f69d2c5bda82ca15ca68a9", + "date_utc": "2022-07-23T12:53:45Z", + "message": "Added additional space in DRB_prot.txt, DRB_nuc.txt and DRB1_gen.txt due to DRB1*15:200:01:01N and DRB1*15:200:01:02N", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/8732e7d4739a911e01f69d2c5bda82ca15ca68a9" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3490, + "execution_date_utc": null, + "commit": { + "sha": "536e8833f3be4bbcffbfba43cd0b3043c5bf4068", + "date_utc": "2022-07-15T08:21:31Z", + "message": "Merge pull request #308 from ANHIG/3490\n\n3490", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/536e8833f3be4bbcffbfba43cd0b3043c5bf4068" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3490, + "execution_date_utc": null, + "commit": { + "sha": "c90cc62006b35061c8db4b6f8da1b86de7650b23", + "date_utc": "2022-07-15T08:20:46Z", + "message": "Merge branch 'Latest' into 3490", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/c90cc62006b35061c8db4b6f8da1b86de7650b23" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3490, + "execution_date_utc": null, + "commit": { + "sha": "1a3be9a5d01a414854ff3bfacd5257c14adeefa2", + "date_utc": "2022-07-14T13:40:17Z", + "message": "IPD-IMGT/HLA Release 3.49.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/1a3be9a5d01a414854ff3bfacd5257c14adeefa2" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3490, + "execution_date_utc": null, + "commit": { + "sha": "4863b8f0a070d70836dfbdc00bdf70aa7bb66345", + "date_utc": "2022-07-14T12:31:36Z", + "message": "IPD-IMGT/HLA Release 3.49.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/4863b8f0a070d70836dfbdc00bdf70aa7bb66345" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3480, + "execution_date_utc": null, + "commit": { + "sha": "72a9e28a52c9629dd63dfad5f215cdc562e2fd7e", + "date_utc": "2022-07-14T12:28:41Z", + "message": "IPD-IMGT/HLA Release 3.49.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/72a9e28a52c9629dd63dfad5f215cdc562e2fd7e" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3480, + "execution_date_utc": null, + "commit": { + "sha": "dddc6afc29de895f4131693e17b3cc856ff23f8d", + "date_utc": "2022-07-14T12:27:25Z", + "message": "IPD-IMGT/HLA Release 3.49.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/dddc6afc29de895f4131693e17b3cc856ff23f8d" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3480, + "execution_date_utc": null, + "commit": { + "sha": "d1dd2ac5e2b6a8abef9e33ed397807ece52a22fe", + "date_utc": "2022-04-20T08:39:00Z", + "message": "Merge pull request #305 from ANHIG/3480\n\nUpload of Allelelist_history.txt for IPD-IMGT/HLA Release 3.48.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/d1dd2ac5e2b6a8abef9e33ed397807ece52a22fe" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3480, + "execution_date_utc": null, + "commit": { + "sha": "4d33035dd296826bbde200e17e5281910ac8be13", + "date_utc": "2022-04-20T08:37:44Z", + 
"message": "Upload of Allelelist_history.txt for IPD-IMGT/HLA Release 3.48.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/4d33035dd296826bbde200e17e5281910ac8be13" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3480, + "execution_date_utc": null, + "commit": { + "sha": "4a13df461c42f970a099de77377f309995e7995c", + "date_utc": "2022-04-19T08:30:24Z", + "message": "Merge branch '3480' into Latest", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/4a13df461c42f970a099de77377f309995e7995c" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3480, + "execution_date_utc": null, + "commit": { + "sha": "5e1d9764c8e96749bc11d52807eea1f7cc38ce5c", + "date_utc": "2022-04-14T11:01:59Z", + "message": "Updated md5checksums", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/5e1d9764c8e96749bc11d52807eea1f7cc38ce5c" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3480, + "execution_date_utc": null, + "commit": { + "sha": "ecd63776c6225af0cf8bcc9fa9c6998d3129fb14", + "date_utc": "2022-04-14T11:00:42Z", + "message": "IPD-IMGT/HLA Release 3.48.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/ecd63776c6225af0cf8bcc9fa9c6998d3129fb14" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3470, + "execution_date_utc": null, + "commit": { + "sha": "cada41a6bfac5a8bf88ed2107a0b856b9b9785a0", + "date_utc": "2022-03-15T14:27:59Z", + "message": "Merge pull request #299 from ANHIG/3470\n\nUpdate to xml/hla.xml.zip correcting meta data", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/cada41a6bfac5a8bf88ed2107a0b856b9b9785a0" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3470, + "execution_date_utc": null, + "commit": { + "sha": "db33aee5dc0b44251b64ff4c2e2f05d59e4e3a2d", + "date_utc": "2022-03-15T14:27:31Z", + "message": "Update to xml/hla.xml.zip correcting meta data", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/db33aee5dc0b44251b64ff4c2e2f05d59e4e3a2d" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3470, + "execution_date_utc": null, + "commit": { + "sha": "207cdce7fef5dae54fdad4cc4f933ebd1ab13c5e", + "date_utc": "2022-02-10T15:36:59Z", + "message": "Merge pull request #296 from ANHIG/3470\n\nCorrection of HLA P Groups in hla.xml", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/207cdce7fef5dae54fdad4cc4f933ebd1ab13c5e" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3470, + "execution_date_utc": null, + "commit": { + "sha": "8ab272288e81fa9a49fd4765579b066c2c03cc10", + "date_utc": "2022-02-10T15:26:19Z", + "message": "Correction of HLA P Groups in hla.xml", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/8ab272288e81fa9a49fd4765579b066c2c03cc10" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3470, + "execution_date_utc": null, + "commit": { + "sha": "d4c410cd91618a8a74a7763598e499f6a8aa168b", + "date_utc": "2022-01-14T15:25:23Z", + "message": "Merge pull request #292 from ANHIG/3470\n\n3470", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/d4c410cd91618a8a74a7763598e499f6a8aa168b" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3470, + "execution_date_utc": null, + "commit": { + "sha": "db46d2a0209720c5fa0ab8b03403f7009f69d2d6", + "date_utc": "2022-01-14T12:27:52Z", + "message": "Merge branch '3470' of github.com:ANHIG/IMGTHLA into 3470", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/db46d2a0209720c5fa0ab8b03403f7009f69d2d6" + 
}, + "input_parameters": null, + "status": null + }, + { + "version": 3470, + "execution_date_utc": null, + "commit": { + "sha": "ebb3d8c6aa383567bb7794cacf745b1c82efbb7f", + "date_utc": "2022-01-14T12:27:35Z", + "message": "IPD-IMGT/HLA Release 3.47.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/ebb3d8c6aa383567bb7794cacf745b1c82efbb7f" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3470, + "execution_date_utc": null, + "commit": { + "sha": "fc712c5a4dd00f534e845982a29f46a14e22b292", + "date_utc": "2022-01-14T08:05:58Z", + "message": "Update README.md\n\nUpdate sponsorship contact and malformed section header", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/fc712c5a4dd00f534e845982a29f46a14e22b292" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3470, + "execution_date_utc": null, + "commit": { + "sha": "239bf68f403bbdbabb43fa0277040f4dda2b2aff", + "date_utc": "2022-01-13T20:33:38Z", + "message": "Merge pull request #291 from ANHIG/3470\n\n3470", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/239bf68f403bbdbabb43fa0277040f4dda2b2aff" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3470, + "execution_date_utc": null, + "commit": { + "sha": "d86bdd9983424c792691269fa10e6f022ccf2191", + "date_utc": "2022-01-13T16:53:03Z", + "message": "IPD-IMGT/HLA Release 3.47.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/d86bdd9983424c792691269fa10e6f022ccf2191" + }, + "input_parameters": null, + "status": null + }, + { + "version": 3470, + "execution_date_utc": null, + "commit": { + "sha": "06ceff14b2db920d458dc337b1100dced992e627", + "date_utc": "2022-01-13T16:52:15Z", + "message": "IPD-IMGT/HLA Release 3.47.0", + "html_url": "https://github.com/ANHIG/IMGTHLA/commit/06ceff14b2db920d458dc337b1100dced992e627" + }, + "input_parameters": null, + "status": null + } + ] + } + } +} \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt index 6e3c4e27..11ccb816 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,8 +1,14 @@ -requests -numpy -py-ard==0.9.1 -py-gfe==1.1.5 -lxml==4.9.1 -Pympler==0.9 -awscli==1.18.159 -boto3==1.15.3 \ No newline at end of file +requests~=2.31.0 +urllib3<2 +numpy~=1.26.4 +py-ard~=0.9.1 +py-gfe~=1.1.6 +lxml~=4.9.2 +Pympler~=0.9 +awscli~=1.18.159 +boto3~=1.15.3 +polars~=0.18.3 +aws-parameters~=0.1.8 +pygethub~=0.1.1 +pydantic~=2.0.0 +python-dotenv~=1.0.1
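For reference, here is a minimal sketch of how the source-config.json structure written by the notebook above could be modeled and reloaded with Pydantic v2, which requirements-dev.txt now pins. Commit and ExecutionHistoryItem appear verbatim in the notebook output; the SourceConfig and RepositoryConfig wrapper names and the field types are inferred from the JSON and are illustrative assumptions, not the repository's actual classes.

# Hypothetical sketch only: models mirroring notebooks/track_releases/source-config.json.
# Commit and ExecutionHistoryItem match names visible in the notebook output;
# RepositoryConfig and SourceConfig are assumed names, and field types are inferred.
import json
from typing import Optional

from pydantic import BaseModel


class Commit(BaseModel):
    sha: str
    date_utc: str
    message: str
    html_url: str


class ExecutionHistoryItem(BaseModel):
    version: int
    execution_date_utc: Optional[str] = None
    commit: Commit
    input_parameters: Optional[dict] = None
    status: Optional[str] = None


class RepositoryConfig(BaseModel):  # assumed name
    owner: str
    name: str
    url: str
    tracked_assets: list[str]
    default_input_parameters: dict
    execution_history: list[ExecutionHistoryItem]


class SourceConfig(BaseModel):  # assumed name
    created_at_utc: str
    updated_at_utc: str
    repositories: dict[str, RepositoryConfig]


# Round-trip the file produced by the notebook's json.dump(source_config.dict(), ...) cell.
with open("source-config.json") as f:
    config = SourceConfig.model_validate(json.load(f))

print(config.repositories["ANHIG/IMGTHLA"].execution_history[0].commit.sha)

Loading the file this way only validates that the tracked-release history round-trips locally; it is not meant to show how the pipeline itself consumes the config.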