
Commit 883b1f5

Merge pull request #93 from ciaran28/main
Bug Fix
2 parents 397ab6b + 541c36e

15 files changed: +114 -117 lines changed

.github/workflows/onRelease.yaml

Lines changed: 2 additions & 2 deletions
@@ -25,9 +25,9 @@ jobs:

          # IMPORTANT: The testing framework is not yet implemented, and therefore still under development.

-          cd mlOps/devOps/utils
+          #cd mlOps/devOps/utils

-          python -m pytest -v
+          #python -m pytest -v

  prApproved_CD_Development:
    if: github.event_name == 'pull_request' && github.event.action == 'closed' && github.event.pull_request.merged == true && contains(github.head_ref, 'feature') && github.base_ref == 'main'

.github/workflows/taskDatabricks.yaml

Lines changed: 3 additions & 3 deletions
@@ -220,7 +220,7 @@ jobs:
      name: Set Up DBX Environment Variables
      run: |
        DATABRICKS_TOKEN=$(az keyvault secret show --name "dbkstoken" --vault-name $AZ_KEYVAULT_NAME --query "value" -o tsv)
-        echo $DATABRICKS_TOKEN
+        #echo $DATABRICKS_TOKEN
        echo "DATABRICKS_TOKEN=$DATABRICKS_TOKEN" >> $GITHUB_ENV


@@ -235,8 +235,8 @@ jobs:
        # not the Databricks AAD Token.
        pip3 install dbx

-        echo $DATABRICKS_TOKEN
-        echo $DATABRICKS_HOST
+        #echo $DATABRICKS_TOKEN
+        #echo $DATABRICKS_HOST

        databricks -h
        databricks fs ls

data_science/src_nyc_taxi/src.py

Lines changed: 2 additions & 1 deletion
@@ -18,7 +18,8 @@
        "feature_fraction": 0.9,
        "bagging_seed": 42,
        "verbosity": -1,
-        "seed": 42
+        "seed": 42,
+        "num_rounds": 100
    }
)
from registration import run_registration

data_science/src_nyc_taxi/training/__init__.py

Lines changed: 6 additions & 11 deletions
@@ -358,8 +358,6 @@ def evaluate(training_df,random_state, model):
    )

    evaluation_dict["r2"] = r2
-
-    #import pdb; pdb.set_trace()

    mlflow.log_metric(
        "r2",
@@ -398,11 +396,10 @@ def train_model_lgbm(
    # Collect data into a Pandas array for training
    data = training_df.toPandas()[features_and_label]
    train, test = train_test_split(data, random_state=123)
+
    X_train = train.drop(["fare_amount"], axis=1)
    y_train = train.fare_amount

-
-
    mlflow.end_run()
    mlflow.autolog(exclusive=False)
    with mlflow.start_run():
@@ -416,29 +413,26 @@ def train_model_lgbm(
        # label=y_test.values
        # )

-
+        num_rounds = model_params["num_rounds"]
+
        # Train a lightGBM model
        model = lgb.train(
-            #param,
            model_params,
            train_lgb_dataset,
            num_rounds
        )

-
+        mlflow.log_param("num_rounds", num_rounds)
        mlflow.log_param("local_model_file_path", model_file_path)

        evaulation_dict = evaluate(
            training_df=training_df,
            random_state=123,
            model=model
        )
-
-        #import pdb; pdb.set_trace()

        mlflow.log_metrics(evaulation_dict)

-
        fs.log_model(
            model,
            artifact_path="model_packaged",
@@ -621,7 +615,8 @@ def dbx_execute_functions():
            "feature_fraction": 0.9,
            "bagging_seed": 42,
            "verbosity": -1,
-            "seed": 42
+            "seed": 42,
+            "num_rounds": 100
        }
    )
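The fix above threads the boosting-round count through the shared model_params dict and reads it back out at the call site instead of relying on LightGBM's default. A minimal sketch of the same pattern follows; it is not the repository's code, and the synthetic feature names and data are purely illustrative.

# Illustrative sketch (not part of this commit): "num_rounds" travels in the
# same dict as the LightGBM parameters and is read back out by the caller to
# set the number of boosting rounds passed to lgb.train.
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

model_params = {
    "objective": "regression",
    "metric": "rmse",
    "seed": 42,
    "verbosity": -1,
    "num_rounds": 100,  # consumed by the caller to set the boosting-round count
}

# Synthetic stand-in for the NYC taxi features (illustrative only).
rng = np.random.default_rng(42)
data = pd.DataFrame(rng.random((500, 3)), columns=["trip_distance", "pickup_hour", "passengers"])
data["fare_amount"] = 3.0 + 2.5 * data["trip_distance"] + rng.normal(0, 0.1, 500)

train, test = train_test_split(data, random_state=123)
X_train = train.drop(["fare_amount"], axis=1)
y_train = train.fare_amount

num_rounds = model_params["num_rounds"]            # pulled out exactly as in the fix
train_lgb_dataset = lgb.Dataset(X_train, label=y_train)
model = lgb.train(model_params, train_lgb_dataset, num_rounds)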

experiments/notebooks/ciaran_experiments/nyc_taxi/nyc_taxi_lgbm_1.py

Lines changed: 22 additions & 22 deletions
@@ -6,26 +6,26 @@
 # COMMAND ----------
 from training import run_training

+num_rounds_arr = [20,40,60,80,100,120,140]

-
-
-
-run_training(
-    experiment_name = "ciaran_experiment_nyc_taxi",
-    model_name = "taxi_example_fare_packaged",
-    model_params = {
-        "objective": "regression",
-        "metric": "rmse",
-        "num_leaves": 25,
-        "learning_rate": 0.2,
-        "bagging_fraction": 0.9,
-        "feature_fraction": 0.9,
-        "bagging_seed": 42,
-        "verbosity": -1,
-        "seed": 42
-    }
-)
-from registration import run_registration
-run_registration(
-    model_name = "taxi_example_fare_packaged"
-)
+for num_rounds in num_rounds_arr:
+    run_training(
+        experiment_name = "ciaran_experiment_nyc_taxi",
+        model_name = "taxi_example_fare_packaged",
+        model_params = {
+            "objective": "regression",
+            "metric": "rmse",
+            "num_leaves": 25,
+            "learning_rate": 0.2,
+            "bagging_fraction": 0.9,
+            "feature_fraction": 0.9,
+            "bagging_seed": 42,
+            "verbosity": -1,
+            "seed": 42,
+            "num_rounds": num_rounds
+        }
+    )
+from registration import run_registration
+run_registration(
+    model_name = "taxi_example_fare_packaged"
+)
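Each iteration of the sweep above logs "num_rounds" as an MLflow parameter and "r2" as a metric (see training/__init__.py), so the runs can be compared afterwards. A rough sketch, assuming a recent MLflow version and that the experiment name below resolves in the notebook's active tracking context:

# Rough sketch (not part of this commit): rank the sweep's runs by the logged
# "r2" metric against the "num_rounds" parameter.
import mlflow

runs = mlflow.search_runs(experiment_names=["ciaran_experiment_nyc_taxi"])
summary = runs[["params.num_rounds", "metrics.r2"]].sort_values("metrics.r2", ascending=False)
print(summary.head())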

infrastructure/bicep/az_templates/az_key_vault/az_key_vault.bicep

Lines changed: 0 additions & 27 deletions
@@ -23,33 +23,6 @@ resource azKeyVault 'Microsoft.KeyVault/vaults@2021-10-01' = {
    enableSoftDelete: true
    enabledForTemplateDeployment: true
    accessPolicies: [
-      {
-        //applicationId: '<>' // Application ID of databricks SPN
-        permissions: {
-          // Give it the ability to set secrets // we can then get rid of the Key Vault Admin permission set in the main pipeline
-          // Can we do this for the main spn , the equivalent of serviceConnect1
-          secrets: [
-            'set'
-            'list'
-            'get'
-          ]
-        }
-        tenantId: subscription().tenantId
-        objectId: '<>'
-      }
-
-      {
-        //applicationId: '<>' // Application ID of serviceConnect1
-        permissions: {
-          secrets: [
-            'set'
-            'list'
-            'get'
-          ]
-        }
-        tenantId: subscription().tenantId
-        objectId: '<>'
-      }
    ]
  }

infrastructure/databricks/databricks_utils/bash/utilsCreateDatabricksToken.sh

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ if [ $SECRET_EXISTS == true ]; then
        --query "value" \
        -o tsv )

-    echo "Secret Value: $DATABRICKS_TOKEN"
+    #echo "Secret Value: $DATABRICKS_TOKEN"

    # if [[ $DevOps_Agent == "GitHub" ]]; then
    #     echo "DATABRICKS_TOKEN=$DATABRICKS_TOKEN" >> $GITHUB_ENV

infrastructure/databricks/databricks_utils/bash/utilsCreateRoleBasedAccess.sh

Lines changed: 4 additions & 3 deletions
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash


-echo "Resource Group Name: $RESOURCE_GROUP_NAME"
+#echo "Resource Group Name: $RESOURCE_GROUP_NAME"
 echo "ENVIRONMENT: $ENVIRONMENT"
 RESOURCE_GROUP_ID=$( az group show -n $RESOURCE_GROUP_NAME --query id -o tsv )

@@ -14,7 +14,7 @@ for row in $(echo "${JSON}" | jq -r '.RBAC_Assignments[] | @base64'); do
    echo ${row} | base64 --decode | jq -r ${1}
  }
  ROLES_ARRAY="$(_jq '.roles')"
-  echo $ROLES_ARRAY
+  #echo $ROLES_ARRAY

  # Before: [ "Contributor", "DBX_Custom_Role", "Key Vault Administrator" ]
  # xargs trims whitespace on either side. -n removes newline characters.
@@ -30,7 +30,8 @@ for row in $(echo "${JSON}" | jq -r '.RBAC_Assignments[] | @base64'); do
      --role "$ROLE" \
      --assignee-object-id $(_jq '.roleBeneficiaryObjID') \
      --assignee-principal-type "$(_jq '.principalType')" \
-      --scope "$RESOURCE_GROUP_ID"
+      --scope "$RESOURCE_GROUP_ID" \
+      -o none
      #--scope "$(_jq '.scope')"

done

infrastructure/databricks/databricks_utils/bash/utilsSetEnvVariables.sh

Lines changed: 7 additions & 7 deletions
@@ -17,13 +17,13 @@ DATABRICKS_INSTANCE="$(az databricks workspace list -g $RESOURCE_GROUP_NAME --qu
 WORKSPACE_ID=$(az databricks workspace list -g $RESOURCE_GROUP_NAME --query "[].id" -o tsv)
 AZ_KEYVAULT_NAME=$(az keyvault list -g $RESOURCE_GROUP_NAME --query "[].name" -o tsv)
 SUBSCRIPTION_ID=$( az account show --query id -o tsv )
-echo $SUBSCRIPTION_ID
-echo $DATABRICKS_ORDGID
-echo $WORKSPACE_ID
-echo $AZ_KEYVAULT_NAME
-echo $SUBSCRIPTION_ID
-echo $AML_WS_NAME
-echo $DATABRICKS_WS_NAME
+#echo $SUBSCRIPTION_ID
+#echo $DATABRICKS_ORDGID
+#echo $WORKSPACE_ID
+#echo $AZ_KEYVAULT_NAME
+#echo $SUBSCRIPTION_ID
+#echo $AML_WS_NAME
+#echo $DATABRICKS_WS_NAME
 #DATABRICKS_TOKEN=$(az keyvault secret show --name "dbkstoken" --vault-name $AZ_KEYVAULT_NAME --query "value" -o tsv)

infrastructure/databricks/databricks_utils/python/utils_create_cluster.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def create_clusters():
3333
cluster_param_file = _ingest_cluster_param_file('infrastructure/databricks/databricks_configs/' + ENVIRONMENT + '/clusters.json')
3434
existing_clusters, _ = _list_existing_clusters()
3535
existing_clusters_name_arr = _get_cluster_names(existing_clusters)
36-
print(existing_clusters_name_arr)
36+
#print(existing_clusters_name_arr)
3737
for cluster in cluster_param_file:
3838
if cluster['cluster_name'] not in existing_clusters_name_arr:
3939
print(f"Cluster {cluster} does not exist - Deploy.")
@@ -68,7 +68,7 @@ def _list_existing_clusters():
6868
status_code = response.status_code
6969

7070
response_content = response.json()
71-
print(response_content)
71+
#print(response_content)
7272

7373
if status_code != 200:
7474
raise Exception(status_code)
