Merged
Changes from 65 commits (70 total)
ab36aaf
split up converter function for course and cohort
kaylawilding Jun 11, 2025
b1a6d5c
Change exception type
kaylawilding Jun 11, 2025
2783993
Update gcp bucket
kaylawilding Jun 11, 2025
89c9002
Update pdp_data_ingestion.py
kaylawilding Jun 11, 2025
65fbc11
Update pdp_data_ingestion.py
kaylawilding Jun 11, 2025
bfb399d
Include conv func to task
kaylawilding Jun 11, 2025
0d99266
Update pdp_data_ingestion.py
kaylawilding Jun 11, 2025
78983d5
Change inputs from bronze vol to gold
kaylawilding Jun 11, 2025
76bd280
Update github_sourced_pdp_inference_pipeline.yml
kaylawilding Jun 11, 2025
343d288
Update model uri
kaylawilding Jun 12, 2025
c866be3
pull latest model version
kaylawilding Jun 12, 2025
cea8f9b
Update preprocessing to v3
kaylawilding Jun 13, 2025
464cfcb
Update pdp_data_preprocessing.py
kaylawilding Jun 13, 2025
8fc1652
Update modeling table path
kaylawilding Jun 13, 2025
198fe04
Change inputs to only use model feature dfs
kaylawilding Jun 15, 2025
ca3c66b
limit to 100 for testing
kaylawilding Jun 15, 2025
c2bb1a6
Update pdp_model_inference.py
kaylawilding Jun 15, 2025
63d4464
Add experiment
kaylawilding Jun 15, 2025
5337811
Update pdp_model_inference.py
kaylawilding Jun 16, 2025
4bbab86
Update pdp_model_inference.py
kaylawilding Jun 16, 2025
94336a7
Update pdp_model_inference.py
kaylawilding Jun 16, 2025
1fc579f
Add inference tables back in
kaylawilding Jun 20, 2025
92ba31f
Update parameters for support score distribution table
kaylawilding Jun 20, 2025
50a7869
Update pdp_model_inference.py
kaylawilding Jun 20, 2025
fe6b63a
Update pdp_model_inference.py
kaylawilding Jun 20, 2025
31354a3
Update pdp_model_inference.py
kaylawilding Jun 20, 2025
67fc6ec
removing inference run_id suffix
Jun 20, 2025
afad53c
Merge branch 'develop' into inference-pipeline-testing-refactor-v3
vishpillai123 Jun 20, 2025
8878a4e
Merge branch 'develop' into inference-pipeline-testing-refactor-v3
kaylawilding Jun 20, 2025
4aecec6
Update pdp_model_inference.py
kaylawilding Jun 20, 2025
e54f8ae
Merge branch 'develop' into inference-pipeline-testing-refactor-v3
kaylawilding Jun 20, 2025
fda3bb8
Merge branch 'develop' into inference-pipeline-testing-refactor-v3
kaylawilding Jun 20, 2025
3804d15
Update pdp_data_ingestion.py
kaylawilding Jun 23, 2025
a3af43c
Add checkpoint type structure
kaylawilding Jun 24, 2025
8ab5afc
Add logging
kaylawilding Jun 25, 2025
36eee9e
Update logging
kaylawilding Jun 25, 2025
d511147
rename custom schema file
kaylawilding Jun 25, 2025
7028510
Edit package import
kaylawilding Jun 25, 2025
e4c084b
Update pdp_data_ingestion.py
kaylawilding Jun 25, 2025
666e2e5
Update pdp_data_ingestion.py
kaylawilding Jun 25, 2025
77357d9
update logging
kaylawilding Jun 25, 2025
22416dc
Update pdp_data_ingestion.py
kaylawilding Jun 25, 2025
37330c7
fix logging
kaylawilding Jun 26, 2025
3295151
workaround for explainer
kaylawilding Jun 26, 2025
5de5e19
Update pdp_model_inference.py
kaylawilding Jun 26, 2025
179fc3c
Update pdp_model_inference.py
kaylawilding Jun 26, 2025
bb31bbc
Update pdp_model_inference.py
kaylawilding Jun 26, 2025
0d7ba87
Update pdp_model_inference.py
kaylawilding Jun 26, 2025
98419a8
Update pdp_model_inference.py
kaylawilding Jun 26, 2025
2fa25a3
Update pdp_model_inference.py
kaylawilding Jun 26, 2025
cc3f278
Update pdp_model_inference.py
kaylawilding Jun 26, 2025
96507ce
remove experiment id in startrun
kaylawilding Jun 26, 2025
aa094b5
add logging of run and exp ids
kaylawilding Jun 26, 2025
854976f
Update pdp_model_inference.py
kaylawilding Jun 26, 2025
5aad768
Remove sample limit
kaylawilding Jun 27, 2025
5a6e6a4
add nested lists to configs for subject area and course ids
kaylawilding Jun 27, 2025
4e88bea
limit to 30 for testing
kaylawilding Jun 29, 2025
40d5cad
Create inference-integration-deployment.yml
kaylawilding Jul 15, 2025
6a44521
Merge branch 'develop' into inference-pipeline-testing-refactor-v3
kaylawilding Jul 15, 2025
e3b8eb0
Update uv.lock
kaylawilding Jul 15, 2025
a72c674
linting
kaylawilding Jul 15, 2025
b652ba4
linting
kaylawilding Jul 15, 2025
61c53f4
update pull request typo
kaylawilding Jul 15, 2025
7e91405
Delete inference-integration-deployment.yml
kaylawilding Jul 15, 2025
159b4e5
Update github_sourced_pdp_inference_pipeline.yml
kaylawilding Jul 15, 2025
542a744
Create inference-integration-deployment.yml
kaylawilding Jul 15, 2025
f03d814
Merge branch 'develop' into inference-pipeline-testing-refactor-v3
kaylawilding Jul 25, 2025
b85515d
remove unused code and add parallelization back in
kaylawilding Jul 25, 2025
128c5ac
Merge branch 'inference-pipeline-testing-refactor-v3' of https://gith…
kaylawilding Jul 25, 2025
c7b512e
Delete inference-integration-deployment.yml
kaylawilding Jul 25, 2025
github_sourced_pdp_inference_pipeline.yml
@@ -49,7 +49,7 @@ resources:
 - --course_dataset_validated_path
 - "{{tasks.data_ingestion.values.course_dataset_validated_path}}"
 - --toml_file_path
-- "/Volumes/{{job.parameters.DB_workspace}}/{{job.parameters.databricks_institution_name}}_gold/gold_volume/configuration_files/{{job.parameters.databricks_institution_name}}_{{job.parameters.model_name}}_configuration_file.toml"
+- "/Volumes/{{job.parameters.DB_workspace}}/{{job.parameters.databricks_institution_name}}_gold/gold_volume/inference_inputs/config.toml"
 - --custom_schemas_path
 - "{{job.parameters.custom_schemas_path}}"
 job_cluster_key: pdp-inference-pipeline-cluster
@@ -73,7 +73,7 @@ resources:
 - --input_table_path
 - "{{tasks.data_preprocessing.values.processed_dataset_path}}"
 - --input_schema_path
-- /Volumes/{{job.parameters.DB_workspace}}/{{job.parameters.databricks_institution_name}}_gold/gold_volume/configuration_files/schema.pbtxt # TODO(samroon2): Update once finalized.
+- /Volumes/{{job.parameters.DB_workspace}}/{{job.parameters.databricks_institution_name}}_gold/gold_volume/inference_inputs/schema.pbtxt # TODO(samroon2): Update once finalized.
 - --output_artifact_path
 - "{{tasks.data_ingestion.values.job_root_dir}}"
 - --environment
@@ -97,7 +97,7 @@ resources:
 - --input_table_path
 - "{{tasks.data_preprocessing.values.processed_dataset_path}}"
 - --input_schema_path
-- /Volumes/{{job.parameters.DB_workspace}}/{{job.parameters.databricks_institution_name}}_gold/gold_volume/configuration_files/schema.pbtxt # TODO(samroon2): Update once finalized.
+- /Volumes/{{job.parameters.DB_workspace}}/{{job.parameters.databricks_institution_name}}_gold/gold_volume/inference_inputs/schema.pbtxt # TODO(samroon2): Update once finalized.
 - --output_artifact_path
 - "{{tasks.data_ingestion.values.job_root_dir}}"
 - --environment
@@ -135,7 +135,7 @@ resources:
 - --DK_CC_EMAIL
 - "{{job.parameters.DK_CC_EMAIL}}"
 - --modeling_table_path
-- "{{job.parameters.DB_workspace}}.{{job.parameters.databricks_institution_name}}_gold.modeling_table"
+- "{{job.parameters.DB_workspace}}.{{job.parameters.databricks_institution_name}}_silver.{{job.parameters.databricks_institution_name}}_pdp_modeling_ar_deid"
 - --custom_schemas_path
 - "{{job.parameters.custom_schemas_path}}"
 job_cluster_key: pdp-inference-pipeline-cluster
@@ -212,19 +212,19 @@
 enabled: true
 parameters:
 - name: cohort_file_name
-  default: kentucky_state_uni_pdp_ar_deid_20241029000400.csv
+  default: AO1600pdp_AO1600_AR_DEIDENTIFIED_STUDYID_20250522120554.csv
 - name: course_file_name
-  default: kentucky_state_uni_pdp_course_ar_deid_20241029000414_dedup.csv
+  default: AO1600pdp_AO1600_COURSE_LEVEL_AR_DEIDENTIFIED_STUDYID_20250522120554.csv
 - name: databricks_institution_name
-  default: kentucky_state_uni
+  default: midway_uni
 - name: db_run_id
   default: "{{job.run_id}}"
 - name: DB_workspace
   default: ${var.DB_workspace}
 - name: gcp_bucket_name
-  default: dev_6782b2f451f84c17ae6e14e918432b65
+  default: databricks-2052166062819251-unitycatalog
 - name: model_name
-  default: kentucky_state_uni_retention_end_of_first_year
+  default: midway_uni_graduation_4y_end_of_first_year
 - name: model_type
   default: sklearn
 - name: notification_email
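To make the new defaults concrete, here is a minimal sketch (plain Python, illustrative values only) of how the templated parameters above resolve into the paths the tasks consume. DB_workspace is assumed to be staging_sst_01, the catalog hardcoded in the task scripts below; the institution name comes from the new defaults in this diff.

# Hypothetical resolution of the templated paths above; values are
# illustrative, with DB_workspace assumed to be "staging_sst_01".
params = {
    "DB_workspace": "staging_sst_01",  # assumed, not set in this diff
    "databricks_institution_name": "midway_uni",
}

toml_file_path = (
    f"/Volumes/{params['DB_workspace']}/"
    f"{params['databricks_institution_name']}_gold/gold_volume/"
    "inference_inputs/config.toml"
)
modeling_table_path = (
    f"{params['DB_workspace']}."
    f"{params['databricks_institution_name']}_silver."
    f"{params['databricks_institution_name']}_pdp_modeling_ar_deid"
)

print(toml_file_path)
# /Volumes/staging_sst_01/midway_uni_gold/gold_volume/inference_inputs/config.toml
print(modeling_table_path)
# staging_sst_01.midway_uni_silver.midway_uni_pdp_modeling_ar_deid

Note the asymmetry: the config and schema files move to a per-institution gold volume path, while the modeling table becomes a three-part Unity Catalog name in the silver schema.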
pipelines/pdp/tasks/pdp_data_ingestion/pdp_data_ingestion.py (45 changes: 25 additions & 20 deletions)
@@ -20,6 +20,7 @@
 from databricks.sdk.runtime import dbutils
 from google.cloud import storage

+from student_success_tool.dataio import schemas
 import student_success_tool.dataio as dataio
 import importlib

@@ -227,14 +228,10 @@ def run(self):
         Executes the data ingestion task.
         """
         raw_files_path = f"{self.args.job_root_dir}/raw_files/"
-        # os.makedirs(raw_files_path, exist_ok=True)
         print("raw_files_path:", raw_files_path)
         dbutils.fs.mkdirs(raw_files_path)

-        # fpath_course, fpath_cohort = self.download_data_from_gcs(raw_files_path)
-        # Hack to get around gcp permissions right now
-        fpath_course = f"/Volumes/staging_sst_01/{args.databricks_institution_name}_bronze/bronze_volume/inference_inputs/{self.args.course_file_name}"
-        fpath_cohort = f"/Volumes/staging_sst_01/{args.databricks_institution_name}_bronze/bronze_volume/inference_inputs/{self.args.cohort_file_name}"
+        fpath_course, fpath_cohort = self.download_data_from_gcs(raw_files_path)
         df_course, df_cohort = self.read_and_validate_data(fpath_course, fpath_cohort)

         course_dataset_validated_path, cohort_dataset_validated_path = (
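The hunk above reverts the bronze-volume permissions workaround in favor of the original GCS download path. The method body is not shown in this diff; below is a hypothetical sketch of what a download_data_from_gcs like this typically does with google-cloud-storage. The bucket layout (blobs at the bucket root) and the explicit argument list are assumptions based on the job parameters seen earlier.

# Hypothetical sketch only: the real DataIngestionTask.download_data_from_gcs
# is not shown in this PR. Assumes blobs named after the course/cohort files
# sit at the root of the bucket given by the gcp_bucket_name parameter.
from google.cloud import storage


def download_data_from_gcs(
    bucket_name: str, course_file_name: str, cohort_file_name: str, raw_files_path: str
) -> tuple[str, str]:
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    local_paths = []
    for file_name in (course_file_name, cohort_file_name):
        local_path = f"{raw_files_path}{file_name}"
        # Stream the blob's contents to the local (or volume-mounted) path.
        bucket.blob(file_name).download_to_filename(local_path)
        local_paths.append(local_path)
    return local_paths[0], local_paths[1]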
@@ -295,33 +292,41 @@ def parse_arguments() -> argparse.Namespace:

 if __name__ == "__main__":
     args = parse_arguments()
-    sys.path.append(args.custom_schemas_path)
+    # sys.path.append(args.custom_schemas_path)
     sys.path.append(
-        f"/Volumes/staging_sst_01/{args.databricks_institution_name}_bronze/bronze_volume/inference_inputs"
+        f"/Volumes/staging_sst_01/{args.databricks_institution_name}_gold/gold_volume/inference_inputs"
     )
+    logging.info(
+        "Files in the inference inputs path: %s",
+        os.listdir(
+            f"/Volumes/staging_sst_01/{args.databricks_institution_name}_gold/gold_volume/inference_inputs"
+        ),
+    )
     try:
-        print("Listdir1", os.listdir("/Workspace/Users"))
-        # converter_func = importlib.import_module(f"{args.databricks_institution_name}.dataio")
         converter_func = importlib.import_module("dataio")
-        course_converter_func = converter_func.converter_func_course
-        logging.info("Running task with custom converter func")
-    except ModuleNotFoundError:
-        print("Running task without custom converter func")
-        course_converter_func = None
-        logging.info("Running task without custom converter func")
+        cohort_converter_func = converter_func.converter_func_cohort
+        logging.info("Running task with custom cohort converter func")
+    except Exception:
+        cohort_converter_func = None
+        logging.info("Running task with default cohort converter func")
+    try:
+        converter_func = importlib.import_module("dataio")
+        course_converter_func = converter_func.converter_func_course
+        logging.info("Running task with custom course converter func")
+    except Exception:
+        course_converter_func = None
+        logging.info("Running task with default course converter func")
     try:
-        print("sys.path:", sys.path)
-        # schemas = importlib.import_module(f"{args.databricks_institution_name}.schemas")
         schemas = importlib.import_module("schemas")
+        logging.info("Running task with custom schema")
     except Exception:
-        print("Running task with default schema")
-        print("Exception", Exception)
-        from student_success_tool.dataio.schemas import pdp as schemas
-
+        logging.info("Running task with default schema")

-    task = DataIngestionTask(args)
+    task = DataIngestionTask(
+        args,
+        cohort_converter_func=cohort_converter_func,
+        course_converter_func=course_converter_func,
+    )
     task.run()
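Since the task now resolves institution-specific converters by importing a module named dataio from the gold-volume inference_inputs directory appended to sys.path, a per-institution override is just a file dropped into that directory. Below is a hypothetical sketch of such a module; the task only requires the two attribute names seen above, and the DataFrame-in/DataFrame-out shape is an assumption not confirmed by this PR.

# dataio.py -- hypothetical per-institution converter module, placed in
# .../gold_volume/inference_inputs/ so the importlib.import_module("dataio")
# lookups above find it. Attribute names must match exactly.
import pandas as pd


def converter_func_cohort(df: pd.DataFrame) -> pd.DataFrame:
    # Illustrative: normalize column names before schema validation.
    return df.rename(columns=str.lower)


def converter_func_course(df: pd.DataFrame) -> pd.DataFrame:
    # Illustrative: drop exact duplicate course records.
    return df.drop_duplicates()

Because each attribute lookup sits in its own try/except Exception, a module that defines only one of the two functions falls back to the default for the other instead of failing the whole task.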
pdp_data_preprocessing.py
@@ -26,9 +26,10 @@

 # import student_success_tool.preprocessing.targets.pdp as targets
 from student_success_tool import preprocessing
-from student_success_tool.preprocessing import selection
+from student_success_tool.preprocessing import selection, checkpoints
 from student_success_tool.configs.pdp import PDPProjectConfig

+
 # Disable mlflow autologging (due to Databricks issues during feature selection)
 mlflow.autolog(disable=True)

@@ -126,16 +127,7 @@ def preprocess_data(
         """

         # Read preprocessing features from config
-        min_passing_grade = self.cfg.preprocessing.features.min_passing_grade
-        min_num_credits_full_time = (
-            self.cfg.preprocessing.features.min_num_credits_full_time
-        )
-        course_level_pattern = self.cfg.preprocessing.features.course_level_pattern
-        core_terms = self.cfg.preprocessing.features.core_terms
-        key_course_subject_areas = (
-            self.cfg.preprocessing.features.key_course_subject_areas
-        )
-        key_course_ids = self.cfg.preprocessing.features.key_course_ids
+        checkpoint_type = self.cfg.preprocessing.checkpoint.type_

         # Read preprocessing target parameters from config
         student_criteria = self.cfg.preprocessing.selection.student_criteria
@@ -145,23 +137,36 @@
         df_student_terms = preprocessing.pdp.make_student_term_dataset(
             df_cohort,
             df_course,
-            min_passing_grade=min_passing_grade,
-            min_num_credits_full_time=min_num_credits_full_time,
-            course_level_pattern=course_level_pattern,
-            core_terms=core_terms,
-            key_course_subject_areas=key_course_subject_areas,
-            key_course_ids=key_course_ids,
+            min_passing_grade=self.cfg.preprocessing.features.min_passing_grade,
+            min_num_credits_full_time=self.cfg.preprocessing.features.min_num_credits_full_time,
+            course_level_pattern=self.cfg.preprocessing.features.course_level_pattern,
+            core_terms=self.cfg.preprocessing.features.core_terms,
+            key_course_subject_areas=self.cfg.preprocessing.features.key_course_subject_areas,
+            key_course_ids=self.cfg.preprocessing.features.key_course_ids,
         )
-        eligible_students = selection.pdp.select_students_by_attributes(
+
+        selected_students = selection.pdp.select_students_by_attributes(
             df_student_terms, student_id_cols=student_id_col, **student_criteria
         )
-        max_term_rank = df_student_terms["term_rank"].max()
+        if checkpoint_type == "nth":
+            logging.info("Checkpoint type: nth")
+            df_ckpt = checkpoints.pdp.nth_student_terms(
+                df_student_terms,
+                n=self.cfg.preprocessing.checkpoint.n,
+                sort_cols=self.cfg.preprocessing.checkpoint.sort_cols,
+                include_cols=self.cfg.preprocessing.checkpoint.include_cols,
+                enrollment_year_col="year_of_enrollment_at_cohort_inst",
+                valid_enrollment_year=1,
+            )
+        elif checkpoint_type == "first_at_num_credits_earned":
+            logging.info("Checkpoint type: first_at_num_credits_earned")
+            df_ckpt = checkpoints.pdp.first_student_terms_at_num_credits_earned(
+                df_student_terms,
+                min_num_credits=self.cfg.preprocessing.checkpoint.min_num_credits,
+            )

         df_processed = pd.merge(
-            df_student_terms.loc[df_student_terms["term_rank"].eq(max_term_rank), :],
-            eligible_students,
-            on=student_id_col,
-            how="inner",
+            df_ckpt, pd.Series(selected_students.index), how="inner", on=student_id_col
         )

         df_processed = preprocessing.pdp.clean_up_labeled_dataset_cols_and_vals(
@@ -261,7 +266,7 @@
     try:
         sys.path.append(args.custom_schemas_path)
         sys.path.append(
-            f"/Volumes/staging_sst_01/{args.databricks_institution_name}_bronze/bronze_volume/inference_inputs"
+            f"/Volumes/staging_sst_01/{args.databricks_institution_name}_gold/gold_volume/inference_inputs"
        )
         schemas = importlib.import_module("schemas")
         # schemas = importlib.import_module(f"{args.databricks_institution_name}.schemas")
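The checkpoint refactor above replaces the old "latest term_rank" selection with a config-driven choice between two checkpoint strategies. As rough intuition for what each strategy selects, here is an illustrative-only toy in pandas; the real implementations live in student_success_tool.preprocessing.checkpoints.pdp and take additional arguments (sort/include columns, enrollment-year filters), and whether n is 0- or 1-indexed there is not shown in this diff.

# Toy sketch of the two checkpoint behaviors, for intuition only.
import pandas as pd

df_student_terms = pd.DataFrame(
    {
        "student_id": ["a", "a", "a", "b", "b"],
        "term_rank": [1, 2, 3, 1, 2],
        "num_credits_earned": [12.0, 27.0, 45.0, 15.0, 31.0],
    }
)


def nth_student_terms_toy(df: pd.DataFrame, n: int) -> pd.DataFrame:
    # "nth": each student's n-th term in term_rank order (0-indexed here).
    return df.sort_values("term_rank").groupby("student_id").nth(n)


def first_terms_at_credits_toy(df: pd.DataFrame, min_num_credits: float) -> pd.DataFrame:
    # "first_at_num_credits_earned": each student's first term with at
    # least min_num_credits earned.
    eligible = df[df["num_credits_earned"] >= min_num_credits]
    return eligible.sort_values("term_rank").groupby("student_id").head(1)


print(nth_student_terms_toy(df_student_terms, n=1))        # term 2 for both students
print(first_terms_at_credits_toy(df_student_terms, 30.0))  # term 3 for "a", term 2 for "b"

Two things worth noting about the merged result: the if/elif has no else branch, so a misspelled checkpoint type would leave df_ckpt unbound and fail with a NameError rather than a clear message; and the join with pd.Series(selected_students.index) relies on that index being named with the student ID column so pandas can merge on it.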