Merged · 55 commits (changes shown from 10 commits)
a563694
listmaking WIP
thcrock Feb 20, 2019
9750c3e
forgot migration
thcrock Feb 20, 2019
360f8f9
WIP
tweddielin Feb 21, 2019
999a46f
alembic add label_value to list_predictions table
tweddielin Feb 26, 2019
372d9c8
add docstrings
tweddielin Feb 28, 2019
16645bc
move risklist a layer above
tweddielin Mar 13, 2019
914ad76
create risklist module
tweddielin Mar 13, 2019
c92bd8b
__init__.py
tweddielin Mar 13, 2019
d3c3ba9
fix alembic reversion and replace metta.generate_uuid with filename_f…
tweddielin Mar 13, 2019
0e92fb0
Fix down revision of production schema migration
thcrock Apr 12, 2019
dbd4578
Fix alembic revisions
thcrock Jan 6, 2021
f7d49e5
Enable github checks on this branch too
thcrock Jan 6, 2021
dee930f
Closer to getting tests to run
thcrock Jan 6, 2021
1769b00
Add CLI for risklist
thcrock Jan 8, 2021
52c9ff0
Risklist docs stub
thcrock Jan 8, 2021
173167a
Break up data gathering into experiment and matrix, use pytest fixtur…
thcrock Jan 9, 2021
f6b2d02
Modify schema for list prediction metadata
thcrock Jan 9, 2021
acffa67
fix conflicts and add helper functions for getting imputed features
tweddielin Jan 9, 2021
43c1919
Handle other imputation flag cases, fix tracking indentation error
thcrock Jan 10, 2021
7dfb7e1
Add more tests, fill out doc page
thcrock Jan 11, 2021
cc9fe4a
Fix exception name typo
thcrock Jan 11, 2021
5951565
use timechop and planner to create matrix_metadata for production
tweddielin Jan 15, 2021
537f6c8
retrain and predict forward
tweddielin Apr 15, 2021
b429540
rename to retrain_definition
tweddielin Apr 15, 2021
0045aa5
reusing random seeds from existing models
shaycrk May 8, 2021
9dc3697
fix tests (write experiment to test db)
shaycrk May 10, 2021
da870d5
unit test for reusing model random seeds
shaycrk May 10, 2021
6768ee5
add docstring
shaycrk May 10, 2021
7d6a420
only store random seed in experiment runs
shaycrk May 20, 2021
b8fe6d8
DB migration to remove random seed from experiments table
shaycrk May 20, 2021
8207fcd
debugging
shaycrk May 20, 2021
45c9d68
debug model trainer tests
shaycrk May 21, 2021
a665e7e
debug catwalk utils tests
shaycrk May 21, 2021
ead882b
debug catwalk integration test
shaycrk May 21, 2021
de85f10
use public method
tweddielin May 30, 2021
ad860cd
Merge remote-tracking branch 'origin/kit_rand_seed' into list_making
tweddielin May 31, 2021
40466d5
alembic merge
tweddielin May 31, 2021
83c7385
reuse random seed
tweddielin May 31, 2021
f97089b
use timechop for getting retrain information
tweddielin Jun 30, 2021
6f0af1c
create retrain model hash in retrain level instead of model_trainer l…
tweddielin Jun 30, 2021
42bccaa
move util functions to utils
tweddielin Jun 30, 2021
3ec377f
fix cli and docs
tweddielin Jul 1, 2021
1c4da24
update docs
tweddielin Jul 1, 2021
35bd978
use reconstructed feature dict
tweddielin Jul 1, 2021
9f5a099
add RetrainModel and Retrain
tweddielin Jul 29, 2021
ba84822
remove break point
tweddielin Jul 29, 2021
83e0f66
change experiment_runs to triage_runs
tweddielin Aug 21, 2021
d6f14f5
get retrain_config
tweddielin Aug 22, 2021
d76359b
explicitly include run_type in joins to triage_runs
shaycrk Aug 26, 2021
9698500
DB migration updates
shaycrk Aug 26, 2021
a8a29f1
update argument name in docs
shaycrk Aug 26, 2021
694edcc
ensure correct temporal config is used for predicting forward
shaycrk Aug 27, 2021
583e9bd
debug
shaycrk Aug 27, 2021
815a258
debug
shaycrk Aug 27, 2021
5e183fe
Merge branch 'master' into list_making
shaycrk Aug 27, 2021
30 changes: 30 additions & 0 deletions src/tests/test_risklist.py
@@ -0,0 +1,30 @@
from triage.risklist import generate_risk_list
from tests.utils import sample_config, populate_source_data
from triage.experiments import SingleThreadedExperiment
from triage.validation_primitives import table_should_have_data


def test_risklist(db_engine, project_storage):
    # given a model id and an as-of-date <= today,
    # where the model is trained and linked to an experiment with feature and cohort config,
    # generate records in production.list_predictions;
    # the number of records should equal the size of the cohort for that date
    populate_source_data(db_engine)
    SingleThreadedExperiment(
        sample_config(),
        db_engine=db_engine,
        project_path=project_storage.project_path
    ).run()

    model_id = 1
    as_of_date = '2013-01-01'
    generate_risk_list(
        db_engine=db_engine,
        matrix_storage_engine=project_storage.matrix_storage_engine(),
        model_storage_engine=project_storage.model_storage_engine(),
        model_id=model_id,
        as_of_date=as_of_date)
    table_should_have_data(
        db_engine=db_engine,
        table_name="production.list_predictions",
    )

Comment (Contributor): This simple assertion was a good start but we should go further. Does it make sense to make assertions about the size of the table? How about the contents? We'd expect all of these rows to have the same date/model id and stuff like that, right?

Comment (Contributor): Also, if I recall correctly the model_metadata.matrices table will also get a row since we used the MatrixBuilder; we should make sure that row looks reasonable.
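Picking up the first reviewer's suggestion, stronger assertions would check the table's contents, not just its existence. A minimal, self-contained sketch of that pattern against an in-memory SQLite stand-in (the sample rows and cohort size are invented; the real test would query production.list_predictions through the test db_engine):

```python
import sqlite3

# Stand-in for production.list_predictions with two invented cohort rows
conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE list_predictions "
    "(model_id INTEGER, entity_id INTEGER, as_of_date TEXT, score REAL)"
)
conn.executemany(
    "INSERT INTO list_predictions VALUES (?, ?, ?, ?)",
    [(1, 10, "2013-01-01", 0.7), (1, 11, "2013-01-01", 0.2)],
)
rows = conn.execute(
    "SELECT model_id, entity_id, as_of_date, score FROM list_predictions"
).fetchall()

# Beyond "table has data": every row shares the requested model id and date,
# there is one row per entity, and scores look like probabilities
assert len(rows) == 2                            # cohort size for that date
assert {r[0] for r in rows} == {1}               # single model_id
assert {r[2] for r in rows} == {"2013-01-01"}    # single as_of_date
assert len({r[1] for r in rows}) == len(rows)    # no duplicate entities
assert all(0.0 <= r[3] <= 1.0 for r in rows)     # scores in [0, 1]
```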
59 changes: 34 additions & 25 deletions src/triage/component/architect/builders.py
@@ -29,6 +29,7 @@ def __init__(
self.replace = replace
self.include_missing_labels_in_train_as = include_missing_labels_in_train_as
self.run_id = run_id
self.includes_labels = 'labels_table_name' in self.db_config

@property
def sessionmaker(self):
@@ -134,7 +135,7 @@ def make_entity_date_table(
"""

as_of_time_strings = [str(as_of_time) for as_of_time in as_of_times]
if matrix_type == "test" or self.include_missing_labels_in_train_as is not None:
if matrix_type == "test" or matrix_type == "production" or self.include_missing_labels_in_train_as is not None:
indices_query = self._all_valid_entity_dates_query(
as_of_time_strings=as_of_time_strings, state=state
)
@@ -253,17 +254,19 @@ def build_matrix(
if self.run_id:
errored_matrix(self.run_id, self.db_engine)
return
if not table_has_data(
"{}.{}".format(
self.db_config["labels_schema_name"],
self.db_config["labels_table_name"],
),
self.db_engine,
):
logging.warning("labels table is not populated, cannot build matrix")

if self.includes_labels:
if not table_has_data(
"{}.{}".format(
self.db_config["labels_schema_name"],
self.db_config["labels_table_name"],
),
self.db_engine,
):
logging.warning("labels table is not populated, cannot build matrix")
return
if self.run_id:
errored_matrix(self.run_id, self.db_engine)
return

matrix_store = self.matrix_storage_engine.get_store(matrix_uuid)
if not self.replace and matrix_store.exists:
@@ -287,7 +290,7 @@ def build_matrix(
matrix_metadata["state"],
matrix_type,
matrix_uuid,
matrix_metadata["label_timespan"],
matrix_metadata.get("label_timespan", None),
)
except ValueError as e:
logging.warning(
@@ -305,25 +308,31 @@
as_of_times, feature_dictionary, entity_date_table_name, matrix_uuid
)
logging.info(f"Feature data extracted for matrix {matrix_uuid}")
logging.info(
"Extracting label data from database into file for " "matrix %s",
matrix_uuid,
)
labels_df = self.load_labels_data(
label_name,
label_type,
entity_date_table_name,
matrix_uuid,
matrix_metadata["label_timespan"],
)
dataframes.insert(0, labels_df)

logging.info(f"Label data extracted for matrix {matrix_uuid}")
# dataframes add label_name

if self.includes_labels:
logging.info(
"Extracting label data from database into file for " "matrix %s",
matrix_uuid,
)
labels_df = self.load_labels_data(
label_name,
label_type,
entity_date_table_name,
matrix_uuid,
matrix_metadata["label_timespan"],
)
dataframes.insert(0, labels_df)
logging.info(f"Label data extracted for matrix {matrix_uuid}")
else:
labels_df = pandas.DataFrame(index=dataframes[0].index, columns=[label_name])
dataframes.insert(0, labels_df)

# stitch together the csvs
logging.info("Merging feature files for matrix %s", matrix_uuid)
output = self.merge_feature_csvs(dataframes, matrix_uuid)
logging.info(f"Features data merged for matrix {matrix_uuid}")

matrix_store.metadata = matrix_metadata
# store the matrix
labels = output.pop(matrix_store.label_column_name)
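The else branch above fabricates an all-null label column so downstream matrix-saving code keeps working when no labels exist. A minimal sketch of that trick in isolation (column names and entity ids invented):

```python
import pandas as pd

# Invented feature frame standing in for the merged feature dataframes
feature_df = pd.DataFrame(
    {"f1": [0.1, 0.2], "f2": [3.0, 4.0]},
    index=pd.Index([10, 11], name="entity_id"),
)
label_name = "outcome"

# Same move as the diff: an empty (all-NaN) label column aligned to the
# feature index, so the matrix schema is unchanged for production matrices
labels_df = pd.DataFrame(index=feature_df.index, columns=[label_name])

matrix = feature_df.join(labels_df)
assert list(matrix.columns) == ["f1", "f2", "outcome"]
assert matrix["outcome"].isna().all()
```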
8 changes: 5 additions & 3 deletions src/triage/component/architect/feature_generators.py
@@ -635,7 +635,7 @@ def _generate_agg_table_tasks_for(self, aggregation):

return table_tasks

def _generate_imp_table_tasks_for(self, aggregation, drop_preagg=True):
def _generate_imp_table_tasks_for(self, aggregation, impute_cols=None, nonimpute_cols=None, drop_preagg=True):
"""Generate SQL statements for preparing, populating, and
finalizing imputations, for each feature group table in the
given aggregation.
@@ -685,8 +685,10 @@ def _generate_imp_table_tasks_for(self, aggregation, drop_preagg=True):
with self.db_engine.begin() as conn:
results = conn.execute(aggregation.find_nulls())
null_counts = results.first().items()
impute_cols = [col for (col, val) in null_counts if val > 0]
nonimpute_cols = [col for (col, val) in null_counts if val == 0]
if impute_cols is None:
impute_cols = [col for (col, val) in null_counts if val > 0]
if nonimpute_cols is None:
nonimpute_cols = [col for (col, val) in null_counts if val == 0]

# table tasks for imputed aggregation table, most of the work is done here
# by collate's get_impute_create()
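The new impute_cols/nonimpute_cols arguments let a caller (such as risk-list code reusing a trained model's feature set) bypass the null-count scan; only missing lists are derived from the counts. The fallback logic reduces to this standalone sketch (function name and counts invented):

```python
def split_impute_cols(null_counts, impute_cols=None, nonimpute_cols=None):
    # null_counts: (column, null count) pairs, as produced by the
    # aggregation.find_nulls() query in the diff above
    if impute_cols is None:
        impute_cols = [col for (col, val) in null_counts if val > 0]
    if nonimpute_cols is None:
        nonimpute_cols = [col for (col, val) in null_counts if val == 0]
    return impute_cols, nonimpute_cols

# Default behavior: derive both lists from the scan
imp, nonimp = split_impute_cols([("age_avg", 3), ("zip_mode", 0)])
assert imp == ["age_avg"]
assert nonimp == ["zip_mode"]

# A caller that already knows its imputed columns overrides the scan entirely
imp2, _ = split_impute_cols([("age_avg", 3)], impute_cols=["age_avg", "zip_mode"])
assert imp2 == ["age_avg", "zip_mode"]
```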
16 changes: 14 additions & 2 deletions src/triage/component/catwalk/storage.py
@@ -12,6 +12,7 @@
TrainEvaluation,
TestPrediction,
TrainPrediction,
ListPrediction
)
from triage.util.pandas import downcast_matrix

@@ -435,7 +436,7 @@ def columns(self, include_label=False):
if include_label:
return columns
else:
return [col for col in columns if col != self.metadata["label_name"]]
return [col for col in columns if col != self.metadata.get("label_name", None)]

@property
def label_column_name(self):
@@ -479,6 +480,8 @@ def matrix_type(self):
return TrainMatrixType
elif self.metadata["matrix_type"] == "test":
return TestMatrixType
elif self.metadata["matrix_type"] == "production":
return ProductionMatrixType
else:
raise Exception(
"""matrix metadata for matrix {} must contain 'matrix_type'
@@ -525,7 +528,10 @@ def matrix_with_sorted_columns(self, columns):

@property
def full_matrix_for_saving(self):
return self.design_matrix.assign(**{self.label_column_name: self.labels})
if self.labels is not None:
return self.design_matrix.assign(**{self.label_column_name: self.labels})
else:
return self.design_matrix

def load_metadata(self):
"""Load metadata from storage"""
@@ -644,3 +650,9 @@ class TrainMatrixType(object):
evaluation_obj = TrainEvaluation
prediction_obj = TrainPrediction
is_test = False


class ProductionMatrixType(object):
string_name = "production"
    prediction_obj = ListPrediction

Comment (Member): verbose_name or matrix_name might be more exact
Comment (Contributor):
Since we're introducing a new matrix type, I think is_test has to be set in some way. How does is_test get used for the other ones? Depending on how it's used, I could see either False or True making more sense for ProductionMatrixType. is_test might be a poor name, serving as a proxy for some other type of behavior, and we should rename it in a way that makes a value for ProductionMatrixType more obvious.

Comment (Contributor Author):
I think is_test is only used in ModelEvaluator to decide if it should use testing_metric_groups or training_metric_groups. For ProductionMatrixType, at least for generating risk list, there's no evaluation involved. I'm not sure why it needs is_test regardless of good or poor naming.

Comment (Contributor):
Hmmm... would it make sense to remove the is_test boolean entirely and simply check if matrix_type.string_name == 'test' in evaluation.py? As it is, it seems a little redundant with the string name, especially if it doesn't fit well with the production matrix type here.

Comment (Contributor):
I'm not fond of comparisons against magic strings. What about making it more specifically about the metric groups? Like what if the field was metric_config_key, and for train it was 'training_metric_groups', for test it was 'testing_metric_groups', and for production it was None?

Comment (Contributor):
Yeah, that's a fair point about avoiding magic strings -- having it keep track of the appropriate metric_groups types it needs to use makes sense to me if that's the only place is_test is getting used downstream regardless.

Comment (not-the-fish, Contributor, Aug 1, 2021):
Sorry, just coming into this discussion quite late. It seems to me that the discussion here is exposing decisions we committed to early on and haven't tried to address.

I feel like we always saw the matrix_type as a bad pattern but implemented it for expediency to flag (poorly) the different behavior of label imputation in train/test for the inspection problem type. Is my memory accurate? There were conceptual issues with this all along because matrix_type was never a good name and it prevented us from reusing matrices from testing for training in the EWS problem type.

It also seems like which metrics to use is not properly a property of the matrix but of the task being performed. Perhaps we should be passing the metrics groups to the evaluator instead of encoding in the matrix what the evaluator should do to it.

I think maybe we can move forward with this as a way to get the feature we want now, but this discussion is making the illogic of these matrix attributes more apparent and we may have some deeper technical debt to service here rather than trying to make the current situation totally coherent.

Comment (not-the-fish, Contributor, Aug 7, 2021):
I don't know if you all discussed this on Tuesday, but here are a couple of possible solutions for Triage v 5.0.0. The first is probably easier to implement, but I think it is the less good solution:

  • Eliminate both the matrix_type and is_test properties of matrices
  • Include missing labels as missing in all matrices
  • Replace include_missing_labels_in_train_as with two keys: impute_missing_labels_at_train and impute_missing_labels_at_test. These can be True or False. If True, you must also specify the value. We already have lots of duplicated keys for train/test behavior, so this fits an existing configuration pattern. These can be set to defaults when there are inspection and EWS entry points
  • The Architect writes matrices without label imputation, allowing them to be reused for any purpose
  • Catwalk does the label imputation on the fly when it is training or testing and always writes the labels to the database. This has high storage costs, but the reason we had to decide on the label imputation at matrix-building time initially was because we had not yet conceived of the train_results schema. Now that Triage can write train predictions, it can write train labels to the database, and the matrix_type concept is no longer needed.
  • The ModelEvaluator takes only a single set of metrics, and Catwalk creates a separate ModelEvaluator object for training and testing.
  • Separate matrix storage, rather than separate matrix types, is introduced for Production and Development

An alternative:

  • Eliminate both the matrix_type and is_test properties of matrices
  • Replace include_missing_labels_in_train_as with two keys: impute_missing_labels_at_train and impute_missing_labels_at_test. These can be True or False. If True, you must also specify the value. We already have lots of duplicated keys for train/test behavior, so this fits an existing configuration pattern. These can be set to defaults when there are inspection and EWS entry points
  • Instead of storing labels in the train/test matrices, we begin storing the design matrices and the labels separately. Train labels and test labels have separate stores, and the imputation behavior is a key in the label metadata. This reduces (but does not eliminate as in the first proposal) duplication of storage in the EWS case.
  • We introduce new metadata tables for Labels and experiment-label pairs, and add a labels_id column to many of the places where we also have a matrix_id column (models, predictions, evaluations, etc.). This increases db storage a little but not as much as having to store all the train labels in the db.
  • Include all entities in the cohort in all design matrices
  • The replace pathway checks the imputation flag on the labels to determine whether the Architect needs to create new labels
  • Design matrices are combined with the correct labels on reading, and rows are dropped if there is no label and the imputation key is False
  • The ModelEvaluator takes only a single set of metrics, and Catwalk creates a separate ModelEvaluator object for training and testing.
  • You can still skip writing train results to conserve db storage
  • Production/predict-forward/retrain-and-predict uses design matrices but not labels

Comment (Contributor):
Moved the above comment to #855
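One way to read the metric_config_key suggestion from the thread above as code — a hypothetical sketch of the alternative, not what this PR implements:

```python
# Each matrix type carries the config key naming its metric groups;
# "production" carries None, so the evaluator naturally computes nothing
# for risk lists, with no is_test flag and no magic-string comparison
class TrainMatrixType:
    string_name = "train"
    metric_config_key = "training_metric_groups"

class TestMatrixType:
    string_name = "test"
    metric_config_key = "testing_metric_groups"

class ProductionMatrixType:
    string_name = "production"
    metric_config_key = None

def metric_groups_for(matrix_type, evaluation_config):
    # Replaces branching on is_test inside the ModelEvaluator
    key = matrix_type.metric_config_key
    return evaluation_config.get(key, []) if key else []

config = {"testing_metric_groups": ["precision@"], "training_metric_groups": []}
assert metric_groups_for(TestMatrixType, config) == ["precision@"]
assert metric_groups_for(TrainMatrixType, config) == []
assert metric_groups_for(ProductionMatrixType, config) == []
```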

New migration file (revision 1b990cbc04e4):
@@ -0,0 +1,26 @@
"""empty message

Revision ID: 1b990cbc04e4
Revises: 0bca1ba9706e
Create Date: 2019-02-20 16:41:22.810452

"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = '1b990cbc04e4'
down_revision = 'cfd5c3386014'
branch_labels = None
depends_on = None


def upgrade():
op.execute("CREATE SCHEMA IF NOT EXISTS production")
op.execute("ALTER TABLE model_metadata.list_predictions SET SCHEMA production;")


def downgrade():
op.execute("ALTER TABLE production.list_predictions SET SCHEMA model_metadata;")
op.execute("DROP SCHEMA IF EXISTS production")
New migration file (revision 264786a9fe85):
@@ -0,0 +1,54 @@
"""add label_value to production table

Revision ID: 264786a9fe85
Revises: 1b990cbc04e4
Create Date: 2019-02-26 13:17:05.365654

"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = '264786a9fe85'
down_revision = '1b990cbc04e4'
branch_labels = None
depends_on = None


def upgrade():
op.drop_table("list_predictions", schema="production")
op.create_table(
"list_predictions",
sa.Column("model_id", sa.Integer(), nullable=False),
sa.Column("entity_id", sa.BigInteger(), nullable=False),
sa.Column("as_of_date", sa.DateTime(), nullable=False),
sa.Column("score", sa.Numeric(), nullable=True),
sa.Column('label_value', sa.Integer, nullable=True),
sa.Column("rank_abs", sa.Integer(), nullable=True),
sa.Column("rank_pct", sa.Float(), nullable=True),
sa.Column("matrix_uuid", sa.Text(), nullable=True),
sa.Column("test_label_window", sa.Interval(), nullable=True),
sa.ForeignKeyConstraint(["model_id"], ["model_metadata.models.model_id"]),
sa.PrimaryKeyConstraint("model_id", "entity_id", "as_of_date"),
schema="production",
)


def downgrade():
op.drop_table("list_predictions", schema="production")
op.create_table(
"list_predictions",
sa.Column("model_id", sa.Integer(), nullable=False),
sa.Column("entity_id", sa.BigInteger(), nullable=False),
sa.Column("as_of_date", sa.DateTime(), nullable=False),
sa.Column("score", sa.Numeric(), nullable=True),
sa.Column("rank_abs", sa.Integer(), nullable=True),
sa.Column("rank_pct", sa.Float(), nullable=True),
sa.Column("matrix_uuid", sa.Text(), nullable=True),
sa.Column("test_label_window", sa.Interval(), nullable=True),
sa.ForeignKeyConstraint(["model_id"], ["results.models.model_id"]),
Comment (thcrock, Contributor, Apr 4, 2019):
This schema should be model_metadata (Alena hit this error)

Comment (Contributor):
I actually ended up doing this when I was fixing conflicts (I had to fix the Alembic revisions to get the tests working after conflict fixing)

sa.PrimaryKeyConstraint("model_id", "entity_id", "as_of_date"),
schema="results",
)

4 changes: 3 additions & 1 deletion src/triage/component/results_schema/schema.py
@@ -31,6 +31,7 @@
"CREATE SCHEMA IF NOT EXISTS model_metadata;"
" CREATE SCHEMA IF NOT EXISTS test_results;"
" CREATE SCHEMA IF NOT EXISTS train_results;"
" CREATE SCHEMA IF NOT EXISTS production;"
)

event.listen(Base.metadata, "before_create", DDL(schemas))
@@ -86,14 +87,15 @@ class ModelGroup(Base):
class ListPrediction(Base):

__tablename__ = "list_predictions"
Comment (Contributor):
I wonder if it might be a little more natural to make this production.predictions to keep consistent with the train_results and test_results tables?

Comment (Contributor):
I agree

__table_args__ = {"schema": "model_metadata"}
__table_args__ = {"schema": "production"}

model_id = Column(
Integer, ForeignKey("model_metadata.models.model_id"), primary_key=True
)
entity_id = Column(BigInteger, primary_key=True)
as_of_date = Column(DateTime, primary_key=True)
score = Column(Numeric)
label_value = Column(Integer)
rank_abs = Column(Integer)
rank_pct = Column(Float)
matrix_uuid = Column(Text)
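As a usage note, a downstream consumer of the relocated table might pull a top-k list per model and date via rank_abs. A hypothetical query built from the schema above (table and column names come from the diff; the parameter style assumes a psycopg2-like driver):

```python
def top_k_list_query():
    # model_id, as_of_date, and k are bound by the caller at execution time
    return (
        "SELECT entity_id, score, rank_abs, rank_pct "
        "FROM production.list_predictions "
        "WHERE model_id = %(model_id)s AND as_of_date = %(as_of_date)s "
        "ORDER BY rank_abs ASC "
        "LIMIT %(k)s"
    )

query = top_k_list_query()
assert "production.list_predictions" in query
assert "ORDER BY rank_abs" in query
```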