-
Notifications
You must be signed in to change notification settings - Fork 61
Risklist module for production #631
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
a563694
9750c3e
360f8f9
999a46f
372d9c8
16645bc
914ad76
c92bd8b
d3c3ba9
0e92fb0
dbd4578
f7d49e5
dee930f
1769b00
52c9ff0
173167a
f6b2d02
acffa67
43c1919
7dfb7e1
cc9fe4a
5951565
537f6c8
b429540
0045aa5
9dc3697
da870d5
6768ee5
7d6a420
b8fe6d8
8207fcd
45c9d68
a665e7e
ead882b
de85f10
ad860cd
40466d5
83c7385
f97089b
6f0af1c
42bccaa
3ec377f
1c4da24
35bd978
9f5a099
ba84822
83e0f66
d6f14f5
d76359b
9698500
a8a29f1
694edcc
583e9bd
815a258
5e183fe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,30 @@ | ||
| from triage.risklist import generate_risk_list | ||
| from tests.utils import sample_config, populate_source_data | ||
| from triage.experiments import SingleThreadedExperiment | ||
| from triage.validation_primitives import table_should_have_data | ||
|
|
||
|
|
||
| def test_risklist(db_engine, project_storage): | ||
| # given a model id and as-of-date <= today | ||
| # and the model id is trained and is linked to an experiment with feature and cohort config | ||
| # generate records in listpredictions | ||
| # the # of records should equal the size of the cohort for that date | ||
| populate_source_data(db_engine) | ||
| SingleThreadedExperiment( | ||
| sample_config(), | ||
| db_engine=db_engine, | ||
| project_path=project_storage.project_path | ||
| ).run() | ||
|
|
||
| model_id = 1 | ||
| as_of_date = '2013-01-01' | ||
| generate_risk_list( | ||
| db_engine=db_engine, | ||
| matrix_storage_engine=project_storage.matrix_storage_engine(), | ||
| model_storage_engine=project_storage.model_storage_engine(), | ||
| model_id=model_id, | ||
| as_of_date=as_of_date) | ||
| table_should_have_data( | ||
| db_engine=db_engine, | ||
| table_name="production.list_predictions", | ||
| ) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,7 @@ | |
| TrainEvaluation, | ||
| TestPrediction, | ||
| TrainPrediction, | ||
| ListPrediction | ||
| ) | ||
| from triage.util.pandas import downcast_matrix | ||
|
|
||
|
|
@@ -435,7 +436,7 @@ def columns(self, include_label=False): | |
| if include_label: | ||
| return columns | ||
| else: | ||
| return [col for col in columns if col != self.metadata["label_name"]] | ||
| return [col for col in columns if col != self.metadata.get("label_name", None)] | ||
|
|
||
| @property | ||
| def label_column_name(self): | ||
|
|
@@ -479,6 +480,8 @@ def matrix_type(self): | |
| return TrainMatrixType | ||
| elif self.metadata["matrix_type"] == "test": | ||
| return TestMatrixType | ||
| elif self.metadata["matrix_type"] == "production": | ||
| return ProductionMatrixType | ||
| else: | ||
| raise Exception( | ||
| """matrix metadata for matrix {} must contain 'matrix_type' | ||
|
|
@@ -525,7 +528,10 @@ def matrix_with_sorted_columns(self, columns): | |
|
|
||
| @property | ||
| def full_matrix_for_saving(self): | ||
| return self.design_matrix.assign(**{self.label_column_name: self.labels}) | ||
| if self.labels is not None: | ||
| return self.design_matrix.assign(**{self.label_column_name: self.labels}) | ||
| else: | ||
| return self.design_matrix | ||
|
|
||
| def load_metadata(self): | ||
| """Load metadata from storage""" | ||
|
|
@@ -644,3 +650,9 @@ class TrainMatrixType(object): | |
| evaluation_obj = TrainEvaluation | ||
| prediction_obj = TrainPrediction | ||
| is_test = False | ||
|
|
||
|
|
||
| class ProductionMatrixType(object): | ||
| string_name = "production" | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| prediction_obj = ListPrediction | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since we're introducing a new matrix type, I think
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmmm... would it make sense to remove the
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not fond of comparisons against magic strings. What about making it more specifically about the metric groups? Like what if the field was metric_config_key, and for train it was 'training_metric_groups', for test it was 'testing_metric_groups', and for production it was None?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, that's a fair point about avoiding magic strings -- having it keep track of the appropriate
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, just coming into this discussion quite late. It seems to me that the discussion here is exposing decisions we committed to early on and haven't tried to address. I feel like we always saw the It also seems like which metrics to use is not properly a property of the matrix but of the task being performed. Perhaps we should be passing the metrics groups to the evaluator instead of encoding in the matrix what the evaluator should do to it. I think maybe we can move forward with this as a way to get the feature we want now, but this discussion is making the illogic of these matrix attributes more apparent and we may have some deeper technical debt to service here rather than trying to make the current situation totally coherent.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know if you all discussed this on Tuesday, but here are a couple of possible solutions for Triage v 5.0.0. The first is probably easier to implement, but I think it is the less good solution:
An alternative:
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Moved the above comment to #855 |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,26 @@ | ||
| """move list_predictions table to production schema | ||
|
|
||
| Revision ID: 1b990cbc04e4 | ||
| Revises: cfd5c3386014 | ||
| Create Date: 2019-02-20 16:41:22.810452 | ||
|
|
||
| """ | ||
| from alembic import op | ||
| import sqlalchemy as sa | ||
|
|
||
|
|
||
| # revision identifiers, used by Alembic. | ||
| revision = '1b990cbc04e4' | ||
| down_revision = 'cfd5c3386014' | ||
| branch_labels = None | ||
| depends_on = None | ||
|
|
||
|
|
||
| def upgrade(): | ||
| op.execute("CREATE SCHEMA IF NOT EXISTS production") | ||
| op.execute("ALTER TABLE model_metadata.list_predictions SET SCHEMA production;") | ||
|
|
||
|
|
||
| def downgrade(): | ||
| op.execute("ALTER TABLE production.list_predictions SET SCHEMA model_metadata;") | ||
| op.execute("DROP SCHEMA IF EXISTS production") |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,54 @@ | ||
| """add label_value to production table | ||
|
|
||
| Revision ID: 264786a9fe85 | ||
| Revises: 1b990cbc04e4 | ||
| Create Date: 2019-02-26 13:17:05.365654 | ||
|
|
||
| """ | ||
| from alembic import op | ||
| import sqlalchemy as sa | ||
|
|
||
|
|
||
| # revision identifiers, used by Alembic. | ||
| revision = '264786a9fe85' | ||
| down_revision = '1b990cbc04e4' | ||
| branch_labels = None | ||
| depends_on = None | ||
|
|
||
|
|
||
| def upgrade(): | ||
| op.drop_table("list_predictions", schema="production") | ||
| op.create_table( | ||
| "list_predictions", | ||
| sa.Column("model_id", sa.Integer(), nullable=False), | ||
| sa.Column("entity_id", sa.BigInteger(), nullable=False), | ||
| sa.Column("as_of_date", sa.DateTime(), nullable=False), | ||
| sa.Column("score", sa.Numeric(), nullable=True), | ||
| sa.Column('label_value', sa.Integer, nullable=True), | ||
| sa.Column("rank_abs", sa.Integer(), nullable=True), | ||
| sa.Column("rank_pct", sa.Float(), nullable=True), | ||
| sa.Column("matrix_uuid", sa.Text(), nullable=True), | ||
| sa.Column("test_label_window", sa.Interval(), nullable=True), | ||
| sa.ForeignKeyConstraint(["model_id"], ["model_metadata.models.model_id"]), | ||
| sa.PrimaryKeyConstraint("model_id", "entity_id", "as_of_date"), | ||
| schema="production", | ||
| ) | ||
|
|
||
|
|
||
| def downgrade(): | ||
| op.drop_table("list_predictions", schema="production") | ||
| op.create_table( | ||
| "list_predictions", | ||
| sa.Column("model_id", sa.Integer(), nullable=False), | ||
| sa.Column("entity_id", sa.BigInteger(), nullable=False), | ||
| sa.Column("as_of_date", sa.DateTime(), nullable=False), | ||
| sa.Column("score", sa.Numeric(), nullable=True), | ||
| sa.Column("rank_abs", sa.Integer(), nullable=True), | ||
| sa.Column("rank_pct", sa.Float(), nullable=True), | ||
| sa.Column("matrix_uuid", sa.Text(), nullable=True), | ||
| sa.Column("test_label_window", sa.Interval(), nullable=True), | ||
| sa.ForeignKeyConstraint(["model_id"], ["results.models.model_id"]), | ||
|
||
| sa.PrimaryKeyConstraint("model_id", "entity_id", "as_of_date"), | ||
| schema="results", | ||
| ) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,6 +31,7 @@ | |
| "CREATE SCHEMA IF NOT EXISTS model_metadata;" | ||
| " CREATE SCHEMA IF NOT EXISTS test_results;" | ||
| " CREATE SCHEMA IF NOT EXISTS train_results;" | ||
| " CREATE SCHEMA IF NOT EXISTS production;" | ||
| ) | ||
|
|
||
| event.listen(Base.metadata, "before_create", DDL(schemas)) | ||
|
|
@@ -86,14 +87,15 @@ class ModelGroup(Base): | |
| class ListPrediction(Base): | ||
|
|
||
| __tablename__ = "list_predictions" | ||
|
||
| __table_args__ = {"schema": "model_metadata"} | ||
| __table_args__ = {"schema": "production"} | ||
|
|
||
| model_id = Column( | ||
| Integer, ForeignKey("model_metadata.models.model_id"), primary_key=True | ||
| ) | ||
| entity_id = Column(BigInteger, primary_key=True) | ||
| as_of_date = Column(DateTime, primary_key=True) | ||
| score = Column(Numeric) | ||
| label_value = Column(Integer) | ||
| rank_abs = Column(Integer) | ||
| rank_pct = Column(Float) | ||
| matrix_uuid = Column(Text) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This simple assertion was a good start but we should go further. Does it make sense to make assertions about the size of the table? How about the contents? We'd expect all of these rows to have the same date/model id and stuff like that, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, if I recall correctly, the model_metadata.matrices table will also get a row since we used the MatrixBuilder, so we should make sure that row looks reasonable.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@thcrock