Adding Random Cut Forest support to client libraries Python SDK (#161)

juliodelgadoaws · winstonaws · commit b92572083243 · 2018-04-25T14:29:47.000-07:00
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -2,6 +2,11 @@
 CHANGELOG
 =========
 
+1.2.4-dev
+=========
+
+* feature: Estimators: add support for Amazon Random Cut Forest algorithm
+
 1.2.3
 =========
 * bug-fix: Fix local mode not using the right s3 bucket
diff --git a/README.rst b/README.rst
@@ -47,7 +47,7 @@ You can install from source by cloning this repository and issuing a pip install
 
     git clone https://github.com/aws/sagemaker-python-sdk.git
     python setup.py sdist
-    pip install dist/sagemaker-1.2.3.tar.gz
+    pip install dist/sagemaker-1.2.4.tar.gz
 
 Supported Python versions
 ~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1575,7 +1575,7 @@ Amazon SageMaker provides several built-in machine learning algorithms that you
 
 The full list of algorithms is available on the AWS website: https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html
 
-SageMaker Python SDK includes Estimator wrappers for the AWS K-means, Principal Components Analysis(PCA), Linear Learner, Factorization Machines, Latent Dirichlet Allocation(LDA) and Neural Topic Model(NTM) algorithms.
+SageMaker Python SDK includes Estimator wrappers for the AWS K-means, Principal Components Analysis(PCA), Linear Learner, Factorization Machines, Latent Dirichlet Allocation(LDA), Neural Topic Model(NTM) and Random Cut Forest algorithms.
 
 Definition and usage
 ~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/conf.py b/doc/conf.py
@@ -19,7 +19,7 @@ def __getattr__(cls, name):
                 'numpy', 'scipy', 'scipy.sparse']
 sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
 
-version = '1.2.3'
+version = '1.2.4'
 project = u'sagemaker'
 
 # Add any Sphinx extension module names here, as strings. They can be extensions
diff --git a/doc/randomcutforest.rst b/doc/randomcutforest.rst
@@ -0,0 +1,22 @@
+Random Cut Forest
+--------------------
+
+The Amazon SageMaker Random Cut Forest algorithm.
+
+.. autoclass:: sagemaker.RandomCutForest
+    :members:
+    :undoc-members:
+    :show-inheritance:
+    :inherited-members:
+    :exclude-members: image, num_trees, num_samples_per_tree, eval_metrics, MINI_BATCH_SIZE
+
+
+.. autoclass:: sagemaker.RandomCutForestModel
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+.. autoclass:: sagemaker.RandomCutForestPredictor
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/setup.py b/setup.py
@@ -11,7 +11,7 @@ def read(fname):
 
 
 setup(name="sagemaker",
-      version="1.2.3",
+      version="1.2.4",
       description="Open source library for training and deploying models on Amazon SageMaker.",
       packages=find_packages('src'),
       package_dir={'': 'src'},
diff --git a/src/sagemaker/__init__.py b/src/sagemaker/__init__.py
@@ -20,6 +20,7 @@
 from sagemaker.amazon.factorization_machines import FactorizationMachines, FactorizationMachinesModel
 from sagemaker.amazon.factorization_machines import FactorizationMachinesPredictor
 from sagemaker.amazon.ntm import NTM, NTMModel, NTMPredictor
+from sagemaker.amazon.randomcutforest import RandomCutForest, RandomCutForestModel, RandomCutForestPredictor
 
 from sagemaker.local.local_session import LocalSession
 
@@ -36,5 +37,6 @@
            LinearLearnerModel, LinearLearnerPredictor,
            LDA, LDAModel, LDAPredictor,
            FactorizationMachines, FactorizationMachinesModel, FactorizationMachinesPredictor,
+           RandomCutForest, RandomCutForestModel, RandomCutForestPredictor,
            Model, NTM, NTMModel, NTMPredictor, RealTimePredictor, Session, LocalSession,
            container_def, s3_input, production_variant, get_execution_role]
diff --git a/src/sagemaker/amazon/amazon_estimator.py b/src/sagemaker/amazon/amazon_estimator.py
@@ -228,7 +228,8 @@ def upload_numpy_to_s3_shards(num_shards, s3, bucket, key_prefix, array, labels=
 
 def registry(region_name, algorithm=None):
     """Return docker registry for the given AWS region"""
-    if algorithm in [None, "pca", "kmeans", "linear-learner", "factorization-machines", "ntm"]:
+    if algorithm in [None, "pca", "kmeans", "linear-learner", "factorization-machines", "ntm",
+                     "randomcutforest"]:
         account_id = {
             "us-east-1": "382416733822",
             "us-east-2": "404615174143",
diff --git a/src/sagemaker/amazon/randomcutforest.py b/src/sagemaker/amazon/randomcutforest.py
@@ -0,0 +1,126 @@
+# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from sagemaker.amazon.amazon_estimator import AmazonAlgorithmEstimatorBase, registry
+from sagemaker.amazon.common import numpy_to_record_serializer, record_deserializer
+from sagemaker.amazon.hyperparameter import Hyperparameter as hp  # noqa
+from sagemaker.amazon.validation import ge, le
+from sagemaker.predictor import RealTimePredictor
+from sagemaker.model import Model
+from sagemaker.session import Session
+
+
+class RandomCutForest(AmazonAlgorithmEstimatorBase):
+
+    repo_name = 'randomcutforest'
+    repo_version = 1
+    MINI_BATCH_SIZE = 1000
+
+    eval_metrics = hp(name='eval_metrics',
+                      validation_message='A comma separated list of "accuracy" or "precision_recall_fscore"',
+                      data_type=list)
+
+    num_trees = hp('num_trees', (ge(50), le(1000)), 'An integer in [50, 1000]', int)
+    num_samples_per_tree = hp('num_samples_per_tree', (ge(1), le(2048)), 'An integer in [1, 2048]', int)
+    feature_dim = hp("feature_dim", (ge(1), le(10000)), 'An integer in [1, 10000]', int)
+
+    def __init__(self, role, train_instance_count, train_instance_type,
+                 num_samples_per_tree=None, num_trees=None, eval_metrics=None, **kwargs):
+        """RandomCutForest is :class:`Estimator` used for anomaly detection.
+
+        This Estimator may be fit via calls to
+        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`. It requires Amazon
+        :class:`~sagemaker.amazon.record_pb2.Record` protobuf serialized data to be stored in S3.
+        There is a utility :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.record_set` that
+        can be used to upload data to S3 and create a :class:`~sagemaker.amazon.amazon_estimator.RecordSet` to be
+        passed to the `fit` call.
+
+        To learn more about the Amazon protobuf Record class and how to prepare bulk data in this format, please
+        consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html
+
+        After this Estimator is fit, model data is stored in S3. The model may be deployed to an Amazon SageMaker
+        Endpoint by invoking :meth:`~sagemaker.amazon.estimator.EstimatorBase.deploy`. As well as deploying an
+        Endpoint, deploy returns a :class:`~sagemaker.amazon.randomcutforest.RandomCutForestPredictor` object that
+        can be used for inference calls using the trained model hosted in the SageMaker Endpoint.
+
+        RandomCutForest Estimators can be configured by setting hyperparameters. The available hyperparameters for
+        RandomCutForest are documented below.
+
+        For further information on the AWS Random Cut Forest algorithm,
+        please consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/randomcutforest.html
+
+        Args:
+            role (str): An AWS IAM role (either name or full ARN). The Amazon SageMaker training jobs and
+                APIs that create Amazon SageMaker endpoints use this role to access
+                training data and model artifacts. After the endpoint is created,
+                the inference code might use the IAM role, if accessing AWS resources.
+            train_instance_count (int): Number of Amazon EC2 instances to use for training.
+            train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
+            num_samples_per_tree (int): Optional. The number of samples used to build each tree in the forest.
+                The total number of samples drawn from the train dataset is num_trees * num_samples_per_tree.
+            num_trees (int): Optional. The number of trees used in the forest.
+            eval_metrics (list): Optional. JSON list of metric types to be used for reporting the score for the model.
+                Allowed values are "accuracy" and "precision_recall_fscore" (positive and negative precision, recall,
+                and f1 scores). If test data is provided, the score is reported in terms of all requested metrics.
+            **kwargs: base class keyword argument values.
+        """
+
+        super(RandomCutForest, self).__init__(role, train_instance_count, train_instance_type, **kwargs)
+        self.num_samples_per_tree = num_samples_per_tree
+        self.num_trees = num_trees
+        self.eval_metrics = eval_metrics
+
+    def create_model(self):
+        """Return a :class:`~sagemaker.amazon.randomcutforest.RandomCutForestModel` referencing the latest
+        s3 model data produced by this Estimator."""
+
+        return RandomCutForestModel(self.model_data, self.role, sagemaker_session=self.sagemaker_session)
+
+    def fit(self, records, mini_batch_size=None, **kwargs):
+        if mini_batch_size is None:
+            mini_batch_size = RandomCutForest.MINI_BATCH_SIZE
+        elif mini_batch_size != RandomCutForest.MINI_BATCH_SIZE:
+            raise ValueError("Random Cut Forest uses a fixed mini_batch_size of {}"
+                             .format(RandomCutForest.MINI_BATCH_SIZE))
+        super(RandomCutForest, self).fit(records, mini_batch_size, **kwargs)
+
+
+class RandomCutForestPredictor(RealTimePredictor):
+    """Assigns an anomaly score to each of the datapoints provided.
+
+    The implementation of :meth:`~sagemaker.predictor.RealTimePredictor.predict` in this
+    `RealTimePredictor` requires a numpy ``ndarray`` as input. The array should contain the
+    same number of columns as the feature-dimension of the data used to fit the model this
+    Predictor performs inference on.
+
+    :meth:`predict()` returns a list of :class:`~sagemaker.amazon.record_pb2.Record` objects,
+    one for each row in the input. Each row's score is stored in the key ``score`` of the
+    ``Record.label`` field."""
+
+    def __init__(self, endpoint, sagemaker_session=None):
+        super(RandomCutForestPredictor, self).__init__(endpoint, sagemaker_session,
+                                                       serializer=numpy_to_record_serializer(),
+                                                       deserializer=record_deserializer())
+
+
+class RandomCutForestModel(Model):
+    """Reference RandomCutForest s3 model data. Calling :meth:`~sagemaker.model.Model.deploy` creates an
+    Endpoint and returns a Predictor that calculates anomaly scores for datapoints."""
+
+    def __init__(self, model_data, role, sagemaker_session=None):
+        sagemaker_session = sagemaker_session or Session()
+        repo = '{}:{}'.format(RandomCutForest.repo_name, RandomCutForest.repo_version)
+        image = '{}/{}'.format(registry(sagemaker_session.boto_session.region_name,
+                                        RandomCutForest.repo_name), repo)
+        super(RandomCutForestModel, self).__init__(model_data, image, role,
+                                                   predictor_cls=RandomCutForestPredictor,
+                                                   sagemaker_session=sagemaker_session)
diff --git a/tests/integ/test_randomcutforest.py b/tests/integ/test_randomcutforest.py
@@ -0,0 +1,43 @@
+# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+import numpy as np
+
+from sagemaker import RandomCutForest, RandomCutForestModel
+from sagemaker.utils import name_from_base
+from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
+
+
+def test_randomcutforest(sagemaker_session):
+    with timeout(minutes=15):
+        # Generate a thousand 14-dimensional datapoints.
+        feature_num = 14
+        train_input = np.random.rand(1000, feature_num)
+
+        rcf = RandomCutForest(role='SageMakerRole', train_instance_count=1, train_instance_type='ml.c4.xlarge',
+                              num_trees=50, num_samples_per_tree=20, sagemaker_session=sagemaker_session,
+                              base_job_name='test-randomcutforest')
+
+        rcf.fit(rcf.record_set(train_input))
+
+    endpoint_name = name_from_base('randomcutforest')
+    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
+        model = RandomCutForestModel(rcf.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
+        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
+
+        predict_input = np.random.rand(1, feature_num)
+        result = predictor.predict(predict_input)
+
+        assert len(result) == 1
+        for record in result:
+            assert record.label["score"] is not None
+            assert len(record.label["score"].float32_tensor.values) == 1
diff --git a/tests/unit/test_randomcutforest.py b/tests/unit/test_randomcutforest.py