|
| 1 | +# simple Lambda function training a scikit-learn model on the digits classification dataset |
| 2 | +# see https://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html |
| 3 | + |
| 4 | +import os |
| 5 | +import boto3 |
| 6 | +import numpy |
| 7 | +from sklearn import datasets, svm, metrics |
| 8 | +from sklearn.utils import Bunch |
| 9 | +from sklearn.model_selection import train_test_split |
| 10 | +from joblib import dump, load |
| 11 | +import io |
| 12 | + |
| 13 | + |
def handler(event, context):
    """Train an SVM on the digits dataset and upload the results to S3.

    Loads the digits dataset via ``load_digits``, fits a support vector
    classifier on the first half of the samples, then uploads the fitted
    model (``model.joblib``) and the held-out feature matrix
    (``test-set.npy``) to the ``pods-test`` bucket.

    Args:
        event: Lambda invocation payload (not used).
        context: Lambda runtime context (not used).

    Returns:
        None.
    """
    bucket = "pods-test"

    digits = load_digits()

    # flatten the images: one row of pixel values per sample
    n_samples = len(digits.images)
    data = digits.images.reshape((n_samples, -1))

    # Create a classifier: a support vector classifier
    clf = svm.SVC(gamma=0.001)

    # Split data into 50% train and 50% test subsets; shuffle=False keeps
    # the split deterministic. The test labels are not needed here.
    X_train, X_test, y_train, _ = train_test_split(
        data, digits.target, test_size=0.5, shuffle=False
    )

    # Learn the digits on the train subset
    clf.fit(X_train, y_train)

    s3_client = boto3.client("s3")

    # Dump the trained model to S3
    model_buffer = io.BytesIO()
    dump(clf, model_buffer)
    s3_client.put_object(Body=model_buffer.getvalue(), Bucket=bucket, Key="model.joblib")

    # Save the test-set to the S3 bucket. Serialize in memory (like the
    # model above) instead of writing to the working directory, which is
    # read-only inside a deployed Lambda function.
    test_set_buffer = io.BytesIO()
    numpy.save(test_set_buffer, X_test)
    s3_client.put_object(Body=test_set_buffer.getvalue(), Bucket=bucket, Key="test-set.npy")
| 43 | + |
| 44 | + |
def load_digits(*, n_class=10, return_X_y=False, as_frame=False):
    """Load the digits dataset from the ``pods-test`` S3 bucket.

    Based on ``load_digits`` in ``sklearn/datasets/_base.py``, but reads
    ``digits.csv.gz`` and ``digits.rst`` from S3 instead of from the
    scikit-learn package data.

    Args:
        n_class: Keep only samples whose label is below this value.
        return_X_y: If true, return ``(data, target)`` instead of a Bunch.
        as_frame: If true, convert data and target to pandas objects and
            also provide the combined ``frame``.

    Returns:
        ``(data, target)`` when ``return_X_y`` is true; otherwise a
        ``Bunch`` with ``data``, ``target``, ``frame``, ``feature_names``,
        ``target_names``, ``images`` and ``DESCR``.
    """
    # Local import: gzip is only needed here and is not imported at file level.
    import gzip

    bucket = "pods-test"

    # Read both objects straight into memory; the working directory of a
    # deployed Lambda function is read-only, so nothing is written to disk.
    s3_client = boto3.client("s3")
    csv_gz = s3_client.get_object(Bucket=bucket, Key="digits.csv.gz")["Body"].read()
    descr = s3_client.get_object(Bucket=bucket, Key="digits.rst")["Body"].read().decode("utf-8")

    # code below based on sklearn/datasets/_base.py

    csv_text = gzip.decompress(csv_gz).decode("utf-8")
    data = numpy.loadtxt(io.StringIO(csv_text), delimiter=',')

    # Last column holds the label. The numpy.int alias was removed in
    # NumPy 1.24; the builtin int is the equivalent dtype.
    target = data[:, -1].astype(int, copy=False)
    flat_data = data[:, :-1]
    # Each row of 64 pixel values is also exposed as an 8x8 image.
    images = flat_data.reshape(-1, 8, 8)

    if n_class < 10:
        idx = target < n_class
        flat_data, target = flat_data[idx], target[idx]
        images = images[idx]

    feature_names = ['pixel_{}_{}'.format(row_idx, col_idx)
                     for row_idx in range(8)
                     for col_idx in range(8)]

    frame = None
    target_columns = ['target', ]
    if as_frame:
        # The helper is private to sklearn.datasets._base and is not
        # re-exported by the sklearn.datasets package.
        from sklearn.datasets._base import _convert_data_dataframe

        frame, flat_data, target = _convert_data_dataframe(
            "load_digits", flat_data, target, feature_names, target_columns)

    if return_X_y:
        return flat_data, target

    return Bunch(data=flat_data,
                 target=target,
                 frame=frame,
                 feature_names=feature_names,
                 target_names=numpy.arange(10),
                 images=images,
                 DESCR=descr)
0 commit comments