forked from Jacoo-Zhao/AI-Studio-ClearML
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy paths2_data_preprocessing.py
More file actions
52 lines (44 loc) · 1.65 KB
/
s2_data_preprocessing.py
File metadata and controls
52 lines (44 loc) · 1.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import pickle
from clearml import Task, StorageManager
from sklearn.model_selection import train_test_split
# import pandas as pd
#
# Register this script with ClearML as a task. Per the original author's
# note, everything from this point on is logged automatically.
task = Task.init(
    project_name="AI_Studio_Demo",
    task_name="Pipeline step 2 process dataset",
)

# Step parameters.
#   dataset_task_id: id of the task whose 'dataset' artifact is consumed
#                    below (this script reads no other dataset source).
#   random_state, test_size: forwarded to train_test_split.
args = dict(
    dataset_task_id='',
    random_state=42,
    test_size=0.2,
)

# Attach the parameters to the task so they can be overridden from outside
# the code, then echo the values in use.
task.connect(args)
print(f'Arguments: {args}')

# Only create the task at this point; it is meant to be executed later.
# NOTE(review): see ClearML's Task.execute_remotely docs for what happens to
# the local process after this call.
task.execute_remotely()
# Fetch the input dataset from the upstream task's 'dataset' artifact.
# A dataset_task_id is mandatory; fail fast when it was not provided.
if not args['dataset_task_id']:
    raise ValueError("Missing dataset link")

dataset_upload_task = Task.get_task(task_id=args['dataset_task_id'])
print('Input task id={} artifacts {}'.format(
    args['dataset_task_id'], list(dataset_upload_task.artifacts.keys())))

# Download the artifact and get the path of the local copy.
iris_pickle = dataset_upload_task.artifacts['dataset'].get_local_copy()

# Load the pickled dataset; the context manager guarantees the file handle
# is closed even if unpickling fails.
# SECURITY NOTE: pickle.load can execute arbitrary code, so the referenced
# task's artifact must come from a trusted source.
with open(iris_pickle, 'rb') as dataset_file:
    iris = pickle.load(dataset_file)

# "Process" the data: split features and targets into train / test subsets
# using the connected (and therefore externally overridable) parameters.
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=args['test_size'], random_state=args['random_state'])

# Publish each split as an artifact of this task, under the names that
# later steps are presumably expected to look up - verify against consumers.
print('Uploading process dataset')
task.upload_artifact('X_train', X_train)
task.upload_artifact('X_test', X_test)
task.upload_artifact('y_train', y_train)
task.upload_artifact('y_test', y_test)

print('Notice, artifacts are uploaded in the background')
print('Done🔥')