Skip to content
This repository was archived by the owner on Aug 27, 2024. It is now read-only.

Commit a1d2032

Browse files
authored
Remove local execution (#67)
* added documentation * remove unused import from test * refactored to not use local_trainer any more * removed local trainer and enabled trainer from context * add required packages * added docs * remove local training test * fix overwrite & add docs * update sample * fix-create-training * provide support for Environment creation * enabled environment building * update docs & training script Co-authored-by: Sam <sam.vanhoutte@marchitec.be>
1 parent a3b3e08 commit a1d2032

27 files changed

+883
-1434
lines changed

.azureml/config.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"subscription_id": "",
3-
"resource_group": "",
4-
"workspace_name": ""
2+
"subscription_id": "c1537527-c126-428d-8f72-1ac9f2c63c1f",
3+
"resource_group": "codit-ai-incubators",
4+
"workspace_name": "codit-ai-incubators-ml"
55
}

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,4 +144,5 @@ outputs/*
144144
*.whl
145145
tests/resources/test_training/
146146
.azureml/config.json
147-
c
147+
c
148+
unit-test/*
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__all__ = ['aml_environment', 'environment_factory', 'environment', 'local_environment']
1+
__all__ = ['aml_environment', 'environment_factory', 'environment', 'errors']

arcus/azureml/environment/aml_environment.py

Lines changed: 87 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,19 @@
55
from azureml.data.datapath import DataPath
66
import os
77
import glob
8-
from azureml.data.dataset_error_handling import DatasetValidationError, DatasetExecutionError
9-
from azureml.data.dataset_type_definitions import PromoteHeadersBehavior
108
import arcus.azureml.environment.environment as env
119
from arcus.azureml.experimenting import trainer
1210
from arcus.azureml.experimenting import aml_trainer
13-
from azureml.dataprep.api.functions import get_portable_path
14-
from azureml.dataprep import col, get_stream_properties, SummaryColumnsValue, SummaryFunction
11+
from azureml.data.dataset_error_handling import DatasetValidationError, DatasetExecutionError
12+
from azureml.data.dataset_type_definitions import PromoteHeadersBehavior
1513

1614
class AzureMLEnvironment(env.WorkEnvironment):
1715
is_connected: bool = False
1816
__config_file: str = '.azureml/config.json'
1917
__workspace: Workspace = None
2018
__datastore_path: str = 'data'
2119

22-
def __init__(self, config_file: str = None, datastore_path: str = None, subscription_id: str = None,
20+
def __init__(self, config_file: str = None, datastore_path: str = None, subscription_id: str = None, connect_workspace: bool = True,
2321
resource_group: str = None, workspace_name: str = None, write_config: bool = False, from_context: bool = False):
2422
'''
2523
This allows a user to specify to work connected or disconnected.
@@ -48,58 +46,118 @@ def __init__(self, config_file: str = None, datastore_path: str = None, subscrip
4846
if write_config:
4947
self.__workspace.write_config(self.__config_file)
5048

51-
else:
49+
elif connect_workspace:
5250
# A config file is passed, so we'll validate the existence and connect
5351
if not os.path.exists(self.__config_file):
5452
raise FileNotFoundError('The config file ' + self.__config_file + ' does not exist. Please verify and try again')
5553
# There is a config file, so we'll connect
5654
self.__connect_from_config_file(self.__config_file)
5755

58-
self.is_connected = True
56+
self.is_connected = connect_workspace
5957

60-
def load_tabular_dataset(self, dataset_name: str) -> pd.DataFrame:
58+
@classmethod
59+
def CreateFromContext(cls, datastore_path: str = None):
60+
'''
61+
Creates a WorkEnvironment and returns the correct implementation, based on the configuration
62+
Args:
63+
datastore_path (str): the name of a DataStore in AzureML that contains Datasets
64+
Returns:
65+
AzureMLEnvironment: an instance of WorkEnvironment allowing the user to work connected.
66+
'''
67+
return cls(datastore_path = datastore_path, from_context=True)
68+
69+
@classmethod
70+
def Create(cls, subscription_id: str = None, resource_group: str = None, workspace_name: str = None,
71+
write_config: bool = False, config_file: str = None, datastore_path: str = None):
72+
'''
73+
Creates a WorkEnvironment and returns the correct implementation, based on the configuration
74+
Args:
75+
subscription_id (str): The subscription id where the AzureML service resides
76+
resource_group (str): The resource group that contains the AzureML workspace
77+
workspace_name (str): Name of the AzureML workspace
78+
write_config (bool): If True, the WorkSpace configuration will be persisted in the given (or default) config file
79+
config_file (str): The name of the config file (defaulting to .azureml/config.json) that contains the Workspace parameters
80+
datastore_path (str): the name of a DataStore in AzureML that contains Datasets
81+
Returns:
82+
AzureMLEnvironment: an instance of WorkEnvironment allowing the user to work connected.
83+
'''
84+
return cls(config_file = config_file, datastore_path = datastore_path,
85+
subscription_id=subscription_id, resource_group=resource_group,
86+
workspace_name= workspace_name, write_config = write_config)
87+
88+
def load_tabular_dataset(self, dataset_name: str, cloud_storage: bool = True) -> pd.DataFrame:
6189
'''
62-
Loads a tabular dataset by a given name. the implementation will load the Dataset by name from the AzureML Workspace
90+
Loads a tabular dataset by a given name.
91+
The implementation will load the Dataset by name from the AzureML Workspace
92+
When configured locally, the data frame will be loaded from a file in the datastore_path with name {dataset_name}.csv
6393
Args:
6494
dataset_name (str): The name of the dataset to load
95+
cloud_storage (bool): When changed to False, the dataset will be loaded from the local folder
6596
Returns:
6697
pd.DataFrame: The dataset, loaded as a DataFrame
6798
'''
6899
# Connecting data set
69-
_dataset = Dataset.get_by_name(self.__workspace, name=dataset_name)
70-
return _dataset.to_pandas_dataframe()
100+
if cloud_storage:
101+
_dataset = Dataset.get_by_name(self.__workspace, name=dataset_name)
102+
return _dataset.to_pandas_dataframe()
103+
else:
104+
_file_name = os.path.join(self.__datastore_path, dataset_name + '.csv')
105+
return pd.read_csv(_file_name)
71106

72-
def load_tabular_partition(self, partition_name: str, datastore_name: str = None, columns: np.array = None, first_row_header: bool = False) -> pd.DataFrame:
107+
108+
def load_tabular_partition(self, partition_name: str, datastore_name: str = None, columns: np.array = None, first_row_header: bool = False, cloud_storage: bool = True) -> pd.DataFrame:
73109
'''
74-
Loads a partition from a tabular dataset. the implementation will connect to the DataStore and get all delimited files matching the partition_name
110+
Loads a partition from a tabular dataset.
111+
The implementation will connect to the DataStore and get all delimited files matching the partition_name
112+
When configured locally, the implementation will append all files in the datastore_path with name {partition_name}.csv
75113
Args:
76114
partition_name (str): The name of the partition as a wildcard filter. Example: B* will take all files starting with B, ending with csv
77115
columns: (np.array): The column names to assign to the dataframe
78-
datastore_path (str): The name of a DataStore in AzureML that contains Datasets
116+
datastore_path (str): The name of a DataStore that contains Datasets
117+
cloud_storage (bool): When changed to False, the dataset will be loaded from the local folder
79118
Returns:
80119
pd.DataFrame: The dataset, loaded as a DataFrame
81120
'''
82121
if not datastore_name:
83122
# No datastore name is given, so we'll take the default one
84123
datastore_name = self.__datastore_path
85124

86-
# Connecting data store
87-
datastore = Datastore(self.__workspace, name=datastore_name)
88-
try:
89-
_header = PromoteHeadersBehavior.ALL_FILES_HAVE_SAME_HEADERS if first_row_header else False
90-
_aml_dataset = Dataset.Tabular.from_delimited_files(header=_header,
91-
path=DataPath(datastore, '/' + partition_name + '.csv')) #, set_column_types=columns
92-
_df = _aml_dataset.to_pandas_dataframe()
93-
except DatasetValidationError as dsvalex:
94-
if 'provided path is not valid' in str(dsvalex):
125+
if cloud_storage:
126+
# Connecting data store
127+
datastore = Datastore(self.__workspace, name=datastore_name)
128+
try:
129+
_header = PromoteHeadersBehavior.ALL_FILES_HAVE_SAME_HEADERS if first_row_header else False
130+
_aml_dataset = Dataset.Tabular.from_delimited_files(header=_header,
131+
path=DataPath(datastore, '/' + partition_name + '.csv')) #, set_column_types=columns
132+
_df = _aml_dataset.to_pandas_dataframe()
133+
except DatasetValidationError as dsvalex:
134+
if 'provided path is not valid' in str(dsvalex):
135+
return None
136+
else:
137+
raise
138+
else:
139+
# Reading data from sub files in a folder
140+
_folder_path = datastore_name
141+
_partition_files = glob.glob(_folder_path + '/' + partition_name + '.csv')
142+
_record_found = False
143+
_df = None
144+
for filename in _partition_files:
145+
_header = 0 if first_row_header else None
146+
df = pd.read_csv(filename, index_col=None, header=_header)
147+
if not _record_found:
148+
_df = df
149+
_record_found = True
150+
else:
151+
_df = _df.append(df)
152+
153+
if not _record_found:
95154
return None
96-
else:
97-
raise
155+
98156
if columns != None:
99157
_df.columns = columns
100158
return _df
101159

102-
def start_experiment(self, name: str) -> trainer.Trainer:
160+
def start_experiment(self, name: str) -> aml_trainer.AzureMLTrainer:
103161
'''
104162
Creates a new experiment (or connects to an existing one), using the given name
105163
@@ -128,6 +186,9 @@ def __print_connection_info(self):
128186
print('>> Resource group:', self.__workspace.resource_group)
129187

130188
def capture_filedataset_layout(self, dataset_name: str, output_path: str):
189+
from azureml.dataprep.api.functions import get_portable_path
190+
from azureml.dataprep import col, get_stream_properties, SummaryColumnsValue, SummaryFunction
191+
131192
dataset = self.__workspace.datasets[dataset_name]
132193
files_column = 'Path'
133194
PORTABLE_PATH = 'PortablePath'

arcus/azureml/environment/environment_factory.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,19 +8,19 @@
88
from azureml.data.dataset_error_handling import DatasetValidationError, DatasetExecutionError
99
from azureml.data.dataset_type_definitions import PromoteHeadersBehavior
1010
import arcus.azureml.environment.aml_environment as aml
11-
import arcus.azureml.environment.local_environment as loc
1211
import arcus.azureml.environment.environment as env
12+
import arcus.azureml.environment.errors as errors
1313

1414
class WorkEnvironmentFactory:
1515
__metaclass__ = ABCMeta
1616

1717
@classmethod
18-
def Create(cls, connected: bool = False, subscription_id: str = None, resource_group: str = None, workspace_name: str = None,
18+
def Create(cls, connected: bool = True, subscription_id: str = None, resource_group: str = None, workspace_name: str = None,
1919
write_config: bool = False, config_file: str = None, datastore_path: str = None):
2020
'''
2121
Creates a WorkEnvironment and returns the correct implementation, based on the configuration
2222
Args:
23-
connected (bool): If connected, an aml_environment instance will be created, otherwise it will be a local_environment
23+
connected (bool): Deprecated, should be True
2424
subscription_id (str): The subscription id where the AzureML service resides
2525
resource_group (str): The resource group that contains the AzureML workspace
2626
workspace_name (str): Name of the AzureML workspace
@@ -37,14 +37,14 @@ def Create(cls, connected: bool = False, subscription_id: str = None, resource_g
3737
subscription_id=subscription_id, resource_group=resource_group,
3838
workspace_name= workspace_name, write_config = write_config)
3939
else:
40-
return loc.LocalEnvironment(datastore_path=datastore_path)
40+
raise errors.EnvironmentException('The creation of an environment is only supported in connected mode')
4141

4242
@classmethod
4343
def CreateFromContext(cls, connected: bool = True, datastore_path: str = None):
4444
'''
4545
Creates a WorkEnvironment and returns the correct implementation, based on the configuration
4646
Args:
47-
connected (bool): If connected, an aml_environment instance will be created, otherwise it will be a local_environment
47+
connected (bool): Deprecated, should be True
4848
datastore_path (str):
4949
When working locally: the location of a folder name where datasets will be loading from
5050
When working connected: the name of a DataStore in AzureML that contains Datasets
@@ -54,5 +54,5 @@ def CreateFromContext(cls, connected: bool = True, datastore_path: str = None):
5454
if(connected):
5555
return aml.AzureMLEnvironment(datastore_path = datastore_path, from_context=True)
5656
else:
57-
return loc.LocalEnvironment(datastore_path=datastore_path)
57+
raise errors.EnvironmentException('The creation of an environment is only supported in connected mode')
5858

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
2+
class EnvironmentException(Exception):
3+
def __init__(self, *args):
4+
if args:
5+
self.message = args[0]
6+
else:
7+
self.message = None
8+
9+
def __str__(self):
10+
if self.message:
11+
return f'ML Environment exception: {self.message}'
12+
else:
13+
return f'ML Environment exception'

arcus/azureml/environment/local_environment.py

Lines changed: 0 additions & 90 deletions
This file was deleted.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__all__ = ['aml_trainer', 'local_trainer', 'trainer', 'tuning', 'errors']
1+
__all__ = ['aml_trainer', 'local_trainer', 'trainer', 'tuning', 'errors', 'train_environment']

0 commit comments

Comments
 (0)