arcus-azure
diff --git a/‎.azureml/config.json‎
Lines changed: 3 additions & 3 deletions b/‎.azureml/config.json‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 1 deletion b/‎.gitignore‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎arcus/azureml/environment/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎arcus/azureml/environment/__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎arcus/azureml/environment/aml_environment.py‎
Lines changed: 87 additions & 26 deletions b/‎arcus/azureml/environment/aml_environment.py‎
Lines changed: 87 additions & 26 deletions
diff --git a/‎arcus/azureml/environment/environment_factory.py‎
Lines changed: 6 additions & 6 deletions b/‎arcus/azureml/environment/environment_factory.py‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎arcus/azureml/environment/errors.py‎
Lines changed: 13 additions & 0 deletions b/‎arcus/azureml/environment/errors.py‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎arcus/azureml/environment/local_environment.py‎
Lines changed: 0 additions & 90 deletions b/‎arcus/azureml/environment/local_environment.py‎
Lines changed: 0 additions & 90 deletions
diff --git a/‎arcus/azureml/experimenting/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎arcus/azureml/experimenting/__init__.py‎
Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 {
-    "subscription_id": "",
-    "resource_group": "",
-    "workspace_name": ""
+    "subscription_id": "c1537527-c126-428d-8f72-1ac9f2c63c1f",
+    "resource_group": "codit-ai-incubators",
+    "workspace_name": "codit-ai-incubators-ml"
 }
@@ -144,4 +144,5 @@ outputs/*
 *.whl
 tests/resources/test_training/
 .azureml/config.json
-c
+c
+unit-test/*
@@ -1 +1 @@
-__all__ = ['aml_environment', 'environment_factory', 'environment', 'local_environment']
+__all__ = ['aml_environment', 'environment_factory', 'environment', 'errors']
@@ -5,21 +5,19 @@
 from azureml.data.datapath import DataPath
 import os
 import glob
-from azureml.data.dataset_error_handling import DatasetValidationError, DatasetExecutionError
-from azureml.data.dataset_type_definitions import PromoteHeadersBehavior
 import arcus.azureml.environment.environment as env
 from arcus.azureml.experimenting import trainer
 from arcus.azureml.experimenting import aml_trainer
-from azureml.dataprep.api.functions import get_portable_path
-from azureml.dataprep import col, get_stream_properties, SummaryColumnsValue, SummaryFunction
+from azureml.data.dataset_error_handling import DatasetValidationError, DatasetExecutionError
+from azureml.data.dataset_type_definitions import PromoteHeadersBehavior
 
 class AzureMLEnvironment(env.WorkEnvironment):
     is_connected: bool = False
     __config_file: str = '.azureml/config.json'
     __workspace: Workspace = None
     __datastore_path: str = 'data'
 
-    def __init__(self, config_file: str = None, datastore_path: str = None, subscription_id: str = None, 
+    def __init__(self, config_file: str = None, datastore_path: str = None, subscription_id: str = None, connect_workspace: bool = True,
                 resource_group: str = None, workspace_name: str = None, write_config: bool = False, from_context: bool = False):
         '''
         This allows a user to specify to work connected or disconnected.  
@@ -48,58 +46,118 @@ def __init__(self, config_file: str = None, datastore_path: str = None, subscrip
                 if write_config:
                     self.__workspace.write_config(self.__config_file)
 
-            else:
+            elif connect_workspace:
                 # A config file is passed, so we'll validate the existence and connect
                 if not os.path.exists(self.__config_file):
                     raise FileNotFoundError('The config file ' + self.__config_file + ' does not exist.  Please verify and try again')
                 # There is a config file, so we'll connect
                 self.__connect_from_config_file(self.__config_file)
 
-        self.is_connected = True
+        self.is_connected = connect_workspace
 
-    def load_tabular_dataset(self, dataset_name: str) -> pd.DataFrame:
+    @classmethod
+    def CreateFromContext(cls, datastore_path: str = None):
+        '''
+        Alternate constructor: creates an instance of this class by calling the constructor with from_context=True.
+        All other constructor arguments keep their default values.
+        Args:
+            datastore_path (str): the name of a DataStore in AzureML that contains Datasets
+        Returns: 
+            AzureMLEnvironment: an instance of WorkEnvironment allowing the user to work connected.
+        '''   
+        return cls(datastore_path = datastore_path, from_context=True)
+
+    @classmethod
+    def Create(cls, subscription_id: str = None, resource_group: str = None, workspace_name: str = None, 
+                write_config: bool = False, config_file: str = None, datastore_path: str = None):
+        '''
+        Alternate constructor: creates an instance of this class by forwarding all arguments to the constructor.
+        connect_workspace and from_context are not passed, so they keep their constructor defaults.
+        Args:
+            subscription_id (str): The subscription id where the AzureML service resides
+            resource_group (str): The resource group that contains the AzureML workspace
+            workspace_name (str): Name of the AzureML workspace
+            write_config (bool): If True, the WorkSpace configuration will be persisted in the given (or default) config file
+            config_file (str): The name of the config file (defaulting to .azureml/config.json) that contains the Workspace parameters
+            datastore_path (str): the name of a DataStore in AzureML that contains Datasets
+        Returns: 
+            AzureMLEnvironment: an instance of WorkEnvironment allowing the user to work connected.
+        '''   
+        return cls(config_file = config_file, datastore_path = datastore_path, 
+                    subscription_id=subscription_id, resource_group=resource_group, 
+                    workspace_name= workspace_name, write_config = write_config)
+
+    def load_tabular_dataset(self, dataset_name: str, cloud_storage: bool = True) -> pd.DataFrame:
         '''
-        Loads a tabular dataset by a given name. the implementation will load the Dataset by name from the AzureML Workspace
+        Loads a tabular dataset by a given name.
+            With cloud_storage=True, the Dataset is looked up by name in the AzureML Workspace
+            With cloud_storage=False, the data frame is read from the file {dataset_name}.csv in the datastore_path folder
         Args:
             dataset_name (str): The name of the dataset to load
+            cloud_storage (bool): When changed to False, the dataset will be loaded from the local folder
         Returns:
             pd.DataFrame: The dataset, loaded as a DataFrame
         '''
         # Connecting data set
-        _dataset = Dataset.get_by_name(self.__workspace, name=dataset_name)
-        return _dataset.to_pandas_dataframe()
+        if cloud_storage:
+            _dataset = Dataset.get_by_name(self.__workspace, name=dataset_name)
+            return _dataset.to_pandas_dataframe()
+        else:
+            # Local mode: the datastore path is used as a folder on disk
+            _file_name = os.path.join(self.__datastore_path, dataset_name + '.csv')
+            return pd.read_csv(_file_name)
 
-    def load_tabular_partition(self, partition_name: str, datastore_name: str = None, columns: np.array = None, first_row_header: bool = False) -> pd.DataFrame:
+
+    def load_tabular_partition(self, partition_name: str, datastore_name: str = None, columns: np.array = None, first_row_header: bool = False, cloud_storage: bool = True) -> pd.DataFrame:
         '''
-        Loads a partition from a tabular dataset. the implementation will connect to the DataStore and get all delimited files matching the partition_name
+        Loads a partition from a tabular dataset.
+            With cloud_storage=True, the implementation connects to the DataStore and loads all delimited files matching {partition_name}.csv
+            With cloud_storage=False, the implementation concatenates all local files matching {datastore_name}/{partition_name}.csv
         Args:
             partition_name (str): The name of the partition as a wildcard filter.  Example: B* will take all files starting with B, ending with csv
             columns: (np.array): The column names to assign to the dataframe
-            datastore_path (str): The name of a DataStore in AzureML that contains Datasets
+            datastore_name (str): The name of the DataStore (or of the local folder when cloud_storage is False) that contains the files.  Defaults to the datastore path of this environment
+            first_row_header (bool): When True, the first row of every file is used as the column header
+            cloud_storage (bool): When changed to False, the dataset will be loaded from the local folder
         Returns:
-            pd.DataFrame: The dataset, loaded as a DataFrame
+            pd.DataFrame: The dataset, loaded as a DataFrame, or None when no local file matches or the cloud path is reported as not valid
         '''
         if not datastore_name:
             # No datastore name is given, so we'll take the default one
             datastore_name = self.__datastore_path
 
-        # Connecting data store
-        datastore = Datastore(self.__workspace, name=datastore_name)
-        try:
-            _header = PromoteHeadersBehavior.ALL_FILES_HAVE_SAME_HEADERS if first_row_header else False
-            _aml_dataset = Dataset.Tabular.from_delimited_files(header=_header,
-                path=DataPath(datastore, '/' + partition_name + '.csv')) #, set_column_types=columns
-            _df = _aml_dataset.to_pandas_dataframe()
-        except DatasetValidationError as dsvalex:
-            if 'provided path is not valid' in str(dsvalex):
+        if cloud_storage:
+            # Connecting data store
+            datastore = Datastore(self.__workspace, name=datastore_name)
+            try:
+                _header = PromoteHeadersBehavior.ALL_FILES_HAVE_SAME_HEADERS if first_row_header else False
+                _aml_dataset = Dataset.Tabular.from_delimited_files(header=_header,
+                    path=DataPath(datastore, '/' + partition_name + '.csv')) #, set_column_types=columns
+                _df = _aml_dataset.to_pandas_dataframe()
+            except DatasetValidationError as dsvalex:
+                # A path that matches nothing is reported as "no data" instead of an error
+                if 'provided path is not valid' in str(dsvalex):
+                    return None
+                else:
+                    raise
+        else:
+            # Reading data from sub files in a folder
+            _header = 0 if first_row_header else None
+            _partition_files = glob.glob(datastore_name + '/' + partition_name + '.csv')
+            _frames = [pd.read_csv(filename, index_col=None, header=_header) for filename in _partition_files]
+            if not _frames:
                 return None
-            else:
-                raise
+            # DataFrame.append was removed in pandas 2.0; concat keeps the same row order and indexes
+            _df = pd.concat(_frames)
+
-        if columns != None:
+        # Identity check: != on a numpy array compares element-wise and its truth value is ambiguous
+        if columns is not None:
             _df.columns = columns
         return _df
 
-    def start_experiment(self, name: str) -> trainer.Trainer:
+    def start_experiment(self, name: str) -> aml_trainer.AzureMLTrainer:
         '''
         Creates a new experiment (or connects to an existing one), using the given name
 
@@ -128,6 +186,9 @@ def __print_connection_info(self):
         print('>> Resource group:', self.__workspace.resource_group)
 
     def capture_filedataset_layout(self, dataset_name: str, output_path: str):
+        from azureml.dataprep.api.functions import get_portable_path
+        from azureml.dataprep import col, get_stream_properties, SummaryColumnsValue, SummaryFunction
+
         dataset = self.__workspace.datasets[dataset_name]
         files_column = 'Path'
         PORTABLE_PATH = 'PortablePath'
 
@@ -8,19 +8,19 @@
 from azureml.data.dataset_error_handling import DatasetValidationError, DatasetExecutionError
 from azureml.data.dataset_type_definitions import PromoteHeadersBehavior
 import arcus.azureml.environment.aml_environment as aml
-import arcus.azureml.environment.local_environment as loc
 import arcus.azureml.environment.environment as env
+import arcus.azureml.environment.errors as errors
 
 class WorkEnvironmentFactory:
     __metaclass__ = ABCMeta
 
     @classmethod
-    def Create(cls, connected: bool = False, subscription_id: str = None, resource_group: str = None, workspace_name: str = None, 
+    def Create(cls, connected: bool = True, subscription_id: str = None, resource_group: str = None, workspace_name: str = None, 
                 write_config: bool = False, config_file: str = None, datastore_path: str = None):
         '''
         Creates a WorkEnvironment and returns the correct implementation, based on the configuration
         Args:
-            connected (bool): If connected, an aml_environment instance will be created, otherwise it will be a local_environment
+            connected (bool): Deprecated, should be True
             subscription_id (str): The subscription id where the AzureML service resides
             resource_group (str): The resource group that contains the AzureML workspace
             workspace_name (str): Name of the AzureML workspace
@@ -37,14 +37,14 @@ def Create(cls, connected: bool = False, subscription_id: str = None, resource_g
                                         subscription_id=subscription_id, resource_group=resource_group, 
                                         workspace_name= workspace_name, write_config = write_config)
         else:
-            return loc.LocalEnvironment(datastore_path=datastore_path)
+            raise errors.EnvironmentException('The creation of an environment is only supported in connected mode')
 
     @classmethod
     def CreateFromContext(cls, connected: bool = True, datastore_path: str = None):
         '''
         Creates a WorkEnvironment and returns the correct implementation, based on the configuration
         Args:
-            connected (bool): If connected, an aml_environment instance will be created, otherwise it will be a local_environment
+            connected (bool): Deprecated, should be True
             datastore_path (str): 
                 When working locally: the location of a folder name where datasets will be loading from
                 When working connected: the name of a DataStore in AzureML that contains Datasets
@@ -54,5 +54,5 @@ def CreateFromContext(cls, connected: bool = True, datastore_path: str = None):
         if(connected):
             return aml.AzureMLEnvironment(datastore_path = datastore_path, from_context=True)
         else:
-            return loc.LocalEnvironment(datastore_path=datastore_path)
+            raise errors.EnvironmentException('The creation of an environment is only supported in connected mode')
 
@@ -0,0 +1,13 @@
+
+class EnvironmentException(Exception):
+    '''
+    Error type of the arcus.azureml.environment package.
+    The optional first positional argument is kept as the message and is included in the string representation.
+    '''
+    def __init__(self, *args):
+        # Only the first positional argument is stored as the message
+        if args:
+            self.message = args[0]
+        else:
+            self.message = None
+    
+    def __str__(self):
+        # A falsy message (None or an empty string) falls back to the generic text
+        if self.message:
+            return f'ML Environment exception: {self.message}'
+        else:
+            return f'ML Environment exception'
@@ -1 +1 @@
-__all__ = ['aml_trainer', 'local_trainer', 'trainer', 'tuning', 'errors']
+__all__ = ['aml_trainer', 'local_trainer', 'trainer', 'tuning', 'errors', 'train_environment']
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`{`
`2`		`- "subscription_id": "",`
`3`		`- "resource_group": "",`
`4`		`- "workspace_name": ""`
	`2`	`+ "subscription_id": "c1537527-c126-428d-8f72-1ac9f2c63c1f",`
	`3`	`+ "resource_group": "codit-ai-incubators",`
	`4`	`+ "workspace_name": "codit-ai-incubators-ml"`
`5`	`5`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__all__ = ['aml_environment', 'environment_factory', 'environment', 'local_environment']`
	`1`	`+__all__ = ['aml_environment', 'environment_factory', 'environment', 'errors']`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__all__ = ['aml_trainer', 'local_trainer', 'trainer', 'tuning', 'errors']`
	`1`	`+__all__ = ['aml_trainer', 'local_trainer', 'trainer', 'tuning', 'errors', 'train_environment']`