@@ -173,6 +173,36 @@ def __init__(self, name, description, format=None,
173173 else :
174174 self .data_pickle_file = None
175175
def __str__(self):
    """Return a human-readable, multi-line summary of the dataset.

    Builds an underlined "OpenML Dataset" header followed by one
    dot-padded ``name: value`` line per known field, in a fixed order.
    Optional fields (upload date, OpenML URL, instance count) are only
    included when the corresponding attribute/quality is available.

    Returns
    -------
    str
        The formatted summary.
    """
    header = "OpenML Dataset"
    header = '{}\n{}\n'.format(header, '=' * len(header))

    # Strip the API suffix from the configured server to get the site base URL.
    base_url = "{}".format(openml.config.server[:-len('api/v1/xml')])
    # NOTE: keys here must match the entries of ``order`` below exactly
    # (case-sensitive), otherwise the field is silently dropped from the
    # output.  "Data File" / "Pickle File" previously used lowercase
    # "file" and were never printed — fixed.
    fields = {"Name": self.name,
              "Version": self.version,
              "Format": self.format,
              "Licence": self.licence,
              "Download URL": self.url,
              "Data File": self.data_file,
              "Pickle File": self.data_pickle_file,
              "# of features": len(self.features)}
    if self.upload_date is not None:
        fields["Upload Date"] = self.upload_date.replace('T', ' ')
    if self.dataset_id is not None:
        fields["OpenML URL"] = "{}d/{}".format(base_url, self.dataset_id)
    # ``.get`` instead of ``[]``: a dataset without this quality computed
    # should simply omit the line, not raise KeyError.
    if self.qualities.get('NumberOfInstances') is not None:
        fields["# of instances"] = int(self.qualities['NumberOfInstances'])

    # determines the order in which the information will be printed
    order = ["Name", "Version", "Format", "Upload Date", "Licence",
             "Download URL", "OpenML URL", "Data File", "Pickle File",
             "# of features", "# of instances"]
    fields = [(key, fields[key]) for key in order if key in fields]

    # Pad every field name with dots so the values line up in a column.
    longest_field_name_length = max(len(name) for name, value in fields)
    field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length)
    body = '\n'.join(field_line_format.format(name, value)
                     for name, value in fields)
    return header + body
205+
176206 def _data_arff_to_pickle (self , data_file ):
177207 data_pickle_file = data_file .replace ('.arff' , '.pkl.py3' )
178208 if os .path .exists (data_pickle_file ):
@@ -368,9 +398,25 @@ def decode_arff(fh):
368398 def _convert_array_format (data , array_format , attribute_names ):
369399 """Convert a dataset to a given array format.
370400
371- By default, the data are stored as a sparse matrix or a pandas
372- dataframe. One might be interested to get a pandas SparseDataFrame or a
373- NumPy array instead, respectively.
401+ Converts to numpy array if data is non-sparse.
402+ Converts to a sparse dataframe if data is sparse.
403+
404+ Parameters
405+ ----------
406+ array_format : str {'array', 'dataframe'}
407+ Desired data type of the output
408+ - If array_format='array'
409+ If data is non-sparse
410+ Converts to numpy-array
411+ Enforces numeric encoding of categorical columns
412+ Missing values are represented as NaN in the numpy-array
413+ else returns data as is
414+ - If array_format='dataframe'
415+ If data is sparse
416+ Works only on sparse data
417+ Converts sparse data to sparse dataframe
418+ else returns data as is
419+
374420 """
375421 if array_format == "array" and not scipy .sparse .issparse (data ):
376422 # We encode the categories such that they are integer to be able
@@ -396,8 +442,11 @@ def _encode_if_category(column):
396442 'PyOpenML cannot handle string when returning numpy'
397443 ' arrays. Use dataset_format="dataframe".'
398444 )
399- if array_format == "dataframe" and scipy .sparse .issparse (data ):
445+ elif array_format == "dataframe" and scipy .sparse .issparse (data ):
400446 return pd .SparseDataFrame (data , columns = attribute_names )
447+ else :
448+ data_type = "sparse-data" if scipy .sparse .issparse (data ) else "non-sparse data"
449+ warn ("Cannot convert {} to '{}'. Returning input data." .format (data_type , array_format ))
401450 return data
402451
403452 @staticmethod
0 commit comments