Update to version 1.6

tobyroseman · tobyroseman · commit 3e5998781028 · 2015-09-23T13:24:19.000-07:00
diff --git a/oss_local_scripts/conda_requirements.txt b/oss_local_scripts/conda_requirements.txt
@@ -16,5 +16,6 @@ scikit-learn==0.16.1
 scipy==0.15.1
 six==1.9.0
 tornado==4.1
+wheel==0.24.0
 statsmodels
 PIL
diff --git a/oss_src/sframe/CMakeLists.txt b/oss_src/sframe/CMakeLists.txt
@@ -55,6 +55,6 @@ make_copy_target(_local_sys_util_
                   pylambda
  )
 add_dependencies(spark_unity _local_sys_util_)
-file(DOWNLOAD http://s3-us-west-2.amazonaws.com/glbin-engine/spark_unity_0.1.jar ${CMAKE_CURRENT_BINARY_DIR}/spark_unity.jar
-      EXPECTED_MD5 f5e0653648f0b474cee57bd134d9ee83)
+file(DOWNLOAD http://s3-us-west-2.amazonaws.com/glbin-engine/spark_unity_0.3.jar ${CMAKE_CURRENT_BINARY_DIR}/spark_unity.jar
+      EXPECTED_MD5 b6261c8614da3da1c89dce64fed09420)
 
diff --git a/oss_src/sframe/spark_unity.cpp b/oss_src/sframe/spark_unity.cpp
@@ -625,12 +625,8 @@ int concat_main(std::string & _output_directory, std::string & _prefix) {
 
   }
   
-  auto first_sframe_ptr = std::make_shared<sframe>(list_filenames[0]);
-  sframe append_sframe;
-  append_sframe.open_for_write(first_sframe_ptr->column_names(),first_sframe_ptr->column_types(), "", 1, false);
-  append_sframe.close();
-
-  for(int index=0;index<list_filenames.size();index++) { 
+  sframe append_sframe(list_filenames[0]);
+  for(int index=1;index<list_filenames.size();index++) { 
       auto sframe_ptr = std::make_shared<sframe>(list_filenames[index]);
       append_sframe = append_sframe.append(*sframe_ptr); 
   }
diff --git a/oss_src/unity/python/doc/source/graphlab.data_structures.connectors.rst b/oss_src/unity/python/doc/source/graphlab.data_structures.connectors.rst
@@ -40,7 +40,6 @@ Spark RDD
 
   SFrame.from_rdd
   SFrame.to_rdd
-  SFrame.to_schema_rdd
 
 SQL Database
 ----------------
diff --git a/oss_src/unity/python/sframe/data_structures/image.py b/oss_src/unity/python/sframe/data_structures/image.py
@@ -223,7 +223,7 @@ def show(self):
         Displays the image. Requires PIL/Pillow.
 
         Alternatively, you can create an :class:`graphlab.SArray` of this image
-        and use :func:`graphlab.SArray.show()`
+        and use :py:func:`graphlab.SArray.show()`
 
         See Also
         --------
diff --git a/oss_src/unity/python/sframe/data_structures/sframe.py b/oss_src/unity/python/sframe/data_structures/sframe.py
@@ -1697,10 +1697,11 @@ def to_spark_dataframe(self,sc,sql,number_of_partitions=4):
 
         >>> from pyspark import SparkContext, SQLContext
         >>> from graphlab import SFrame
+        >>> from pyspark.sql import SQLContext
         >>> sc = SparkContext('local')
-        >>> sqlc = SQLContext(sc)
+        >>> sql = SQLContext(sc)
         >>> sf = SFrame({'x': [1,2,3], 'y': ['fish', 'chips', 'salad']})
-        >>> df = sf.to_spark_dataframe(sc, sqlc)
+        >>> df = sf.to_spark_dataframe(sc, sql)
         >>> df.show()
         x y
         1 fish
@@ -1902,15 +1903,28 @@ def from_rdd(cls, rdd, cur_sc):
                 df, tmp_loc, finalSFramePrefix)
         else:
             if encoding == 'utf8':
-                finalSFrameFilename = graphlab_util_ref.toSFrame(
-                    rdd._jrdd.rdd(),tmp_loc, finalSFramePrefix)
-            else:
-                # Prep the additional arguments to feed into the pySparkToSFrame function in Java
-                # that will call the spark_unity binary which does the actual encoding
-                additiona_args = os.path.join(" --encoding=%s " % encoding +\
-                                    " --type=rdd ")
-                finalSFrameFilename = graphlab_util_ref.pySparkToSFrame(
-                    rdd._jrdd, tmp_loc, finalSFramePrefix, additiona_args)
+                ## TODO: This is a temporary solution. We completely bypass the
+                ## toSFrame() code path when encoding is 'utf8', because Spark 1.5
+                ## raises an error related to closure cleaning of deeply nested functions.
+
+                def f(iterator): 
+                    for obj in iterator:
+                        yield obj.encode("utf-8")
+
+                rdd = rdd.mapPartitions(f)
+                encoding = "batch"
+                if(rdd._jrdd_deserializer.__class__.__name__ == 'PickleSerializer'):
+                    encoding = "pickle"
+                
+                #finalSFrameFilename = graphlab_util_ref.toSFrame(
+                #    rdd._jrdd.rdd(),tmp_loc, finalSFramePrefix)
+            #else:
+            # Prep the additional arguments to feed into the pySparkToSFrame function in Java
+            # that will call the spark_unity binary which does the actual encoding
+            additiona_args = os.path.join(" --encoding=%s " % encoding +\
+                                " --type=rdd ")
+            finalSFrameFilename = graphlab_util_ref.pySparkToSFrame(
+                rdd._jrdd, tmp_loc, finalSFramePrefix, additiona_args)
 
         # Load and return the sframe
         sf = SFrame()
diff --git a/oss_src/unity/python/sframe/toolkits/_model.py b/oss_src/unity/python/sframe/toolkits/_model.py
@@ -427,10 +427,9 @@ def show(self, view=None, model_type='base'):
         view : str, optional
             The name of the Model view to show. Can be one of:
 
-            - 'Summary': The summary description of a Model.
+            - Summary: Shows statistics of the training process, such as the size of the data and the time cost. The summary also shows the parameters and settings for the model training process, if available.
+            - Evaluation: Shows the precision-recall plot as a line chart. A tooltip is provided for pointwise analysis; it shows the precision and recall values at whatever cutoff value the mouse points to.
 
-            - 'Evaluation': A visual representation of the evaluation results for
-                a Model.
 
         Returns
         -------

Original file line number	Diff line number	Diff line change
`@@ -55,6 +55,6 @@ make_copy_target(_local_sys_util_`
`55`	`55`	`pylambda`
`56`	`56`	`)`
`57`	`57`	`add_dependencies(spark_unity _local_sys_util_)`
`58`		`-file(DOWNLOAD http://s3-us-west-2.amazonaws.com/glbin-engine/spark_unity_0.1.jar ${CMAKE_CURRENT_BINARY_DIR}/spark_unity.jar`
`59`		`- EXPECTED_MD5 f5e0653648f0b474cee57bd134d9ee83)`
	`58`	`+file(DOWNLOAD http://s3-us-west-2.amazonaws.com/glbin-engine/spark_unity_0.3.jar ${CMAKE_CURRENT_BINARY_DIR}/spark_unity.jar`
	`59`	`+ EXPECTED_MD5 b6261c8614da3da1c89dce64fed09420)`
`60`	`60`
Original file line number	Diff line number	Diff line change
`@@ -625,12 +625,8 @@ int concat_main(std::string & _output_directory, std::string & _prefix) {`
`625`	`625`
`626`	`626`	`}`
`627`	`627`
`628`		`- auto first_sframe_ptr = std::make_shared<sframe>(list_filenames[0]);`
`629`		`- sframe append_sframe;`
`630`		`- append_sframe.open_for_write(first_sframe_ptr->column_names(),first_sframe_ptr->column_types(), "", 1, false);`
`631`		`- append_sframe.close();`
`632`		`-`
`633`		`- for(int index=0;index<list_filenames.size();index++) {`
	`628`	`+ sframe append_sframe(list_filenames[0]);`
	`629`	`+ for(int index=1;index<list_filenames.size();index++) {`
`634`	`630`	`auto sframe_ptr = std::make_shared<sframe>(list_filenames[index]);`
`635`	`631`	`append_sframe = append_sframe.append(*sframe_ptr);`
`636`	`632`	`}`