[SPARK-21712][PYSPARK] Clarify type error for Column.substr()

nchammas · HyukjinKwon · commit 96608310501a · 2017-08-16T11:19:15.000+09:00
Proposed changes: * Clarify the type error that `Column.substr()` gives. Test plan: * Tested this manually. * Test code: ```python from pyspark.sql.functions import col, lit spark.createDataFrame([['nick']], schema=['name']).select(col('name').substr(0, lit(1))) ``` * Before: ``` TypeError: Can not mix the type ``` * After: ``` TypeError: startPos and length must be the same type. Got <class 'int'> and <class 'pyspark.sql.column.Column'>, respectively. ``` Author: Nicholas Chammas <nicholas.chammas@gmail.com> Closes apache#18926 from nchammas/SPARK-21712-substr-type-error.
diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
@@ -406,8 +406,14 @@ def substr(self, startPos, length):
         [Row(col=u'Ali'), Row(col=u'Bob')]
         """
         if type(startPos) != type(length):
-            raise TypeError("Can not mix the type")
-        if isinstance(startPos, (int, long)):
+            raise TypeError(
+                "startPos and length must be the same type. "
+                "Got {startPos_t} and {length_t}, respectively."
+                .format(
+                    startPos_t=type(startPos),
+                    length_t=type(length),
+                ))
+        if isinstance(startPos, int):
             jc = self._jc.substr(startPos, length)
         elif isinstance(startPos, Column):
             jc = self._jc.substr(startPos._jc, length._jc)
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
@@ -1220,6 +1220,18 @@ def test_rand_functions(self):
         rndn2 = df.select('key', functions.randn(0)).collect()
         self.assertEqual(sorted(rndn1), sorted(rndn2))
 
+    def test_string_functions(self):
+        from pyspark.sql.functions import col, lit
+        df = self.spark.createDataFrame([['nick']], schema=['name'])
+        self.assertRaisesRegexp(
+            TypeError,
+            "must be the same type",
+            lambda: df.select(col('name').substr(0, lit(1))))
+        if sys.version_info.major == 2:
+            self.assertRaises(
+                TypeError,
+                lambda: df.select(col('name').substr(long(0), long(1))))
+
     def test_array_contains_function(self):
         from pyspark.sql.functions import array_contains