Skip to content

Commit 70073b1

Browse files
MaxGekk authored and HyukjinKwon committed
[SPARK-27609][PYTHON] Convert values of function options to strings
## What changes were proposed in this pull request? In the PR, I propose to convert options values to strings by using `to_str()` for the following functions: `from_csv()`, `to_csv()`, `from_json()`, `to_json()`, `schema_of_csv()` and `schema_of_json()`. This will make handling of function options consistent to option handling in `DataFrameReader`/`DataFrameWriter`. For example: ```Python df.select(from_csv(df.value, "s string", {'ignoreLeadingWhiteSpace': True}) ``` ## How was this patch tested? Added an example for `from_csv()` which was tested by: ```Shell ./python/run-tests --testnames pyspark.sql.functions ``` Closes apache#25182 from MaxGekk/options_to_str. Authored-by: Maxim Gekk <[email protected]> Signed-off-by: HyukjinKwon <[email protected]>
1 parent eb5dc74 commit 70073b1

File tree

3 files changed

+30
-19
lines changed

3 files changed

+30
-19
lines changed

python/pyspark/sql/functions.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
from pyspark.sql.types import StringType, DataType
3737
# Keep UserDefinedFunction import for backwards compatible import; moved in SPARK-22409
3838
from pyspark.sql.udf import UserDefinedFunction, _create_udf
39+
from pyspark.sql.utils import to_str
3940

4041
# Note to developers: all of PySpark functions here take string as column names whenever possible.
4142
# Namely, if columns are referred as arguments, they can be always both Column or string,
@@ -114,6 +115,10 @@ def _():
114115
_.__doc__ = 'Window function: ' + doc
115116
return _
116117

118+
119+
def _options_to_str(options):
120+
return {key: to_str(value) for (key, value) in options.items()}
121+
117122
_lit_doc = """
118123
Creates a :class:`Column` of literal value.
119124
@@ -2343,7 +2348,7 @@ def from_json(col, schema, options={}):
23432348
schema = schema.json()
23442349
elif isinstance(schema, Column):
23452350
schema = _to_java_column(schema)
2346-
jc = sc._jvm.functions.from_json(_to_java_column(col), schema, options)
2351+
jc = sc._jvm.functions.from_json(_to_java_column(col), schema, _options_to_str(options))
23472352
return Column(jc)
23482353

23492354

@@ -2384,7 +2389,7 @@ def to_json(col, options={}):
23842389
"""
23852390

23862391
sc = SparkContext._active_spark_context
2387-
jc = sc._jvm.functions.to_json(_to_java_column(col), options)
2392+
jc = sc._jvm.functions.to_json(_to_java_column(col), _options_to_str(options))
23882393
return Column(jc)
23892394

23902395

@@ -2415,7 +2420,7 @@ def schema_of_json(json, options={}):
24152420
raise TypeError("schema argument should be a column or string")
24162421

24172422
sc = SparkContext._active_spark_context
2418-
jc = sc._jvm.functions.schema_of_json(col, options)
2423+
jc = sc._jvm.functions.schema_of_json(col, _options_to_str(options))
24192424
return Column(jc)
24202425

24212426

@@ -2442,7 +2447,7 @@ def schema_of_csv(csv, options={}):
24422447
raise TypeError("schema argument should be a column or string")
24432448

24442449
sc = SparkContext._active_spark_context
2445-
jc = sc._jvm.functions.schema_of_csv(col, options)
2450+
jc = sc._jvm.functions.schema_of_csv(col, _options_to_str(options))
24462451
return Column(jc)
24472452

24482453

@@ -2464,7 +2469,7 @@ def to_csv(col, options={}):
24642469
"""
24652470

24662471
sc = SparkContext._active_spark_context
2467-
jc = sc._jvm.functions.to_csv(_to_java_column(col), options)
2472+
jc = sc._jvm.functions.to_csv(_to_java_column(col), _options_to_str(options))
24682473
return Column(jc)
24692474

24702475

@@ -2775,6 +2780,11 @@ def from_csv(col, schema, options={}):
27752780
>>> value = data[0][0]
27762781
>>> df.select(from_csv(df.value, schema_of_csv(value)).alias("csv")).collect()
27772782
[Row(csv=Row(_c0=1, _c1=2, _c2=3))]
2783+
>>> data = [(" abc",)]
2784+
>>> df = spark.createDataFrame(data, ("value",))
2785+
>>> options = {'ignoreLeadingWhiteSpace': True}
2786+
>>> df.select(from_csv(df.value, "s string", options).alias("csv")).collect()
2787+
[Row(csv=Row(s=u'abc'))]
27782788
"""
27792789

27802790
sc = SparkContext._active_spark_context
@@ -2785,7 +2795,7 @@ def from_csv(col, schema, options={}):
27852795
else:
27862796
raise TypeError("schema argument should be a column or string")
27872797

2788-
jc = sc._jvm.functions.from_csv(_to_java_column(col), schema, options)
2798+
jc = sc._jvm.functions.from_csv(_to_java_column(col), schema, _options_to_str(options))
27892799
return Column(jc)
27902800

27912801

python/pyspark/sql/readwriter.py

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -27,23 +27,11 @@
2727
from pyspark.sql.column import _to_seq
2828
from pyspark.sql.types import *
2929
from pyspark.sql import utils
30+
from pyspark.sql.utils import to_str
3031

3132
__all__ = ["DataFrameReader", "DataFrameWriter"]
3233

3334

34-
def to_str(value):
35-
"""
36-
A wrapper over str(), but converts bool values to lower case strings.
37-
If None is given, just returns None, instead of converting it to string "None".
38-
"""
39-
if isinstance(value, bool):
40-
return str(value).lower()
41-
elif value is None:
42-
return value
43-
else:
44-
return str(value)
45-
46-
4735
class OptionUtils(object):
4836

4937
def _set_opts(self, schema=None, **options):

python/pyspark/sql/utils.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,3 +207,16 @@ def call(self, jdf, batch_id):
207207

208208
class Java:
209209
implements = ['org.apache.spark.sql.execution.streaming.sources.PythonForeachBatchFunction']
210+
211+
212+
def to_str(value):
213+
"""
214+
A wrapper over str(), but converts bool values to lower case strings.
215+
If None is given, just returns None, instead of converting it to string "None".
216+
"""
217+
if isinstance(value, bool):
218+
return str(value).lower()
219+
elif value is None:
220+
return value
221+
else:
222+
return str(value)

0 commit comments

Comments (0)