Skip to content

Commit c889f39

Browse files
Fix NRE for TimestampType and DateType and support nullable value types (#530)
1 parent 29ad2cb commit c889f39

File tree

3 files changed

+89
-23
lines changed

3 files changed

+89
-23
lines changed

src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/SparkSessionTests.cs

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ public void TestCreateDataFrame()
9494

9595
// Calling CreateDataFrame(IEnumerable<string> _) without schema
9696
{
97-
var data = new List<string>(new string[] { "Alice", "Bob" });
97+
var data = new string[] { "Alice", "Bob", null };
9898
StructType schema = SchemaWithSingleColumn(new StringType());
9999

100100
DataFrame df = _spark.CreateDataFrame(data);
@@ -103,7 +103,16 @@ public void TestCreateDataFrame()
103103

104104
// Calling CreateDataFrame(IEnumerable<int> _) without schema
105105
{
106-
var data = new List<int>(new int[] { 1, 2 });
106+
var data = new int[] { 1, 2 };
107+
StructType schema = SchemaWithSingleColumn(new IntegerType(), false);
108+
109+
DataFrame df = _spark.CreateDataFrame(data);
110+
ValidateDataFrame(df, data.Select(a => new object[] { a }), schema);
111+
}
112+
113+
// Calling CreateDataFrame(IEnumerable<int?> _) without schema
114+
{
115+
var data = new int?[] { 1, 2, null };
107116
StructType schema = SchemaWithSingleColumn(new IntegerType());
108117

109118
DataFrame df = _spark.CreateDataFrame(data);
@@ -112,7 +121,16 @@ public void TestCreateDataFrame()
112121

113122
// Calling CreateDataFrame(IEnumerable<double> _) without schema
114123
{
115-
var data = new List<double>(new double[] { 1.2, 2.3 });
124+
var data = new double[] { 1.2, 2.3 };
125+
StructType schema = SchemaWithSingleColumn(new DoubleType(), false);
126+
127+
DataFrame df = _spark.CreateDataFrame(data);
128+
ValidateDataFrame(df, data.Select(a => new object[] { a }), schema);
129+
}
130+
131+
// Calling CreateDataFrame(IEnumerable<double?> _) without schema
132+
{
133+
var data = new double?[] { 1.2, 2.3, null };
116134
StructType schema = SchemaWithSingleColumn(new DoubleType());
117135

118136
DataFrame df = _spark.CreateDataFrame(data);
@@ -121,19 +139,29 @@ public void TestCreateDataFrame()
121139

122140
// Calling CreateDataFrame(IEnumerable<bool> _) without schema
123141
{
124-
var data = new List<bool>(new bool[] { true, false });
142+
var data = new bool[] { true, false };
143+
StructType schema = SchemaWithSingleColumn(new BooleanType(), false);
144+
145+
DataFrame df = _spark.CreateDataFrame(data);
146+
ValidateDataFrame(df, data.Select(a => new object[] { a }), schema);
147+
}
148+
149+
// Calling CreateDataFrame(IEnumerable<bool?> _) without schema
150+
{
151+
var data = new bool?[] { true, false, null };
125152
StructType schema = SchemaWithSingleColumn(new BooleanType());
126153

127154
DataFrame df = _spark.CreateDataFrame(data);
128155
ValidateDataFrame(df, data.Select(a => new object[] { a }), schema);
129156
}
130-
157+
131158
// Calling CreateDataFrame(IEnumerable<Date> _) without schema
132159
{
133160
var data = new Date[]
134161
{
135162
new Date(2020, 1, 1),
136-
new Date(2020, 1, 2)
163+
new Date(2020, 1, 2),
164+
null
137165
};
138166
StructType schema = SchemaWithSingleColumn(new DateType());
139167

@@ -151,7 +179,8 @@ public void TestCreateDataFrameWithTimestamp()
151179
var data = new Timestamp[]
152180
{
153181
new Timestamp(2020, 1, 1, 0, 0, 0, 0),
154-
new Timestamp(2020, 1, 2, 15, 30, 30, 0)
182+
new Timestamp(2020, 1, 2, 15, 30, 30, 0),
183+
null
155184
};
156185
StructType schema = SchemaWithSingleColumn(new TimestampType());
157186

@@ -172,8 +201,9 @@ private void ValidateDataFrame(
172201
/// Returns a single column schema of the given datatype.
173202
/// </summary>
174203
/// <param name="dataType">Datatype of the column</param>
204+
/// <param name="isNullable">Indicates if values of the column can be null</param>
175205
/// <returns>Schema as StructType</returns>
176-
private StructType SchemaWithSingleColumn(DataType dataType) =>
177-
new StructType(new[] { new StructField("_1", dataType) });
206+
private StructType SchemaWithSingleColumn(DataType dataType, bool isNullable = true) =>
207+
new StructType(new[] { new StructField("_1", dataType, isNullable) });
178208
}
179209
}

src/csharp/Microsoft.Spark/Sql/SparkSession.cs

Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -151,9 +151,9 @@ public DataFrame Table(string tableName) =>
151151
new DataFrame((JvmObjectReference)_jvmObject.Invoke("table", tableName));
152152

153153
/// <summary>
154-
/// Creates a <see cref="DataFrame"/> from an <see cref="IEnumerable"/> containing
154+
/// Creates a <see cref="DataFrame"/> from an <see cref="IEnumerable"/> containing
155155
/// <see cref="GenericRow"/>s using the given schema.
156-
/// It is important to make sure that the structure of every <see cref="GenericRow"/> of
156+
/// It is important to make sure that the structure of every <see cref="GenericRow"/> of
157157
/// the provided <see cref="IEnumerable"/> matches
158158
/// the provided schema. Otherwise, there will be runtime exception.
159159
/// </summary>
@@ -172,22 +172,44 @@ public DataFrame CreateDataFrame(IEnumerable<GenericRow> data, StructType schema
172172
/// <param name="data"><see cref="IEnumerable"/> of type <see cref="int"/></param>
173173
/// <returns>Dataframe object</returns>
174174
public DataFrame CreateDataFrame(IEnumerable<int> data) =>
175+
CreateDataFrame(ToGenericRows(data), SchemaWithSingleColumn(new IntegerType(), false));
176+
177+
/// <summary>
178+
/// Creates a Dataframe given data as <see cref="IEnumerable"/> of type
179+
/// <see cref="Nullable{Int32}"/>
180+
/// </summary>
181+
/// <param name="data"><see cref="IEnumerable"/> of type
182+
/// <see cref="Nullable{Int32}"/></param>
183+
/// <returns>Dataframe object</returns>
184+
public DataFrame CreateDataFrame(IEnumerable<int?> data) =>
175185
CreateDataFrame(ToGenericRows(data), SchemaWithSingleColumn(new IntegerType()));
176186

177187
/// <summary>
178-
/// Creates a Dataframe given data as <see cref="IEnumerable"/> of type <see cref="string"/>
188+
/// Creates a Dataframe given data as <see cref="IEnumerable"/> of type
189+
/// <see cref="string"/>
179190
/// </summary>
180191
/// <param name="data"><see cref="IEnumerable"/> of type <see cref="string"/></param>
181192
/// <returns>Dataframe object</returns>
182193
public DataFrame CreateDataFrame(IEnumerable<string> data) =>
183194
CreateDataFrame(ToGenericRows(data), SchemaWithSingleColumn(new StringType()));
184195

185196
/// <summary>
186-
/// Creates a Dataframe given data as <see cref="IEnumerable"/> of type <see cref="double"/>
197+
/// Creates a Dataframe given data as <see cref="IEnumerable"/> of type
198+
/// <see cref="double"/>
187199
/// </summary>
188200
/// <param name="data"><see cref="IEnumerable"/> of type <see cref="double"/></param>
189201
/// <returns>Dataframe object</returns>
190202
public DataFrame CreateDataFrame(IEnumerable<double> data) =>
203+
CreateDataFrame(ToGenericRows(data), SchemaWithSingleColumn(new DoubleType(), false));
204+
205+
/// <summary>
206+
/// Creates a Dataframe given data as <see cref="IEnumerable"/> of type
207+
/// <see cref="Nullable{Double}"/>
208+
/// </summary>
209+
/// <param name="data"><see cref="IEnumerable"/> of type
210+
/// <see cref="Nullable{Double}"/></param>
211+
/// <returns>Dataframe object</returns>
212+
public DataFrame CreateDataFrame(IEnumerable<double?> data) =>
191213
CreateDataFrame(ToGenericRows(data), SchemaWithSingleColumn(new DoubleType()));
192214

193215
/// <summary>
@@ -196,6 +218,16 @@ public DataFrame CreateDataFrame(IEnumerable<double> data) =>
196218
/// <param name="data"><see cref="IEnumerable"/> of type <see cref="bool"/></param>
197219
/// <returns>Dataframe object</returns>
198220
public DataFrame CreateDataFrame(IEnumerable<bool> data) =>
221+
CreateDataFrame(ToGenericRows(data), SchemaWithSingleColumn(new BooleanType(), false));
222+
223+
/// <summary>
224+
/// Creates a Dataframe given data as <see cref="IEnumerable"/> of type
225+
/// <see cref="Nullable{Boolean}"/>
226+
/// </summary>
227+
/// <param name="data"><see cref="IEnumerable"/> of type
228+
/// <see cref="Nullable{Boolean}"/></param>
229+
/// <returns>Dataframe object</returns>
230+
public DataFrame CreateDataFrame(IEnumerable<bool?> data) =>
199231
CreateDataFrame(ToGenericRows(data), SchemaWithSingleColumn(new BooleanType()));
200232

201233
/// <summary>
@@ -299,9 +331,10 @@ public UdfRegistration Udf() =>
299331
/// Returns a single column schema of the given datatype.
300332
/// </summary>
301333
/// <param name="dataType">Datatype of the column</param>
334+
/// <param name="isNullable">Indicates if values of the column can be null</param>
302335
/// <returns>Schema as StructType</returns>
303-
private StructType SchemaWithSingleColumn(DataType dataType) =>
304-
new StructType(new[] { new StructField("_1", dataType) });
336+
private StructType SchemaWithSingleColumn(DataType dataType, bool isNullable = true) =>
337+
new StructType(new[] { new StructField("_1", dataType, isNullable) });
305338

306339
/// <summary>
307340
/// This method is transforming each element of IEnumerable of type T input into a single

src/csharp/Microsoft.Spark/Sql/Types/SimpleTypes.cs

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,11 @@ public sealed class DateType : AtomicType
8181
/// </summary>
8282
internal override object FromInternal(object obj)
8383
{
84+
if (obj == null)
85+
{
86+
return null;
87+
}
88+
8489
return new Date(new DateTime((int)obj * TimeSpan.TicksPerDay + s_unixTimeEpoch.Ticks));
8590
}
8691
}
@@ -101,16 +106,14 @@ public sealed class TimestampType : AtomicType
101106
/// </summary>
102107
internal override object FromInternal(object obj)
103108
{
104-
// Known issue that if the original type is "long" and its value can be fit into the
105-
// "int", Pickler will serialize the value as int.
106-
if (obj is long val)
109+
if (obj == null)
107110
{
108-
val = (long)obj;
109-
}
110-
else
111-
{
112-
val = (int)obj;
111+
return null;
113112
}
113+
114+
// Known issue that if the original type is "long" and its value can be fit into the
115+
// "int", Pickler will serialize the value as int.
116+
long val = (obj is long v) ? v : (int)obj;
114117
return new Timestamp(
115118
new DateTime(val * 10 + DateType.s_unixTimeEpoch.Ticks, DateTimeKind.Utc));
116119
}

0 commit comments

Comments
 (0)