
Commit d54d87c

Merge branch 'master' into SPARK-55551-improve-broadcasthashjoinexec-output-partitioning
2 parents: c64aa9a + b976e7b

File tree: 210 files changed (+5645, -2983 lines)


.asf.yaml

Lines changed: 2 additions & 0 deletions
@@ -18,6 +18,8 @@
 github:
   description: "Apache Spark - A unified analytics engine for large-scale data processing"
   homepage: https://spark.apache.org/
+  features:
+    issues: true
   labels:
     - python
     - scala

.github/workflows/build_and_test.yml

Lines changed: 31 additions & 4 deletions
@@ -1104,11 +1104,14 @@ jobs:
       - name: List Python packages for branch-3.5 and branch-4.0
         if: inputs.branch == 'branch-3.5' || inputs.branch == 'branch-4.0'
         run: python3.9 -m pip list
+      - name: List Python packages for branch-4.1
+        if: inputs.branch == 'branch-4.1'
+        run: python3.11 -m pip list
       - name: List Python packages
-        if: inputs.branch != 'branch-3.5' && inputs.branch != 'branch-4.0'
+        if: inputs.branch != 'branch-3.5' && inputs.branch != 'branch-4.0' && inputs.branch != 'branch-4.1'
         run: |
           lsb_release -a
-          python3.11 -m pip list
+          python3.12 -m pip list
       - name: Install dependencies for documentation generation
         run: |
           # Keep the version of Bundler here in sync with the following locations:
@@ -1139,8 +1142,8 @@ jobs:
           echo "SKIP_SQLDOC: $SKIP_SQLDOC"
           cd docs
           bundle exec jekyll build
-      - name: Run documentation build
-        if: inputs.branch != 'branch-3.5' && inputs.branch != 'branch-4.0'
+      - name: Run documentation build for branch-4.1
+        if: inputs.branch == 'branch-4.1'
         run: |
           # We need this link to make sure `python3` points to `python3.11` which contains the prerequisite packages.
           ln -s "$(which python3.11)" "/usr/local/bin/python3"
@@ -1163,6 +1166,30 @@
           echo "SKIP_SQLDOC: $SKIP_SQLDOC"
           cd docs
           bundle exec jekyll build
+      - name: Run documentation build
+        if: inputs.branch != 'branch-3.5' && inputs.branch != 'branch-4.0' && inputs.branch != 'branch-4.1'
+        run: |
+          # We need this link to make sure `python3` points to `python3.12` which contains the prerequisite packages.
+          ln -s "$(which python3.12)" "/usr/local/bin/python3"
+          # Build docs first with SKIP_API to ensure they are buildable without requiring any
+          # language docs to be built beforehand.
+          cd docs; SKIP_ERRORDOC=1 SKIP_API=1 bundle exec jekyll build; cd ..
+          if [ -f "./dev/is-changed.py" ]; then
+            # Skip PySpark and SparkR docs while keeping Scala/Java/SQL docs
+            pyspark_modules=`cd dev && python3.12 -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"`
+            if [ `./dev/is-changed.py -m $pyspark_modules` = false ]; then export SKIP_PYTHONDOC=1; fi
+            if [ `./dev/is-changed.py -m sparkr` = false ]; then export SKIP_RDOC=1; fi
+          fi
+          export PYSPARK_DRIVER_PYTHON=python3.12
+          export PYSPARK_PYTHON=python3.12
+          # Print the values of environment variables `SKIP_ERRORDOC`, `SKIP_SCALADOC`, `SKIP_PYTHONDOC`, `SKIP_RDOC` and `SKIP_SQLDOC`
+          echo "SKIP_ERRORDOC: $SKIP_ERRORDOC"
+          echo "SKIP_SCALADOC: $SKIP_SCALADOC"
+          echo "SKIP_PYTHONDOC: $SKIP_PYTHONDOC"
+          echo "SKIP_RDOC: $SKIP_RDOC"
+          echo "SKIP_SQLDOC: $SKIP_SQLDOC"
+          cd docs
+          bundle exec jekyll build
       - name: Tar documentation
         if: github.repository != 'apache/spark'
         run: tar cjf site.tar.bz2 docs/_site

.github/workflows/test_report.yml

Lines changed: 3 additions & 1 deletion
@@ -26,7 +26,9 @@ on:
 
 jobs:
   test_report:
-    if: "!contains(fromJson('[\"skipped\", \"cancelled\"]'), github.event.workflow_run.conclusion)"
+    if: >
+      github.event.workflow_run.path != '.github/workflows/pages.yml' &&
+      !contains(fromJson('["skipped", "cancelled"]'), github.event.workflow_run.conclusion)
     runs-on: ubuntu-latest
     permissions:
       actions: read

common/utils/src/main/scala/org/apache/spark/SparkException.scala

Lines changed: 8 additions & 8 deletions
@@ -198,7 +198,7 @@ private[spark] class SparkUpgradeException private(
     )
   }
 
-  def this(
+  private[spark] def this(
       errorClass: String,
       messageParameters: Map[String, String],
       cause: Throwable,
@@ -243,7 +243,7 @@ private[spark] class SparkArithmeticException private(
     )
   }
 
-  def this(
+  private[spark] def this(
      errorClass: String,
      messageParameters: Map[String, String],
      context: Array[QueryContext],
@@ -291,7 +291,7 @@ private[spark] class SparkUnsupportedOperationException private(
     )
   }
 
-  def this(
+  private[spark] def this(
      errorClass: String,
      messageParameters: Map[String, String],
      sqlState: Option[String]) = {
@@ -417,7 +417,7 @@ private[spark] class SparkDateTimeException private(
     )
   }
 
-  def this(
+  private[spark] def this(
      errorClass: String,
      messageParameters: Map[String, String],
      context: Array[QueryContext],
@@ -491,7 +491,7 @@ private[spark] class SparkNumberFormatException private(
     )
   }
 
-  def this(
+  private[spark] def this(
      errorClass: String,
      messageParameters: Map[String, String],
      context: Array[QueryContext],
@@ -547,7 +547,7 @@ private[spark] class SparkIllegalArgumentException private(
     )
   }
 
-  def this(
+  private[spark] def this(
      errorClass: String,
      messageParameters: Map[String, String],
      context: Array[QueryContext],
@@ -639,7 +639,7 @@ private[spark] class SparkRuntimeException private(
     )
   }
 
-  def this(
+  private[spark] def this(
      errorClass: String,
      messageParameters: Map[String, String],
      cause: Throwable,
@@ -763,7 +763,7 @@ private[spark] class SparkArrayIndexOutOfBoundsException private(
     )
   }
 
-  def this(
+  private[spark] def this(
      errorClass: String,
      messageParameters: Map[String, String],
      context: Array[QueryContext],
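The net effect of these eight changes is that the auxiliary constructors of Spark's exception classes are only callable from code inside the org.apache.spark package. A rough, hypothetical sketch of the pattern (class and member names below are made up, not from this commit):

    // Hypothetical sketch of a private[spark] auxiliary constructor; all names
    // here are illustrative and do not appear in the commit.
    package org.apache.spark.example

    class ExampleException private (message: String) extends Exception(message) {

      // Visible only inside the org.apache.spark package, mirroring the
      // tightened visibility applied to the exception classes above.
      private[spark] def this(errorClass: String, params: Map[String, String]) =
        this(s"[$errorClass] " + params.map { case (k, v) => s"$k=$v" }.mkString(", "))
    }

    object ExampleException {
      // Code outside org.apache.spark constructs instances through a factory instead.
      def apply(errorClass: String, params: Map[String, String]): ExampleException =
        new ExampleException(errorClass, params)
    }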

connector/docker-integration-tests/README.md

Lines changed: 1 addition & 0 deletions
@@ -55,6 +55,7 @@ The following environment variables can be used to specify the custom Docker ima
 - MYSQL_DOCKER_IMAGE_NAME
 - ORACLE_DOCKER_IMAGE_NAME
 - POSTGRES_DOCKER_IMAGE_NAME
+- STARROCKS_DOCKER_IMAGE_NAME
 
 ## Using a custom Docker context
 

connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala

Lines changed: 19 additions & 19 deletions
@@ -79,7 +79,7 @@ class DB2IntegrationSuite extends SharedJDBCIntegrationSuite {
   }
 
   test("Basic test") {
-    val df = sqlContext.read.jdbc(jdbcUrl, "tbl", new Properties)
+    val df = spark.read.jdbc(jdbcUrl, "tbl", new Properties)
     val rows = df.collect()
     assert(rows.length == 2)
     val types = rows(0).toSeq.map(x => x.getClass.toString)
@@ -91,7 +91,7 @@ class DB2IntegrationSuite extends SharedJDBCIntegrationSuite {
   test("Numeric types") {
     Seq(true, false).foreach { legacy =>
       withSQLConf(SQLConf.LEGACY_DB2_TIMESTAMP_MAPPING_ENABLED.key -> legacy.toString) {
-        val df = sqlContext.read.jdbc(jdbcUrl, "numbers", new Properties)
+        val df = spark.read.jdbc(jdbcUrl, "numbers", new Properties)
         val rows = df.collect()
         assert(rows.length == 1)
         val types = rows(0).toSeq.map(x => x.getClass.toString)
@@ -131,7 +131,7 @@ class DB2IntegrationSuite extends SharedJDBCIntegrationSuite {
 
   test("Date types") {
     withDefaultTimeZone(UTC) {
-      val df = sqlContext.read.jdbc(jdbcUrl, "dates", new Properties)
+      val df = spark.read.jdbc(jdbcUrl, "dates", new Properties)
       val rows = df.collect()
       assert(rows.length == 1)
       val types = rows(0).toSeq.map(x => x.getClass.toString)
@@ -146,7 +146,7 @@ class DB2IntegrationSuite extends SharedJDBCIntegrationSuite {
   }
 
   test("String types") {
-    val df = sqlContext.read.jdbc(jdbcUrl, "strings", new Properties)
+    val df = spark.read.jdbc(jdbcUrl, "strings", new Properties)
     val rows = df.collect()
     assert(rows.length == 1)
     val types = rows(0).toSeq.map(x => x.getClass.toString)
@@ -164,20 +164,20 @@ class DB2IntegrationSuite extends SharedJDBCIntegrationSuite {
 
   test("Basic write test") {
     // cast decflt column with precision value of 38 to DB2 max decimal precision value of 31.
-    val df1 = sqlContext.read.jdbc(jdbcUrl, "numbers", new Properties)
+    val df1 = spark.read.jdbc(jdbcUrl, "numbers", new Properties)
       .selectExpr("small", "med", "big", "deci", "flt", "dbl", "real",
         "cast(decflt as decimal(31, 5)) as decflt")
-    val df2 = sqlContext.read.jdbc(jdbcUrl, "dates", new Properties)
-    val df3 = sqlContext.read.jdbc(jdbcUrl, "strings", new Properties)
+    val df2 = spark.read.jdbc(jdbcUrl, "dates", new Properties)
+    val df3 = spark.read.jdbc(jdbcUrl, "strings", new Properties)
     df1.write.jdbc(jdbcUrl, "numberscopy", new Properties)
     df2.write.jdbc(jdbcUrl, "datescopy", new Properties)
     df3.write.jdbc(jdbcUrl, "stringscopy", new Properties)
     // spark types that does not have exact matching db2 table types.
-    val df4 = sqlContext.createDataFrame(
+    val df4 = spark.createDataFrame(
       sparkContext.parallelize(Seq(Row("1".toShort, "20".toByte))),
       new StructType().add("c1", ShortType).add("b", ByteType))
     df4.write.jdbc(jdbcUrl, "otherscopy", new Properties)
-    val rows = sqlContext.read.jdbc(jdbcUrl, "otherscopy", new Properties).collect()
+    val rows = spark.read.jdbc(jdbcUrl, "otherscopy", new Properties).collect()
     assert(rows(0).getShort(0) == 1)
     assert(rows(0).getShort(1) == 20)
   }
@@ -215,20 +215,20 @@ class DB2IntegrationSuite extends SharedJDBCIntegrationSuite {
     ).map { case (x, y) =>
       Row(Integer.valueOf(x), String.valueOf(y))
     }
-    val df = sqlContext.read.jdbc(jdbcUrl, "tbl", new Properties)
+    val df = spark.read.jdbc(jdbcUrl, "tbl", new Properties)
     for (_ <- 0 to 2) {
       df.write.mode(SaveMode.Append).jdbc(jdbcUrl, "tblcopy", new Properties)
     }
-    assert(sqlContext.read.jdbc(jdbcUrl, "tblcopy", new Properties).count() === 6)
+    assert(spark.read.jdbc(jdbcUrl, "tblcopy", new Properties).count() === 6)
     df.write.mode(SaveMode.Overwrite).option("truncate", true)
       .jdbc(jdbcUrl, "tblcopy", new Properties)
-    val actual = sqlContext.read.jdbc(jdbcUrl, "tblcopy", new Properties).collect()
+    val actual = spark.read.jdbc(jdbcUrl, "tblcopy", new Properties).collect()
     assert(actual.length === 2)
     assert(actual.toSet === expectedResult)
   }
 
   test("SPARK-42534: DB2 Limit pushdown test") {
-    val actual = sqlContext.read
+    val actual = spark.read
       .format("jdbc")
       .option("url", jdbcUrl)
       .option("dbtable", "tbl")
@@ -238,7 +238,7 @@ class DB2IntegrationSuite extends SharedJDBCIntegrationSuite {
       .orderBy("x")
       .collect()
 
-    val expected = sqlContext.read
+    val expected = spark.read
       .format("jdbc")
       .option("url", jdbcUrl)
       .option("query", "SELECT x, y FROM tbl ORDER BY x FETCH FIRST 2 ROWS ONLY")
@@ -249,31 +249,31 @@ class DB2IntegrationSuite extends SharedJDBCIntegrationSuite {
   }
 
   test("SPARK-48269: boolean type") {
-    val df = sqlContext.read.jdbc(jdbcUrl, "booleans", new Properties)
+    val df = spark.read.jdbc(jdbcUrl, "booleans", new Properties)
     checkAnswer(df, Row(true))
     Seq(true, false).foreach { legacy =>
       withSQLConf(SQLConf.LEGACY_DB2_BOOLEAN_MAPPING_ENABLED.key -> legacy.toString) {
         val tbl = "booleanscopy" + legacy
         df.write.jdbc(jdbcUrl, tbl, new Properties)
         if (legacy) {
-          checkAnswer(sqlContext.read.jdbc(jdbcUrl, tbl, new Properties), Row("1"))
+          checkAnswer(spark.read.jdbc(jdbcUrl, tbl, new Properties), Row("1"))
         } else {
-          checkAnswer(sqlContext.read.jdbc(jdbcUrl, tbl, new Properties), Row(true))
+          checkAnswer(spark.read.jdbc(jdbcUrl, tbl, new Properties), Row(true))
         }
       }
     }
   }
 
   test("SPARK-48269: GRAPHIC types") {
-    val df = sqlContext.read.jdbc(jdbcUrl, "graphics", new Properties)
+    val df = spark.read.jdbc(jdbcUrl, "graphics", new Properties)
     checkAnswer(df, Row("a".padTo(16, ' '), "b"))
     // the padding happens in the source not because of reading as char type
     assert(!df.schema.exists {
       _.metadata.contains(CharVarcharUtils.CHAR_VARCHAR_TYPE_STRING_METADATA_KEY) })
   }
 
   test("SPARK-48269: binary types") {
-    val df = sqlContext.read.jdbc(jdbcUrl, "binarys", new Properties)
+    val df = spark.read.jdbc(jdbcUrl, "binarys", new Properties)
     checkAnswer(df, Row(
       "ABC".padTo(10, ' ').getBytes,
       "ABC".getBytes,

connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerJDBCIntegrationSuite.scala

Lines changed: 6 additions & 0 deletions
@@ -210,6 +210,7 @@ abstract class DockerJDBCIntegrationSuite
       assert(response.getState.getRunning)
     }
     jdbcUrl = db.getJdbcUrl(dockerIp, externalPort)
+    sleepBeforeTesting()
     var conn: Connection = null
     eventually(connectionTimeout, interval(1.second)) {
       conn = getConnection()
@@ -255,6 +256,11 @@
    */
   def dataPreparation(connection: Connection): Unit
 
+  /**
+   * Sleep for a while before testing.
+   */
+  def sleepBeforeTesting(): Unit = {}
+
   private def cleanupContainer(): Unit = {
     if (docker != null && container != null && !keepContainer) {
       try {
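The new sleepBeforeTesting() hook is a no-op by default and runs once the JDBC URL is known, before the first connection attempt. A self-contained sketch of how a suite might use it (the class names and the 30-second delay are illustrative, not from the commit):

    // Illustrative analogue of the new hook; the class names and the delay are
    // made up, only the sleepBeforeTesting() idea comes from the diff above.
    abstract class ContainerBackedSuite {
      /** Sleep for a while before testing; empty by default, as in the diff above. */
      def sleepBeforeTesting(): Unit = {}

      def connectToContainer(): Unit = {
        // ... start the container and compute the JDBC URL ...
        sleepBeforeTesting() // give slow-starting databases time to become reachable
        // ... open the first JDBC connection ...
      }
    }

    // A database whose container reports "running" before the server accepts
    // queries can override the hook with a fixed delay.
    class SlowStartingSuite extends ContainerBackedSuite {
      override def sleepBeforeTesting(): Unit = Thread.sleep(30 * 1000L)
    }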

connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerKrbJDBCIntegrationSuite.scala

Lines changed: 2 additions & 2 deletions
@@ -150,9 +150,9 @@ abstract class DockerKrbJDBCIntegrationSuite extends DockerJDBCIntegrationSuite
     props.setProperty("principal", principal)
 
     val tableName = "write_test"
-    sqlContext.createDataFrame(Seq(("foo", "bar")))
+    spark.createDataFrame(Seq(("foo", "bar")))
       .write.jdbc(jdbcUrl, tableName, props)
-    val df = sqlContext.read.jdbc(jdbcUrl, tableName, props)
+    val df = spark.read.jdbc(jdbcUrl, tableName, props)
 
     val schema = df.schema
     assert(schema.map(_.dataType).toSeq === Seq(StringType, StringType))

connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ class MariaDBKrbIntegrationSuite extends DockerKrbJDBCIntegrationSuite {
   override val db = new MariaDBDatabaseOnDocker() {
 
     override def getJdbcUrl(ip: String, port: Int): String =
-      s"jdbc:mysql://$ip:$port/mysql?user=$principal"
+      s"jdbc:mysql://$ip:$port/mysql?user=$principal&permitMysqlScheme"
 
     override def beforeContainerStart(
         hostConfigBuilder: HostConfig,

connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MySQLDatabaseOnDocker.scala

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.jdbc
 
 class MySQLDatabaseOnDocker extends DatabaseOnDocker {
-  override val imageName = sys.env.getOrElse("MYSQL_DOCKER_IMAGE_NAME", "mysql:9.2.0")
+  override val imageName = sys.env.getOrElse("MYSQL_DOCKER_IMAGE_NAME", "mysql:9.6.0")
   override val env = Map(
     "MYSQL_ROOT_PASSWORD" -> "rootpass"
   )
