@@ -17,16 +17,22 @@
 
 package org.apache.spark.sql.streaming
 
+import org.apache.hadoop.fs.Path
 import org.scalactic.source.Position
 import org.scalatest.Tag
+import org.scalatest.matchers.must.Matchers.be
+import org.scalatest.matchers.should.Matchers.convertToAnyShouldWrapper
+import org.scalatest.time.{Seconds, Span}
 
 import org.apache.spark.SparkUnsupportedOperationException
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.execution.datasources.v2.state.StateSourceOptions
-import org.apache.spark.sql.execution.streaming.MemoryStream
-import org.apache.spark.sql.execution.streaming.state.{RocksDBStateStoreProvider, StateStoreInvalidValueSchemaEvolution, StateStoreValueSchemaEvolutionThresholdExceeded}
+import org.apache.spark.sql.execution.streaming.{CheckpointFileManager, MemoryStream, MicroBatchExecution}
+import org.apache.spark.sql.execution.streaming.StreamingCheckpointConstants.DIR_NAME_OFFSETS
+import org.apache.spark.sql.execution.streaming.state.{OperatorStateMetadataV2, RocksDBStateStoreProvider, StateStoreInvalidValueSchemaEvolution, StateStoreValueSchemaEvolutionThresholdExceeded}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.streaming.util.StreamManualClock
+import org.apache.spark.sql.types.StructType
 
 class TransformWithStateAvroSuite extends TransformWithStateSuite {
 
@@ -264,6 +270,190 @@ class TransformWithStateAvroSuite extends TransformWithStateSuite
     }
   }
 
+  test("transformWithState - verify schema files are retained through multiple evolutions") {
+    withSQLConf(
+      SQLConf.STATE_STORE_PROVIDER_CLASS.key -> classOf[RocksDBStateStoreProvider].getName,
+      SQLConf.SHUFFLE_PARTITIONS.key ->
+        TransformWithStateSuiteUtils.NUM_SHUFFLE_PARTITIONS.toString,
+      SQLConf.MIN_BATCHES_TO_RETAIN.key -> "1",
+      SQLConf.STREAMING_STATE_STORE_ENCODING_FORMAT.key -> "avro") {
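+      // MIN_BATCHES_TO_RETAIN = 1 makes metadata purging as aggressive as possible;
+      // the assertions below rely on this while expecting schema files to survive.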
+      withTempDir { chkptDir =>
+        val stateOpIdPath = new Path(new Path(chkptDir.getCanonicalPath, "state"), "0")
+        val stateSchemaPath = getStateSchemaPath(stateOpIdPath)
+        val metadataPath = OperatorStateMetadataV2.metadataDirPath(stateOpIdPath)
+
+        // Start with initial basic state schema
+        val inputData = MemoryStream[String]
+        val result1 = inputData.toDS()
+          .groupByKey(x => x)
+          .transformWithState(new DefaultValueInitialProcessor(),
+            TimeMode.None(),
+            OutputMode.Update())
+
+        testStream(result1, OutputMode.Update())(
+          StartStream(checkpointLocation = chkptDir.getCanonicalPath),
+          AddData(inputData, "a"),
+          CheckNewAnswer(("a", BasicState("a".hashCode, "a"))),
+          Execute { q =>
+            eventually(timeout(Span(5, Seconds))) {
+              q.asInstanceOf[MicroBatchExecution].arePendingAsyncPurge should be(false)
+            }
+          },
+          StopStream
+        )
+
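+        // Plant dummy entries next to the real offset, operator-metadata and schema
+        // files; the assertions below expect the purge logic to leave them in place.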
+        val hadoopConf = spark.sessionState.newHadoopConf()
+        val fm = CheckpointFileManager.create(new Path(chkptDir.toString),
+          hadoopConf)
+        fm.mkdirs(new Path(new Path(chkptDir.toString, DIR_NAME_OFFSETS),
+          "dummy_path_name"))
+        fm.mkdirs(
+          new Path(OperatorStateMetadataV2.metadataDirPath(
+            new Path(new Path(new Path(chkptDir.toString), "state"), "0")
+          ),
+          "dummy_path_name")
+        )
+        val dummySchemaPath =
+          new Path(stateSchemaPath, "__dummy_file_path")
+        fm.mkdirs(dummySchemaPath)
+
+
+        // Capture initial schema files (after first schema evolution)
+        val initialSchemaFiles = getFiles(stateSchemaPath).length
+        assert(initialSchemaFiles > 0, "Expected schema files after initial run")
+
+        // Second run with evolved state (adding fields)
+        val result2 = inputData.toDS()
+          .groupByKey(x => x)
+          .transformWithState(new DefaultValueEvolvedProcessor(),
+            TimeMode.None(),
+            OutputMode.Update())
+
+        testStream(result2, OutputMode.Update())(
+          StartStream(checkpointLocation = chkptDir.getCanonicalPath),
+          AddData(inputData, "b"),
+          CheckNewAnswer(("b", EvolvedState("b".hashCode, "b", 100L, true, 99.9))),
+          Execute { q =>
+            eventually(timeout(Span(5, Seconds))) {
+              q.asInstanceOf[MicroBatchExecution].arePendingAsyncPurge should be(false)
+            }
+          },
+          StopStream
+        )
+
+        // Capture schema files after second evolution
+        val afterAddingFieldsSchemaFiles = getFiles(stateSchemaPath).length
+        assert(afterAddingFieldsSchemaFiles > initialSchemaFiles,
+          s"Expected more schema files after adding fields," +
+            s" but had $initialSchemaFiles before and $afterAddingFieldsSchemaFiles after")
+
+        // Third run with TwoLongs schema
+        val result3 = inputData.toDS()
+          .groupByKey(x => x)
+          .transformWithState(new RunningCountStatefulProcessorTwoLongs(),
+            TimeMode.None(),
+            OutputMode.Update())
+
+        testStream(result3, OutputMode.Update())(
+          StartStream(checkpointLocation = chkptDir.getCanonicalPath),
+          AddData(inputData, "c"),
+          CheckNewAnswer(("c", "1")),
+          Execute { q =>
+            eventually(timeout(Span(5, Seconds))) {
+              q.asInstanceOf[MicroBatchExecution].arePendingAsyncPurge should be(false)
+            }
+          },
+          StopStream
+        )
+
+        // Capture schema files after third evolution
+        val afterTwoLongsSchemaFiles = getFiles(stateSchemaPath).length
+        assert(afterTwoLongsSchemaFiles > afterAddingFieldsSchemaFiles,
+          "Expected more schema files after TwoLongs schema change")
+
+        // Fourth run with ReorderedLongs schema
+        val result4 = inputData.toDS()
+          .groupByKey(x => x)
+          .transformWithState(new RunningCountStatefulProcessorReorderedFields(),
+            TimeMode.None(),
+            OutputMode.Update())
+
+        testStream(result4, OutputMode.Update())(
+          StartStream(checkpointLocation = chkptDir.getCanonicalPath),
+          AddData(inputData, "d"),
+          CheckNewAnswer(("d", "1")),
+          Execute { q =>
+            eventually(timeout(Span(5, Seconds))) {
+              q.asInstanceOf[MicroBatchExecution].arePendingAsyncPurge should be(false)
+            }
+          },
+          StopStream
+        )
+
+        // Capture schema files after fourth evolution
+        val afterReorderedSchemaFiles = getFiles(stateSchemaPath).length
+        assert(afterReorderedSchemaFiles > afterTwoLongsSchemaFiles,
+          "Expected more schema files after ReorderedLongs schema change")
+
+        // Fifth run with RenamedFields schema
+        val result5 = inputData.toDS()
+          .groupByKey(x => x)
+          .transformWithState(new RenameEvolvedProcessor(),
+            TimeMode.None(),
+            OutputMode.Update())
+
+        testStream(result5, OutputMode.Update())(
+          StartStream(checkpointLocation = chkptDir.getCanonicalPath),
+          AddData(inputData, "e"),
+          CheckNewAnswer(("e", "1")),
+          // Run multiple batches to trigger maintenance
+          AddData(inputData, "f"),
+          CheckNewAnswer(("f", "1")),
+          AddData(inputData, "g"),
+          CheckNewAnswer(("g", "1")),
+          Execute { q =>
+            eventually(timeout(Span(5, Seconds))) {
+              q.asInstanceOf[MicroBatchExecution].arePendingAsyncPurge should be(false)
+            }
+          },
+          StopStream
+        )
+
+        // Verify metadata files were purged with MIN_BATCHES_TO_RETAIN=1
+        val finalMetadataFiles = getFiles(metadataPath).length
+        // We expect the dummy folder and 2 metadata files
+        assert(finalMetadataFiles <= 3,
+          s"Expected metadata files to be purged to at most 3, but found $finalMetadataFiles")
+
+        // Verify schema files were NOT purged despite aggressive metadata purging
+        val schemaFiles = getFiles(stateSchemaPath).map(_.getPath.getName)
+        val finalSchemaFiles = schemaFiles.length
+        assert(finalSchemaFiles >= 5,
+          s"Expected at least 5 schema files to be retained" +
+            s" (one per schema evolution), but found $finalSchemaFiles")
+        assert(schemaFiles.contains(dummySchemaPath.getName))
+
+        // Verify we can read historical state for different batches
+        // This should work even though metadata may have been purged for earlier batches
+        val latestStateDf = spark.read
+          .format("statestore")
+          .option(StateSourceOptions.PATH, chkptDir.getAbsolutePath)
+          .option(StateSourceOptions.STATE_VAR_NAME, "countState")
+          .load()
+
+        assert(latestStateDf.count() > 0, "Expected to read current state data")
+
+        // Check schema of latest state - should have RenamedFields schema structure
+        val latestValueField = latestStateDf.schema.fields.find(_.name == "value").get
+        val latestValueType = latestValueField.dataType.asInstanceOf[StructType]
+
+        // Should have value4 field from RenamedFields
+        assert(latestValueType.fields.exists(f => f.name == "value4"),
+          "Expected renamed schema with value4 field")
+      }
+    }
+  }
+
   test("transformWithState - adding field should succeed") {
     withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key ->
       classOf[RocksDBStateStoreProvider].getName,
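The reason every schema file must be retained is that Avro-encoded state written under an older value schema stays decodable after several evolutions. As a rough illustration only (not part of this patch), the state data source can also be pointed at an earlier committed batch via its batchId option; the checkpoint path and batch id below are placeholders:

// Sketch: read the "countState" variable as of an earlier micro-batch.
// "/tmp/chkpt" and the batch id are illustrative, not values from the test.
val oldStateDf = spark.read
  .format("statestore")
  .option(StateSourceOptions.PATH, "/tmp/chkpt")
  .option(StateSourceOptions.BATCH_ID, 0L)
  .option(StateSourceOptions.STATE_VAR_NAME, "countState")
  .load()

// Decoding values written under older schemas relies on the retained schema
// files, which is why the test above expects them to survive purging.
oldStateDf.select("key", "value").show()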