|
2 | 2 | import pytest |
3 | 3 |
|
4 | 4 | from ert.analysis.misfit_preprocessor import ( |
| 5 | + cluster_responses, |
5 | 6 | get_nr_primary_components, |
6 | 7 | get_scaling_factor, |
7 | 8 | main, |
@@ -142,34 +143,33 @@ def test_that_correlated_and_independent_observations_are_grouped_separately( |
142 | 143 | np.testing.assert_equal(obs_errors, obs_error_copy) |
143 | 144 |
|
144 | 145 |
|
145 | | -@pytest.mark.parametrize("nr_observations", [0, 1, 2]) |
146 | | -def test_edge_cases_with_few_observations_return_default_values(nr_observations): |
147 | | - """Test that edge cases with 0-2 observations return default scaling values. |
148 | | - We do not know why this is the case. |
| 146 | +def test_edge_cases_with_few_observations_return_default_values(): |
| 147 | + """ |
| 148 | +    Test that the edge case of having only 2 observations returns default scaling |
| 149 | +    values. We create a response matrix where all rows are perfectly correlated, |
| 150 | +    which would ordinarily lead to a single cluster with 1 component and a scaling |
| 151 | +    factor of sqrt(num_observations / num_components) = sqrt(2). |
| 152 | +    However, since the number of observations is <= 2, the function should skip |
| 153 | +    clustering and PCA and return default values of 1.0 for all scaling factors. |
149 | 154 | """ |
150 | 155 | nr_realizations = 1000 |
| 156 | + nr_observations = 2 |
151 | 157 | Y = np.ones((nr_observations, nr_realizations)) |
152 | 158 |
|
153 | 159 | rng = np.random.default_rng(1234) |
154 | 160 | parameters_a = rng.normal(10, 1, nr_realizations) |
155 | 161 |
|
| 162 | +    # Create a response matrix where all rows are perfectly correlated |
156 | 163 | for i in range(nr_observations): |
157 | 164 | Y[i] = (i + 1) * parameters_a |
158 | 165 |
|
159 | | - scale_factors, clusters, nr_components = main(Y, Y.mean(axis=1)) |
| 166 | + # Add observation errors |
| 167 | + obs_errors = np.ones(nr_observations) * 0.1 |
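|  | +    # Per the docstring, clustering and PCA are skipped for <= 2 observations, |
|  | +    # so the magnitude of these (here arbitrary) errors should not matter. |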
160 | 168 |
|
161 | | - np.testing.assert_equal( |
162 | | - scale_factors, |
163 | | - np.array(nr_observations * [1.0]), |
164 | | - ) |
165 | | - |
166 | | - np.testing.assert_equal( |
167 | | - clusters, |
168 | | - np.array(nr_observations * [1.0]), |
169 | | - ) |
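|  | +    # main returns (scale_factors, clusters, nr_components); only the scale |
|  | +    # factors matter for this edge case. |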
| 169 | + scale_factors, *_ = main(Y, obs_errors) |
170 | 170 |
|
171 | 171 | np.testing.assert_equal( |
172 | | - nr_components, |
| 172 | + scale_factors, |
173 | 173 | np.array(nr_observations * [1.0]), |
174 | 174 | ) |
175 | 175 |
|
@@ -340,3 +340,267 @@ def test_autoscale_clusters_observations_by_correlation_pattern_ignoring_sign( |
340 | 340 | expected_scale_factors[cluster_indices] = expected_sf |
341 | 341 |
|
342 | 342 | np.testing.assert_allclose(scale_factors, expected_scale_factors) |
| 343 | + |
| 344 | + |
| 345 | +@pytest.mark.parametrize( |
| 346 | + ("corr_de_target", "expect_de_merged"), |
| 347 | + [ |
| 348 | + (0.55, True), # Above threshold ~0.42: D-E merges first |
| 349 | + (0.50, True), |
| 350 | + (0.45, True), |
| 351 | + (0.40, False), # Below threshold: A-B or B-C merges first |
| 352 | + (0.30, False), |
| 353 | + ], |
| 354 | +) |
| 355 | +def test_that_clustering_prioritizes_global_similarity_over_local_correlation( |
| 356 | + corr_de_target, expect_de_merged |
| 357 | +): |
| 358 | + """ |
| 359 | + This test demonstrates that the current clustering implementation (using |
| 360 | + Euclidean distance on correlation rows) can behave unintuitively by prioritizing |
| 361 | + pairs with LOWER direct correlation for merging over pairs with HIGHER direct |
| 362 | + correlation. This happens when the higher-correlation pair disagrees strongly on |
| 363 | + other variables. |
| 364 | +
|
| 365 | + "Prioritizing" here means that the hierarchical clustering algorithm considers |
| 366 | + the lower-correlation pair to be "closer" (more similar) and thus merges them |
| 367 | + earlier in the bottom-up clustering process. |
| 368 | +
|
| 369 | + Scenario: |
| 370 | + - Group 1: A, B, C. |
| 371 | + A and C are independent. |
| 372 | + B = A + C (correlated ~0.707 with both). |
| 373 | +      In other words, A and B are correlated ~0.7, and the same is true for B and C. |
| 374 | +      However, the Euclidean distance between A's and B's correlation rows is large |
| 375 | +      because of their disagreement on C. |
| 376 | +
|
| 377 | + - Group 2: D, E. |
| 378 | + D and E are isolated and correlated with strength rho (varied by parametrization). |
| 379 | + They agree perfectly on A, B, C (zero correlation with all). |
| 380 | + The Euclidean distance between D's and E's correlation rows is relatively |
| 381 | + small because they have consistent (zero) correlations with everything else. |
| 382 | +
|
| 383 | + Each variable's correlation row (the corresponding row in the correlation matrix) |
| 384 | + includes its correlation with all variables, including itself (1.0 on diagonal). |
| 385 | + For D and E we have: |
| 386 | +
|
| 387 | + D's row: [corr(D,A)=0, corr(D,B)=0, corr(D,C)=0, corr(D,D)=1.0, corr(D,E)=rho] |
| 388 | + E's row: [corr(E,A)=0, corr(E,B)=0, corr(E,C)=0, corr(E,D)=rho, corr(E,E)=1.0] |
| 389 | +
|
| 390 | + For A, B and C we have: |
| 391 | + A's row: [corr(A,A)=1.0, corr(A,B)=0.7, corr(A,C)=0, corr(A,D)=0, corr(A,E)=0] |
| 392 | + B's row: [corr(B,A)=0.7, corr(B,B)=1.0, corr(B,C)=0.7, corr(B,D)=0, corr(B,E)=0] |
| 393 | + C's row: [corr(C,A)=0, corr(C,B)=0.7, corr(C,C)=1.0, corr(C,D)=0, corr(C,E)=0] |
| 394 | +
|
| 395 | + Threshold calculations (based on Euclidean distance between correlation rows): |
| 396 | +      dist(D,E) = sqrt((0-0)^2 + (0-0)^2 + (0-0)^2 + (1-rho)^2 + (rho-1)^2) |
| 397 | +                = sqrt(2 * (1 - rho)^2) |
| 398 | +      dist(A,B) = dist(B,C) = sqrt((1-0.7)^2 + (0.7-1)^2 + (0-0.7)^2 + (0-0)^2 + (0-0)^2) |
| 399 | +                ≈ 0.82 |
| 400 | +
|
| 401 | + Solve dist(D,E) < dist(A,B) for rho: |
| 402 | + sqrt(2 * (1 - rho)^2) < 0.82 |
| 403 | + 2 * (1 - rho)^2 < 0.82^2 |
| 404 | + (1 - rho)^2 < 0.82^2 / 2 |
| 405 | + 1 - rho < sqrt(0.82^2 / 2) |
| 406 | + rho > 1 - sqrt(0.82^2 / 2) ≈ 0.42 |
| 407 | + Hence, when rho > 0.42, D-E merges first; otherwise either A-B or B-C merges first |
| 408 | + (depending on random variation in the sampling). |
| 409 | +
|
| 410 | + This test is parametrized to verify both regimes: |
| 411 | + - rho > 0.42: D-E merges first despite corr(A,B) > rho and corr(B,C) > rho |
| 412 | + - rho < 0.42: D-E no longer the closest pair (either A-B or B-C merges first) |
| 413 | + """ |
| 414 | + |
| 415 | + rng = np.random.default_rng(42) |
| 416 | + N_realizations = 10000 |
| 417 | + |
| 418 | + # Construct A, B, C |
| 419 | + A = rng.standard_normal(N_realizations) |
| 420 | + C = rng.standard_normal(N_realizations) |
| 421 | + # B = A + C. Normalize everyone. |
| 422 | + A = (A - np.mean(A)) / np.std(A) |
| 423 | + C = (C - np.mean(C)) / np.std(C) |
| 424 | + B = A + C |
| 425 | + B = (B - np.mean(B)) / np.std(B) |
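|  | +    # With A and C independent standard normals, corr(A, B) = corr(C, B) |
|  | +    # = 1 / sqrt(2) ~= 0.707, as stated in the docstring. |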
| 426 | + |
| 427 | + # Construct D, E with target correlation |
| 428 | + D = rng.standard_normal(N_realizations) |
| 429 | + noise = rng.standard_normal(N_realizations) |
| 430 | + E = corr_de_target * D + np.sqrt(1 - corr_de_target**2) * noise |
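|  | +    # For independent standard normals D and noise, this mixture keeps |
|  | +    # Var(E) = 1 and gives corr(D, E) = corr_de_target by construction. |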
| 431 | + D = (D - np.mean(D)) / np.std(D) |
| 432 | + E = (E - np.mean(E)) / np.std(E) |
| 433 | + |
| 434 | + dataset = np.array([A, B, C, D, E]).T |
| 435 | + |
| 436 | + # Verify correlations |
| 437 | + corr = np.corrcoef(dataset, rowvar=False) |
| 438 | + # The return type of corrcoef is ambiguous (float | ndarray) in stubs, |
| 439 | + # so we assert it is an array to silence static analysis warnings about indexing. |
| 440 | + assert isinstance(corr, np.ndarray) |
| 441 | + corr_AB = corr[0, 1] |
| 442 | + corr_BC = corr[1, 2] |
| 443 | + corr_DE = corr[3, 4] |
| 444 | + |
| 445 | + assert np.isclose(corr_AB, 0.707, atol=0.05), ( |
| 446 | + f"Setup error: A-B corr {corr_AB} != 0.707" |
| 447 | + ) |
| 448 | + assert np.isclose(corr_BC, 0.707, atol=0.05), ( |
| 449 | + f"Setup error: B-C corr {corr_BC} != 0.707" |
| 450 | + ) |
| 451 | + assert np.isclose(corr_DE, corr_de_target, atol=0.05), ( |
| 452 | + f"Setup error: D-E corr {corr_DE} != {corr_de_target}" |
| 453 | + ) |
| 454 | + |
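|  | +    # Putting numbers on the docstring's derivation (assuming the clustering |
|  | +    # metric is Euclidean distance between correlation rows): |
|  | +    #   dist(D, E) = sqrt(2) * (1 - rho): ~0.64 at rho=0.55, ~0.85 at rho=0.40 |
|  | +    #   dist(A, B) = dist(B, C) ~= 0.82, independent of rho |
|  | + |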
| 455 | + # We ask for a clustering that would force exactly ONE merge. |
| 456 | + # Total items = 5 (A, B, C, D, E). |
| 457 | + # If we ask for 4 clusters, the algorithm must pick the single "best" pair |
| 458 | + # to merge and leave the others as singletons. |
| 459 | + clusters = cluster_responses(dataset, nr_clusters=4) |
| 460 | + |
| 461 | + is_DE_merged = clusters[3] == clusters[4] |
| 462 | + is_AB_merged = clusters[0] == clusters[1] |
| 463 | + is_BC_merged = clusters[1] == clusters[2] |
| 464 | + |
| 465 | + failure_msg = ( |
| 466 | +        f"DE_merged={is_DE_merged}, AB_merged={is_AB_merged}, BC_merged={is_BC_merged}. " |
| 467 | + f"Correlations: AB={corr_AB:.3f}, BC={corr_BC:.3f}, DE={corr_DE:.3f}" |
| 468 | + ) |
| 469 | + assert is_DE_merged == expect_de_merged, failure_msg |
| 470 | +    # When D-E merges first, A-B and B-C should not (they stay separate) |
| 471 | + if expect_de_merged: |
| 472 | + assert not is_AB_merged, failure_msg |
| 473 | + assert not is_BC_merged, failure_msg |
| 474 | + |
| 475 | + |
| 476 | +def test_clustering_and_scaling_realistic_scenario(): |
| 477 | + """ |
| 478 | + This is an integration test for two key components of the autoscaler: |
| 479 | +
|
| 480 | + 1. Determining the number of clusters. |
| 481 | + 2. Determining the scaling factor for each cluster. |
| 482 | +
|
| 483 | + Scenario: |
| 484 | + We are given 500 noisy seismic observations and 20 precise well pressure |
| 485 | + observations. All the seismic observations are correlated and all the |
| 486 | + pressure observations are correlated, but the seismic and the pressure observations |
| 487 | + are independent of each other. |
| 488 | +
|
| 489 | + Intuitive behavior: |
| 490 | + The autoscaler should identify two clusters, one for the seismic observations and |
| 491 | + one for the pressure observations, and assign a scaling factor to each cluster based |
| 492 | + on the number of observations in that cluster (sqrt(500) for seismic and sqrt(20) |
| 493 | + for pressure). |
| 494 | +
|
| 495 | + Actual behavior with current implementation: |
| 496 | + One cluster with scaling factor sqrt(520). |
| 497 | +
|
| 498 | + Explanation of clustering: |
| 499 | + The autoscaler uses PCA to determine the number of clusters. When using PCA it is |
| 500 | + common to normalize the data before calculating the principal components |
| 501 | + (e.g., using StandardScaler to give each observation unit variance). However, |
| 502 | + the current implementation scales the responses by the observation errors instead. |
| 503 | + This means that the precise pressure observations are amplified and the noisy |
| 504 | + seismic observations are suppressed. As a result, the PCA identifies only one |
| 505 | + principal component, and hence, everything is grouped into one cluster. |
| 506 | +
|
| 507 | + Explanation of scaling factor: |
| 508 | + The scaling factor for a cluster is calculated as |
| 509 | + sqrt(num_observations_in_cluster / num_components), where num_components is |
| 510 | + calculated by doing PCA on the elements of the cluster (stopping when 95% of the |
| 511 | + total variance is explained). |
| 512 | + """ |
| 513 | + |
| 514 | + rng = np.random.default_rng(42) |
| 515 | + n_realizations = 100 |
| 516 | + |
| 517 | + n_seismic = 500 |
| 518 | + n_pressure = 20 |
| 519 | + |
| 520 | + # Two independent underlying parameters |
| 521 | + param_shallow = rng.normal(0, 1, size=(n_realizations, 1)) |
| 522 | + param_deep = rng.normal(0, 1, size=(n_realizations, 1)) |
| 523 | + |
| 524 | + # Initialize data by using broadcasting to create the correlated structures: |
| 525 | + |
| 526 | + # - Seismic responses = param_shallow * sensitivity, |
| 527 | + # (all 500 seismic obs are linear combinations of the same parameter to create |
| 528 | + # within-group correlation) |
| 529 | + |
| 530 | + # - Pressure responses = param_deep * sensitivity, |
| 531 | + # (all 20 pressure obs are linear combinations of a different parameter) |
| 532 | + |
| 533 | + # - Since param_shallow and param_deep are independent, this results in |
| 534 | + # the seismic responses being independent from the pressure responses. |
| 535 | + |
| 536 | + # Seismic: sensitive to shallow param |
| 537 | + seismic_sensitivity = rng.uniform(0.5, 1.5, size=(1, n_seismic)) |
| 538 | + seismic_responses = param_shallow @ seismic_sensitivity |
| 539 | + # Add noise |
| 540 | + seismic_responses += rng.normal(0, 0.1, size=(n_realizations, n_seismic)) |
| 541 | + |
| 542 | + # Pressure: sensitive to deep param (independent of seismic) |
| 543 | + pressure_sensitivity = rng.uniform(0.5, 1.5, size=(1, n_pressure)) |
| 544 | + pressure_responses = param_deep @ pressure_sensitivity |
| 545 | + # Add noise |
| 546 | + pressure_responses += rng.normal(0, 0.1, size=(n_realizations, n_pressure)) |
| 547 | + |
| 548 | + responses = np.hstack([seismic_responses, pressure_responses]) |
| 549 | + |
| 550 | + # Observation errors: seismic is NOISY, pressure is PRECISE |
| 551 | + seismic_errors = np.full(n_seismic, 2.0) |
| 552 | + pressure_errors = np.full(n_pressure, 0.05) |
| 553 | + obs_errors = np.hstack([seismic_errors, pressure_errors]) |
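|  | +    # If, as described in the docstring, responses are scaled by their |
|  | +    # observation errors, the precise pressure rows are amplified ~20x |
|  | +    # (1 / 0.05) while the noisy seismic rows are halved (1 / 2.0), which is |
|  | +    # what collapses the PCA onto a single component. |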
| 554 | + |
| 555 | + # Run the main function to get clusters and scaling factors |
| 556 | + scale_factors, clusters, _ = main(responses.T, obs_errors) |
| 557 | + |
| 558 | + # Assert that all observations are in the same cluster |
| 559 | + assert len(np.unique(clusters)) == 1 |
| 560 | + |
| 561 | +    # Assert that the scaling factor is sqrt(520) for all observations. |
| 562 | +    # Note that the pressure observations receive a scaling factor of |
| 563 | +    # sqrt(520) instead of sqrt(20), which means they are deflated far more |
| 564 | +    # than one would expect and are essentially ignored/suppressed by the |
| 565 | +    # updating algorithm, even though they are precise and should be |
| 566 | +    # influential. |
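|  | +    # Numerically: sqrt(520) ~= 22.8, versus the intuitive sqrt(20) ~= 4.5 |
|  | +    # for the pressure cluster and sqrt(500) ~= 22.4 for the seismic cluster. |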
| 567 | + expected_sf = np.sqrt(n_seismic + n_pressure) |
| 568 | + assert np.allclose(scale_factors, expected_sf) |
| 569 | + |
| 570 | + |
| 571 | +def test_clustering_and_scaling_edge_case(): |
| 572 | + """ |
| 573 | + This test demonstrates that when observations have irregular errors, the |
| 574 | + current clustering approach can lead to unintuitive results where independent |
| 575 | +    observations are clustered together and treated as correlated. |
| 576 | +
|
| 577 | + Scenario: |
| 578 | + Suppose we have 100 independent observations r_1,r_2,...,r_100 with corresponding |
| 579 | + observation errors, where r_1 has a small error, and r_2,...,r_100 have large |
| 580 | + errors. The error-scaling step amplifies r_1's response and suppresses |
| 581 | + r_2,...,r_100, leading to one cluster instead of 100 clusters. As a result, the |
| 582 | + history matching update is deflated as if all 100 observations were |
| 583 | + perfectly correlated, even though they are all independent. |
| 584 | + """ |
| 585 | + |
| 586 | + # Create 100 independent responses |
| 587 | + rng = np.random.default_rng(42) |
| 588 | + n_observations = 100 |
| 589 | + n_realizations = 1000 |
| 590 | + responses = rng.standard_normal((n_observations, n_realizations)) |
| 591 | + |
| 592 | + # Create irregular observation errors: one small, the rest large |
| 593 | + obs_errors = np.array([0.1] + [10.0] * (n_observations - 1)) |
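|  | +    # After scaling by the observation errors, r_1's response is weighted |
|  | +    # 100x more than the rest (10.0 / 0.1), so a single principal component |
|  | +    # dominates and everything lands in one cluster. |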
| 594 | + |
| 595 | + # Run clustering algorithm |
| 596 | + scale_factors, clusters, _ = main(responses, obs_errors) |
| 597 | + |
| 598 | + # Assert that all observations are clustered together (only 1 cluster) |
| 599 | + assert len(np.unique(clusters)) == 1 |
| 600 | + |
| 601 | + # Assert deflation rate |
| 602 | + # For independent observations, the scaling factor should be 1 |
| 603 | +    # but since all observations are clustered together and treated as perfectly |
| 604 | + # correlated, the scaling factor becomes sqrt(100) = 10 |
| 605 | + assert len(np.unique(scale_factors)) == 1 |
| 606 | + assert np.allclose(scale_factors, np.sqrt(100)) |