-
Notifications
You must be signed in to change notification settings - Fork 183
Add NearestNeighbors SPMD API #2557
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 20 commits
7213c80
7aea5a6
fa48719
3765c6c
b3c66af
ced4aca
72fc707
ca9408b
e8c1ed9
eca8bff
47a7d93
a56ee49
544cca3
8532fe3
b9eb2df
ae7ade0
19cde34
ffba570
0b7777e
5faaa8e
466e195
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,9 +24,9 @@ | |
) | ||
from sklearnex import config_context | ||
from sklearnex.tests.utils.spmd import ( | ||
_assert_unordered_allclose, | ||
_generate_classification_data, | ||
_generate_regression_data, | ||
_generate_statistic_data, | ||
_get_local_tensor, | ||
_mpi_libs_and_gpu_available, | ||
_spmd_assert_allclose, | ||
|
@@ -94,8 +94,8 @@ def test_knncls_spmd_gold(dataframe, queue): | |
spmd_result = spmd_model.predict(local_dpt_X_test) | ||
batch_result = batch_model.predict(X_test) | ||
|
||
_assert_unordered_allclose(spmd_indcs, batch_indcs, localize=True) | ||
_assert_unordered_allclose(spmd_dists, batch_dists, localize=True) | ||
_spmd_assert_allclose(spmd_indcs, batch_indcs) | ||
_spmd_assert_allclose(spmd_dists, batch_dists) | ||
_spmd_assert_allclose(spmd_result, batch_result) | ||
|
||
|
||
|
@@ -164,10 +164,8 @@ def test_knncls_spmd_synthetic( | |
|
||
tol = 1e-4 | ||
if dtype == np.float64: | ||
_assert_unordered_allclose(spmd_indcs, batch_indcs, localize=True) | ||
_assert_unordered_allclose( | ||
spmd_dists, batch_dists, localize=True, rtol=tol, atol=tol | ||
) | ||
_spmd_assert_allclose(spmd_indcs, batch_indcs) | ||
_spmd_assert_allclose(spmd_dists, batch_dists, rtol=tol, atol=tol) | ||
_spmd_assert_allclose(spmd_result, batch_result) | ||
|
||
|
||
|
@@ -231,8 +229,8 @@ def test_knnreg_spmd_gold(dataframe, queue): | |
spmd_result = spmd_model.predict(local_dpt_X_test) | ||
batch_result = batch_model.predict(X_test) | ||
|
||
_assert_unordered_allclose(spmd_indcs, batch_indcs, localize=True) | ||
_assert_unordered_allclose(spmd_dists, batch_dists, localize=True) | ||
_spmd_assert_allclose(spmd_indcs, batch_indcs) | ||
_spmd_assert_allclose(spmd_dists, batch_dists) | ||
_spmd_assert_allclose(spmd_result, batch_result) | ||
|
||
|
||
|
@@ -303,8 +301,133 @@ def test_knnreg_spmd_synthetic( | |
|
||
tol = 0.005 if dtype == np.float32 else 1e-4 | ||
if dtype == np.float64: | ||
_assert_unordered_allclose(spmd_indcs, batch_indcs, localize=True) | ||
_assert_unordered_allclose( | ||
spmd_dists, batch_dists, localize=True, rtol=tol, atol=tol | ||
) | ||
_spmd_assert_allclose(spmd_indcs, batch_indcs) | ||
_spmd_assert_allclose(spmd_dists, batch_dists, rtol=tol, atol=tol) | ||
_spmd_assert_allclose(spmd_result, batch_result, rtol=tol, atol=tol) | ||
|
||
|
||
@pytest.mark.skipif(
    not _mpi_libs_and_gpu_available,
    reason="GPU device and MPI libs required for test",
)
@pytest.mark.parametrize(
    "dataframe,queue",
    get_dataframes_and_queues(dataframe_filter_="dpnp,dpctl", device_filter_="gpu"),
)
@pytest.mark.mpi
def test_knnsearch_spmd_gold(dataframe, queue):
    """Compare SPMD and batch ``NearestNeighbors.kneighbors`` on a fixed dataset.

    Both estimators are fit with ``n_neighbors=2`` and ``algorithm="brute"`` on
    the same eight 2-D points, then queried with their own training data. The
    returned indices and distances are compared with ``_spmd_assert_allclose``.
    """
    # The estimators are imported inside the test body rather than at module level.
    from sklearnex.neighbors import NearestNeighbors as NearestNeighbors_Batch
    from sklearnex.spmd.neighbors import NearestNeighbors as NearestNeighbors_SPMD

    # Fixed ("gold") training points.
    gold_points = np.array(
        [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2], [10, 10], [9, 9]]
    )
    # NOTE(review): _get_local_tensor presumably selects this rank's share of
    # the data - confirm against sklearnex.tests.utils.spmd.
    local_points = _convert_to_dataframe(
        _get_local_tensor(gold_points), sycl_queue=queue, target_df=dataframe
    )

    # Both estimators are configured identically.
    search_params = {"n_neighbors": 2, "algorithm": "brute"}

    # Same order as before: fit SPMD, fit batch, query SPMD, query batch.
    spmd_model = NearestNeighbors_SPMD(**search_params).fit(local_points)
    batch_model = NearestNeighbors_Batch(**search_params).fit(gold_points)
    spmd_dists, spmd_indcs = spmd_model.kneighbors(local_points)
    batch_dists, batch_indcs = batch_model.kneighbors(gold_points)

    # Indices are checked before distances.
    _spmd_assert_allclose(spmd_indcs, batch_indcs)
    _spmd_assert_allclose(spmd_dists, batch_dists)
|
||
|
||
@pytest.mark.skipif( | ||
not _mpi_libs_and_gpu_available, | ||
reason="GPU device and MPI libs required for test", | ||
) | ||
@pytest.mark.parametrize( | ||
"dimensions", [{"n": 100, "m": 10, "k": 2}, {"n": 100000, "m": 100, "k": 100}] | ||
ethanglaser marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) | ||
@pytest.mark.parametrize( | ||
"dataframe,queue", | ||
get_dataframes_and_queues(dataframe_filter_="dpnp,dpctl", device_filter_="gpu"), | ||
) | ||
@pytest.mark.parametrize("dtype", [np.float32, np.float64]) | ||
@pytest.mark.mpi | ||
def test_knnsearch_spmd_synthetic( | ||
dimensions, | ||
dataframe, | ||
queue, | ||
dtype, | ||
): | ||
if dimensions["n"] > 10000 and dtype == np.float32: | ||
pytest.skip("Skipping large float32 test due to expected precision issues") | ||
|
||
# Import spmd and batch algo | ||
from sklearnex.neighbors import NearestNeighbors as NearestNeighbors_Batch | ||
from sklearnex.spmd.neighbors import NearestNeighbors as NearestNeighbors_SPMD | ||
|
||
# Generate data and convert to dataframe | ||
X_train = _generate_statistic_data(dimensions["n"], dimensions["m"], dtype=dtype) | ||
|
||
local_dpt_X_train = _convert_to_dataframe( | ||
_get_local_tensor(X_train), sycl_queue=queue, target_df=dataframe | ||
) | ||
|
||
# Ensure search results of batch algo match spmd | ||
spmd_model = NearestNeighbors_SPMD( | ||
n_neighbors=dimensions["k"], algorithm="brute" | ||
).fit(local_dpt_X_train) | ||
batch_model = NearestNeighbors_Batch( | ||
n_neighbors=dimensions["k"], algorithm="brute" | ||
).fit(X_train) | ||
spmd_dists, spmd_indcs = spmd_model.kneighbors(local_dpt_X_train) | ||
batch_dists, batch_indcs = batch_model.kneighbors(X_train) | ||
|
||
tol = 0.005 if dtype == np.float32 else 1e-6 | ||
Yikes on this float32 tolerance. Is there any background on it? Especially since there is a skip associated with float32 above (meaning an even worse value occurs there?).
It's true, and a good observation. It's pretty tricky because the assert-allclose check fails even if a single element is outside the threshold, which is why the tolerance is so loose - it would be nice if that could be customized. It's possible that we could still run the indices check for this case, but distances are more fragile.
This is not the only place in the spmd test scope where drastically loose tolerances are needed for the float32 tests to pass, though. |
||
_spmd_assert_allclose(spmd_indcs, batch_indcs) | ||
_spmd_assert_allclose(spmd_dists, batch_dists, rtol=tol, atol=tol) | ||
|
||
|
||
@pytest.mark.skipif( | ||
not _mpi_libs_and_gpu_available, | ||
reason="GPU device and MPI libs required for test", | ||
) | ||
@pytest.mark.parametrize( | ||
"dataframe,queue", | ||
get_dataframes_and_queues(dataframe_filter_="dpnp,dpctl", device_filter_="gpu"), | ||
) | ||
@pytest.mark.mpi | ||
def test_knn_spmd_empty_kneighbors(dataframe, queue): | ||
# Import spmd and batch algo | ||
from sklearnex.neighbors import NearestNeighbors as NearestNeighbors_Batch | ||
from sklearnex.spmd.neighbors import ( | ||
KNeighborsClassifier, | ||
KNeighborsRegressor, | ||
NearestNeighbors, | ||
) | ||
|
||
# Create gold data and convert to dataframe | ||
X_train = np.array( | ||
[[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2], [10, 10], [9, 9]] | ||
) | ||
y_train = np.array([0, 1, 0, 1, 0, 1, 0, 1]) | ||
local_dpt_X_train = _convert_to_dataframe( | ||
_get_local_tensor(X_train), sycl_queue=queue, target_df=dataframe | ||
) | ||
local_dpt_y_train = _convert_to_dataframe( | ||
_get_local_tensor(y_train), sycl_queue=queue, target_df=dataframe | ||
) | ||
|
||
# Run each estimator without an input to kneighbors() and ensure functionality and equivalence | ||
for CurrentEstimator in [KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors]: | ||
I see why this was done, but it is a bit painful to analyze if there is a failure. Ideally the estimators would be parametrized over, but that isn't really possible given the way they are imported. It would be worth adding some sort of message that identifies the CurrentEstimator (rather than having to dig through the pytest log to find out what the current value of CurrentEstimator was).
Yeah - I am pretty open to ideas on this one. The loop is great because I run the exact same test on all 3 classes, but you are correct that analysis on a failure is trickier. I think scikit-learn may do things like this; I could check how they do it.
I guess it's easier there because in sklearn they import at the top of the file. |
||
spmd_model = CurrentEstimator(n_neighbors=1, algorithm="brute").fit( | ||
local_dpt_X_train, local_dpt_y_train | ||
) | ||
batch_model = NearestNeighbors_Batch(n_neighbors=1, algorithm="brute").fit( | ||
X_train, y_train | ||
) | ||
spmd_dists, spmd_indcs = spmd_model.kneighbors() | ||
batch_dists, batch_indcs = batch_model.kneighbors() | ||
|
||
_spmd_assert_allclose(spmd_indcs, batch_indcs) | ||
_spmd_assert_allclose(spmd_dists, batch_dists) |
Uh oh!
There was an error while loading. Please reload this page.