|
| 1 | +=========================================== |
| 2 | +Hybrid Execution (Public Preview) |
| 3 | +=========================================== |
| 4 | + |
| 5 | +Snowpark pandas supports workloads on mixed underlying execution engines and will automatically |
| 6 | +move data to the most appropriate engine for a given dataset size and operation. Currently you |
| 7 | +can use either Snowflake or local pandas to back a DataFrame object. Decisions on when to move |
| 8 | +data are dominated by dataset size. |
| 9 | + |
| 10 | +For Snowflake, specific API calls will trigger hybrid backend evaluation. These are registered |
| 11 | +as either a pre-operation switch point or a post-operation switch point. These switch points |
| 12 | +may change over time as the feature matures and as APIs are updated. |
| 13 | + |
| 14 | +Example Pre-Operation Switchpoints: |
| 15 | +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 16 | +apply, iterrows, itertuples, items, plot, quantile, __init__, T, read_csv, read_json, concat, merge |
| 17 | + |
| 18 | +Post-Operation Switchpoints: |
| 19 | +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 20 | +read_snowflake, value_counts, tail, var, std, sum, sem, max, min, mean, agg, aggregate, count, nunique, cummax, cummin, cumprod, cumsum |
| 21 | + |
| 22 | + |
| 23 | +Examples |
| 24 | +======== |
| 25 | + |
| 26 | +Enabling Hybrid Execution |
| 27 | +~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 28 | + |
| 29 | +.. code-block:: python |
| 30 | +
|
| 31 | + import modin.pandas as pd |
| 32 | + import snowflake.snowpark.modin.plugin |
| 33 | +
|
| 34 | + # Import the configuration variable |
| 35 | + from modin.config import AutoSwitchBackend |
| 36 | + from snowflake.snowpark import Session |
| 37 | + |
| 38 | + Session.builder.create() |
| 39 | + df = pd.DataFrame([1, 2, 3]) |
| 40 | + print(df.get_backend()) # 'Snowflake' |
| 41 | +
|
| 42 | + # Enable hybrid execution |
| 43 | + AutoSwitchBackend().enable() |
| 44 | + df = pd.DataFrame([4, 5, 6]) |
| 45 | + # DataFrame should use local execution backend, 'Pandas' |
| 46 | +   # because the DataFrame is already small and in memory |
| 47 | + print(df.get_backend()) # 'Pandas' |
| 48 | +
|
| 49 | + # Using a configuration context to change behavior |
| 50 | + # within a specific code block |
| 51 | + from modin.config import context as config_context |
| 52 | + with config_context(AutoSwitchBackend=False): |
| 53 | + # perform operations with no switching |
| 54 | + df = pd.DataFrame([[1, 2], [3, 4]]) |
| 55 | +
|
| 56 | +   # Disable hybrid execution (all DataFrames stay on their existing engine) |
| 57 | + AutoSwitchBackend().disable() |
| 58 | +
|
| 59 | +Manually Changing DataFrame Backends |
| 60 | +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 61 | + |
| 62 | +.. code-block:: python |
| 63 | +
|
| 64 | + # Move a DataFrame to the local machine |
| 65 | + df_local = df.move_to('Pandas') |
| 66 | + # Move a DataFrame to be backed by Snowflake |
| 67 | + df_snow = df_local.move_to('Snowflake') |
| 68 | + # Move a DataFrame to the local machine, without changing the 'df' reference |
| 69 | + df.move_to('Pandas', inplace=True) |
| 70 | + # "pin" the current backend, preventing data movement |
| 71 | + df.pin_backend(inplace=True) |
| 72 | +   # "unpin" the current backend, allowing data movement again |
| 73 | + df.unpin_backend(inplace=True) |
| 74 | +
|
| 75 | + from modin.config import context as config_context |
| 76 | + with config_context(Backend="Pandas"): |
| 77 | + # Operations only performed using the Pandas backend |
| 78 | + df = pd.DataFrame([4, 5, 6]) |
| 79 | +
|
| 80 | + with config_context(Backend="Snowflake"): |
| 81 | + # Operations only performed using the Snowflake backend |
| 82 | + df = pd.DataFrame([4, 5, 6]) |
| 83 | +
|
| 84 | +Configuring Local Pandas Backend |
| 85 | +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 86 | + |
| 87 | +Currently the auto switching behavior is dominated by dataset size, with some exceptions |
| 88 | +for specific operations. The default limit for running workloads on the local pandas |
| 89 | +backend is 10M rows. This can be configured through the modin environment variables: |
| 90 | + |
| 91 | +.. code-block:: python |
| 92 | +
|
| 93 | + # Change row threshold to 500k |
| 94 | + from modin.config.envvars import NativePandasMaxRows |
| 95 | + from modin.config import context as config_context |
| 96 | +
|
| 97 | + NativePandasMaxRows.put(500_000) |
| 98 | +
|
| 99 | + # Use a config context to set the Pandas backend parameters |
| 100 | + with config_context(NativePandasMaxRows=1234): |
| 101 | + # Operations only performed using the Pandas backend |
| 102 | + df = pd.DataFrame([4, 5, 6]) |
0 commit comments