Skip to content

Commit 5dbd994

Browse files
authored
feat: support dask and spark dataframes in evaluate (#121)
1 parent 2d016ed commit 5dbd994

File tree

12 files changed

+435
-74
lines changed

12 files changed

+435
-74
lines changed

.github/workflows/build-docs.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ jobs:
3232
set -ux
3333
python -m pip install --upgrade pip
3434
pip install -Uq nbdev
35-
pip install -e ".[dev]"
35+
# Quoted so the shell does not treat ">=0.8.1" as an output redirection.
pip install ".[dev]" "fugue[dask,spark]>=0.8.1"
3636
mkdir nbs/_extensions
3737
cp -r docs-scripts/mintlify/ nbs/_extensions/
3838
python docs-scripts/update-quarto.py

.github/workflows/ci.yaml

Lines changed: 13 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -7,56 +7,48 @@ on:
77
branches: [main]
88
workflow_dispatch:
99

10-
defaults:
11-
run:
12-
shell: bash -l {0}
13-
1410
concurrency:
1511
group: ${{ github.workflow }}-${{ github.ref }}
1612
cancel-in-progress: true
1713

1814
jobs:
1915
all-tests:
20-
runs-on: ${{ matrix.os }}
16+
runs-on: ubuntu-latest
2117
strategy:
2218
fail-fast: false
2319
matrix:
24-
os: [macos-latest, ubuntu-latest]
25-
python-version: ['3.8', '3.9', '3.10', '3.11']
20+
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
2621
steps:
2722
- name: Clone repo
2823
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
2924

30-
- name: Set up environment
31-
uses: mamba-org/setup-micromamba@f8b8a1e23a26f60a44c853292711bacfd3eac822 # v1.9.0
25+
- uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
3226
with:
33-
environment-file: environment.yml
34-
create-args: python=${{ matrix.python-version }}
35-
cache-environment: true
27+
python-version: ${{ matrix.python-version }}
3628

3729
- name: Install the library
38-
run: pip install ./
30+
# The fugue requirement is quoted so the shell does not parse ">=0.8.1" as a redirection.
run: pip install uv && uv pip install --system ".[dev]" "fugue[dask,spark]>=0.8.1"
3931

4032
- name: Run tests
41-
run: nbdev_test --do_print --timing --flags 'matplotlib polars pyarrow scipy'
33+
run: nbdev_test --do_print --timing --flags 'datasets distributed matplotlib polars pyarrow scipy'
4234

43-
windows-tests:
44-
runs-on: windows-latest
35+
local-tests:
36+
runs-on: ${{ matrix.os }}
4537
strategy:
4638
fail-fast: false
4739
matrix:
48-
python-version: ['3.8', '3.9', '3.10', '3.11']
40+
os: [macos-latest, windows-latest]
41+
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
4942
steps:
5043
- name: Clone repo
5144
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
5245

53-
- name: Set up environment
54-
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
46+
- uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
5547
with:
5648
python-version: ${{ matrix.python-version }}
5749

5850
- name: Install the library
59-
run: pip install uv && uv pip install ".[dev]" --system
51+
run: pip install uv && uv pip install --system ".[dev]"
6052

6153
- name: Run tests
6254
run: nbdev_test --do_print --timing --flags 'datasets matplotlib polars pyarrow scipy'
@@ -75,11 +67,10 @@ jobs:
7567
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
7668
with:
7769
python-version: '3.10'
78-
cache: 'pip'
7970

8071
- name: Install dependencies
8172
shell: bash
82-
run: pip3 install . nbdev
73+
run: pip install . nbdev
8374

8475
- name: Run tests
8576
shell: bash

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,4 @@ repos:
1515
hooks:
1616
- id: mypy
1717
args: [--ignore-missing-imports]
18+
exclude: 'setup.py'

action_files/clean_nbs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
nbdev_clean
2+
./action_files/remove_logs_cells

action_files/remove_logs_cells

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/usr/bin/env python3
2+
import re
3+
from pathlib import Path
4+
from nbdev.clean import process_write
5+
6+
IP_REGEX = re.compile(r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}')
7+
HOURS_REGEX = re.compile(r'\d{2}:\d{2}:\d{2}')
8+
9+
def cell_contains_ips(cell):
    """Report whether any text output of a notebook cell looks like log noise.

    A line is flagged when it matches ``IP_REGEX``, matches ``HOURS_REGEX``,
    or contains the literal ``'[LightGBM]'``.

    Parameters
    ----------
    cell : mapping
        A notebook cell. Only its optional ``'outputs'`` entry is read.

    Returns
    -------
    bool
        True as soon as one flagged line is found, False otherwise
        (including when the cell has no ``'outputs'`` key).
    """
    if 'outputs' not in cell:
        return False
    for output in cell['outputs']:
        # Skip outputs without text instead of returning: a later output in
        # the same cell may still carry text that has to be inspected.
        if 'text' not in output:
            continue
        text = output['text']
        # A single string is split into lines so the patterns are applied
        # per line rather than per character.
        lines = text.splitlines() if isinstance(text, str) else text
        for line in lines:
            if IP_REGEX.search(line) or HOURS_REGEX.search(line) or '[LightGBM]' in line:
                return True
    return False
19+
20+
21+
def clean_nb(nb):
    """Empty the outputs of every cell that ``cell_contains_ips`` flags.

    The notebook is modified in place; nothing is returned.
    """
    for current_cell in nb['cells']:
        if not cell_contains_ips(current_cell):
            continue
        current_cell['outputs'] = []
25+
26+
27+
if __name__ == '__main__':
    # parents[1] is the directory above the one holding this script;
    # the notebooks are looked up in its 'nbs' subdirectory.
    notebooks_dir = Path(__file__).parents[1] / 'nbs'
    for notebook_path in notebooks_dir.glob('*.ipynb'):
        process_write(warn_msg='Failed to clean_nb', proc_nb=clean_nb, f_in=notebook_path)

nbs/compat.ipynb

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,30 @@
9595
" return f(*args, **kwargs)\n",
9696
" return wrapper\n",
9797
"\n",
98+
"try:\n",
99+
" from dask.dataframe import DataFrame as DaskDataFrame\n",
100+
"except ModuleNotFoundError:\n",
101+
" pass\n",
102+
"\n",
103+
"try:\n",
104+
" from pyspark.sql import DataFrame as SparkDataFrame\n",
105+
"except ModuleNotFoundError:\n",
106+
" pass\n",
107+
"\n",
98108
"DataFrame = Union[pd.DataFrame, pl_DataFrame]\n",
99-
"Series = Union[pd.Series, pl_Series]"
109+
"Series = Union[pd.Series, pl_Series]\n",
110+
"DistributedDFType = TypeVar(\n",
111+
" \"DistributedDFType\",\n",
112+
" \"DaskDataFrame\",\n",
113+
" \"SparkDataFrame\",\n",
114+
")\n",
115+
"AnyDFType = TypeVar(\n",
116+
" \"AnyDFType\",\n",
117+
" \"DaskDataFrame\",\n",
118+
" pd.DataFrame,\n",
119+
" \"pl_DataFrame\",\n",
120+
" \"SparkDataFrame\",\n",
121+
")"
100122
]
101123
}
102124
],

0 commit comments

Comments
 (0)