
Commit f8fcfbd

Merge pull request #169 from singjc/refactor/make_pyarrow_pq_optional
Refactor/make pyarrow pq optional
2 parents c81934f + f0e0025 commit f8fcfbd

File tree

14 files changed: +241 additions, -121 deletions


pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -42,7 +42,6 @@ dependencies = [
     "scikit-learn >= 1.5",
     "xgboost-cpu >= 2.1.4", # regular xgboost includes nvidia libraries which bloat the package size on linux. For PyProphet, we likely would not need GPU support.
     "matplotlib",
-    "pyarrow",
     "pypdf",
     "psutil",
     "pyopenms",
@@ -55,6 +54,7 @@ dependencies = [
 testing = ["pytest", "pytest-regtest", "pytest-xdist"]
 docs = ["sphinx", "sphinx-copybutton", "sphinx_rtd_theme", "pydata_sphinx_theme", "sphinx-click"]
 dev = ["pyprophet[testing]", "pyprophet[docs]", "black", "ruff", "mypy"]
+parquet = ["pyarrow"]

 # Define console entry points
 [project.scripts]
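
With pyarrow moved out of the core dependencies and into a new optional `parquet` extra, Parquet support becomes opt-in: users who need it install `pyprophet[parquet]`, everyone else gets a lighter install. As a minimal, hypothetical sketch (not code from this commit), calling code can detect the missing extra and point users at it like this:

import importlib.util

# Probe for the optional Parquet backend before attempting any Parquet I/O.
if importlib.util.find_spec("pyarrow") is None:
    raise ImportError(
        "Parquet support requires the optional pyarrow dependency; "
        "install it with: pip install 'pyprophet[parquet]'"
    )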

pyprophet/io/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@
 Dependencies:
 -------------
 - `pandas`
-- `pyarrow`
+- `pyarrow` (optional, for Parquet support)
 - `duckdb`
 - `sqlite3`
 - `loguru`

pyprophet/io/dispatcher.py

Lines changed: 12 additions & 68 deletions

@@ -16,52 +16,28 @@

 from loguru import logger

+from .util import (
+    _get_parquet_reader_class_for_config,
+    _get_parquet_writer_class_for_config,
+)
 from .._config import ExportIOConfig, IPFIOConfig, LevelContextIOConfig, RunnerIOConfig

 # Export I/O
 from .export.osw import OSWReader as ExportOSWReader
 from .export.osw import OSWWriter as ExportOSWWriter
 from .export.sqmass import SqMassWriter as ExportSqMassWriter
-from .export.parquet import (
-    ParquetReader as ExportParquetReader,
-)
-from .export.parquet import (
-    ParquetWriter as ExportParquetWriter,
-)
-from .export.split_parquet import (
-    SplitParquetReader as ExportSplitParquetReader,
-)
-from .export.split_parquet import (
-    SplitParquetWriter as ExportSplitParquetWriter,
-)

 # IPF I/O
 from .ipf.osw import OSWReader as IPFOSWReader
 from .ipf.osw import OSWWriter as IPFOSWWriter
-from .ipf.parquet import ParquetReader as IPFParquetReader
-from .ipf.parquet import ParquetWriter as IPFParquetWriter
-from .ipf.split_parquet import SplitParquetReader as IPFSplitParquetReader
-from .ipf.split_parquet import SplitParquetWriter as IPFSplitParquetWriter

 # Levels Context I/O
 from .levels_context.osw import OSWReader as LevelContextOSWReader
 from .levels_context.osw import OSWWriter as LevelContextOSWWriter
-from .levels_context.parquet import ParquetReader as LevelContextParquetReader
-from .levels_context.parquet import ParquetWriter as LevelContextParquetWriter
-from .levels_context.split_parquet import (
-    SplitParquetReader as LevelContextSplitParquetReader,
-)
-from .levels_context.split_parquet import (
-    SplitParquetWriter as LevelContextSplitParquetWriter,
-)

 # Scoring I/O
 from .scoring.osw import OSWReader as ScoringOSWReader
 from .scoring.osw import OSWWriter as ScoringOSWWriter
-from .scoring.parquet import ParquetReader as ParquetScoringReader
-from .scoring.parquet import ParquetWriter as ParquetScoringWriter
-from .scoring.split_parquet import SplitParquetReader as SplitParquetScoringReader
-from .scoring.split_parquet import SplitParquetWriter as SplitParquetScoringWriter
 from .scoring.tsv import TSVReader as ScoringTSVReader
 from .scoring.tsv import TSVWriter as ScoringTSVWriter

@@ -123,29 +99,13 @@ def _get_osw_reader(config):

     @staticmethod
     def _get_parquet_reader(config):
-        if isinstance(config, RunnerIOConfig):
-            return ParquetScoringReader(config)
-        elif isinstance(config, IPFIOConfig):
-            return IPFParquetReader(config)
-        elif isinstance(config, LevelContextIOConfig):
-            return LevelContextParquetReader(config)
-        elif isinstance(config, ExportIOConfig):
-            return ExportParquetReader(config)
-        else:
-            raise ValueError(f"Unsupported config context: {type(config).__name__}")
+        cls = _get_parquet_reader_class_for_config(config, split=False)
+        return cls(config)

     @staticmethod
     def _get_split_parquet_reader(config):
-        if isinstance(config, RunnerIOConfig):
-            return SplitParquetScoringReader(config)
-        elif isinstance(config, IPFIOConfig):
-            return IPFSplitParquetReader(config)
-        elif isinstance(config, LevelContextIOConfig):
-            return LevelContextSplitParquetReader(config)
-        elif isinstance(config, ExportIOConfig):
-            return ExportSplitParquetReader(config)
-        else:
-            raise ValueError(f"Unsupported config context: {type(config).__name__}")
+        cls = _get_parquet_reader_class_for_config(config, split=True)
+        return cls(config)

     @staticmethod
     def _get_tsv_reader(config):
@@ -223,29 +183,13 @@ def _get_sqmass_writer(config):

     @staticmethod
     def _get_parquet_writer(config):
-        if isinstance(config, RunnerIOConfig):
-            return ParquetScoringWriter(config)
-        elif isinstance(config, IPFIOConfig):
-            return IPFParquetWriter(config)
-        elif isinstance(config, LevelContextIOConfig):
-            return LevelContextParquetWriter(config)
-        elif isinstance(config, ExportIOConfig):
-            return ExportParquetWriter(config)
-        else:
-            raise ValueError(f"Unsupported config context: {type(config).__name__}")
+        cls = _get_parquet_writer_class_for_config(config, split=False)
+        return cls(config)

     @staticmethod
     def _get_split_parquet_writer(config):
-        if isinstance(config, RunnerIOConfig):
-            return SplitParquetScoringWriter(config)
-        elif isinstance(config, IPFIOConfig):
-            return IPFSplitParquetWriter(config)
-        elif isinstance(config, LevelContextIOConfig):
-            return LevelContextSplitParquetWriter(config)
-        elif isinstance(config, ExportIOConfig):
-            return ExportSplitParquetWriter(config)
-        else:
-            raise ValueError(f"Unsupported config context: {type(config).__name__}")
+        cls = _get_parquet_writer_class_for_config(config, split=True)
+        return cls(config)

     @staticmethod
     def _get_tsv_writer(config):
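
The dispatcher no longer imports the Parquet reader/writer classes eagerly; it asks `_get_parquet_reader_class_for_config` / `_get_parquet_writer_class_for_config` from `pyprophet/io/util.py` for the right class. That module's implementation is not part of this diff, so the following is only a plausible sketch: it reproduces the config-type-to-class mapping that the removed if/elif chains encoded, but resolves the modules lazily so pyarrow is only needed when a Parquet code path is actually taken.

import importlib

from .._config import (
    ExportIOConfig,
    IPFIOConfig,
    LevelContextIOConfig,
    RunnerIOConfig,
)

# Config type -> subpackage of pyprophet.io holding its Parquet I/O classes.
_CONTEXT_PACKAGES = {
    RunnerIOConfig: "scoring",
    IPFIOConfig: "ipf",
    LevelContextIOConfig: "levels_context",
    ExportIOConfig: "export",
}


def _get_parquet_reader_class_for_config(config, split=False):
    """Sketch only: return the reader class matching the config's context."""
    class_name = "SplitParquetReader" if split else "ParquetReader"
    module_name = "split_parquet" if split else "parquet"
    for config_type, package in _CONTEXT_PACKAGES.items():
        if isinstance(config, config_type):
            # Lazy import: pyarrow is only required once a Parquet path is used.
            module = importlib.import_module(
                f".{package}.{module_name}", package=__package__
            )
            return getattr(module, class_name)
    raise ValueError(f"Unsupported config context: {type(config).__name__}")

The writer-side helper would presumably mirror this with `ParquetWriter` / `SplitParquetWriter` class names.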

pyprophet/io/ipf/parquet.py

Lines changed: 9 additions & 6 deletions

@@ -1,14 +1,17 @@
 import os
-from typing import Literal
 from shutil import copyfile
-import pandas as pd
-import pyarrow as pa
-import duckdb
+from typing import Literal
+
 import click
+import duckdb
+import pandas as pd
 from loguru import logger
-from ..util import get_parquet_column_names
-from .._base import BaseParquetReader, BaseParquetWriter
+
 from ..._config import IPFIOConfig
+from .._base import BaseParquetReader, BaseParquetWriter
+from ..util import _ensure_pyarrow, get_parquet_column_names
+
+pa, _, _ = _ensure_pyarrow()


 class ParquetReader(BaseParquetReader):
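
Instead of a top-level `import pyarrow as pa`, each Parquet module now obtains the module from `_ensure_pyarrow()` in `pyprophet/io/util.py`. Its body is not shown in this diff; the sketch below is one plausible shape, inferred only from the call site `pa, _, _ = _ensure_pyarrow()` (three return values, of which only the first, the `pyarrow` module, is used here). The other two values and the error text are assumptions.

def _ensure_pyarrow():
    """Sketch only: import pyarrow lazily, failing with an actionable message."""
    try:
        import pyarrow as pa
        import pyarrow.dataset as ds
        import pyarrow.parquet as pq
    except ImportError as exc:
        raise ImportError(
            "pyarrow is required for Parquet input/output; "
            "install it with: pip install 'pyprophet[parquet]'"
        ) from exc
    # Callers unpack three values; (pyarrow, pyarrow.parquet, pyarrow.dataset)
    # is assumed here.
    return pa, pq, ds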

pyprophet/io/ipf/split_parquet.py

Lines changed: 8 additions & 7 deletions

@@ -1,16 +1,17 @@
-import os
 import glob
-from shutil import copyfile
+import os
 from typing import Literal
-import pandas as pd
-import pyarrow as pa
-import duckdb
+
 import click
+import duckdb
+import pandas as pd
 from loguru import logger

-from ..util import get_parquet_column_names
-from .._base import BaseSplitParquetReader, BaseSplitParquetWriter
 from ..._config import IPFIOConfig
+from .._base import BaseSplitParquetReader, BaseSplitParquetWriter
+from ..util import _ensure_pyarrow, get_parquet_column_names
+
+pa, _, _ = _ensure_pyarrow()


 class SplitParquetReader(BaseSplitParquetReader):

pyprophet/io/levels_context/parquet.py

Lines changed: 8 additions & 7 deletions

@@ -1,14 +1,15 @@
-import os
-from typing import Literal
 from shutil import copyfile
-import pandas as pd
-import pyarrow as pa
-import duckdb
+
 import click
+import duckdb
+import pandas as pd
 from loguru import logger
-from ..util import get_parquet_column_names
-from .._base import BaseParquetReader, BaseParquetWriter
+
 from ..._config import LevelContextIOConfig
+from .._base import BaseParquetReader, BaseParquetWriter
+from ..util import _ensure_pyarrow, get_parquet_column_names
+
+pa, _, _ = _ensure_pyarrow()


 class ParquetReader(BaseParquetReader):

pyprophet/io/levels_context/split_parquet.py

Lines changed: 8 additions & 10 deletions

@@ -1,18 +1,16 @@
-import os
 import glob
-from shutil import copyfile
-from typing import Literal
-import pandas as pd
-import pyarrow as pa
-import duckdb
+import os
+
 import click
+import duckdb
+import pandas as pd
 from loguru import logger

-from .._base import BaseSplitParquetReader, BaseSplitParquetWriter
 from ..._config import LevelContextIOConfig
-from ..util import (
-    get_parquet_column_names,
-)
+from .._base import BaseSplitParquetReader, BaseSplitParquetWriter
+from ..util import _ensure_pyarrow, get_parquet_column_names
+
+pa, _, _ = _ensure_pyarrow()


 class SplitParquetReader(BaseSplitParquetReader):

pyprophet/io/scoring/parquet.py

Lines changed: 8 additions & 5 deletions

@@ -1,14 +1,17 @@
 import sys
 from shutil import copyfile
+
+import click
+import duckdb
 import pandas as pd
 import polars as pl
-import pyarrow as pa
-import duckdb
-import click
 from loguru import logger
-from ..util import get_parquet_column_names
-from .._base import BaseParquetReader, BaseParquetWriter, RowCountMismatchError
+
 from ..._config import RunnerIOConfig
+from .._base import BaseParquetReader, BaseParquetWriter, RowCountMismatchError
+from ..util import _ensure_pyarrow, get_parquet_column_names
+
+pa, _, _ = _ensure_pyarrow()


 class ParquetReader(BaseParquetReader):

pyprophet/io/scoring/split_parquet.py

Lines changed: 7 additions & 8 deletions

@@ -1,16 +1,15 @@
 import os
-import sys
-import glob
-from shutil import copyfile
-import pandas as pd
-import pyarrow as pa
-import duckdb
+
 import click
+import duckdb
+import pandas as pd
 from loguru import logger

-from ..util import get_parquet_column_names
-from .._base import BaseSplitParquetReader, BaseSplitParquetWriter
 from ..._config import RunnerIOConfig
+from .._base import BaseSplitParquetReader, BaseSplitParquetWriter
+from ..util import _ensure_pyarrow, get_parquet_column_names
+
+pa, _, _ = _ensure_pyarrow()


 class SplitParquetReader(BaseSplitParquetReader):
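
The same `pa, _, _ = _ensure_pyarrow()` pattern is applied across the ipf, levels_context and scoring Parquet modules, so the Parquet backends fail fast at import time when pyarrow is absent while the OSW/TSV/sqMass paths stay importable. An illustrative caller-side view (the exact error text depends on the real `_ensure_pyarrow()` implementation, which is not in this diff):

try:
    # Importing a Parquet I/O module triggers the module-level
    # _ensure_pyarrow() call and fails immediately without pyarrow installed.
    from pyprophet.io.scoring import parquet as scoring_parquet
except ImportError as exc:
    print(f"Parquet backend unavailable: {exc}")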
