diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 0d766bc..0000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,70 +0,0 @@ -version: 2 -jobs: - build: - docker: - - image: cimg/python:3.10.5 - - working_directory: ~/repo - - steps: - - checkout - - - restore_cache: - keys: - - v3-dependencies-{{ checksum "requirements-dev.txt" }} - - v3-dependencies- - - - run: - name: Install pandoc - command: | - sudo apt-get update - wget https://github.com/jgm/pandoc/releases/download/2.18/pandoc-2.18-1-amd64.deb - sudo dpkg -i pandoc-2.18-1-amd64.deb - - - run: - name: Install 7z, unrar - command: | - sudo apt-get install -y p7zip-full - - - run: - name: Install InkScape - command: | - sudo apt-get install -y inkscape - - - run: - name: Install graphviz - command: | - sudo apt-get install -y graphviz - - - run: - name: install dependencies (2) - command: | - pip install -r requirements-dev.txt - - - save_cache: - paths: - - ./venv - key: v3-dependencies-{{ checksum "requirements-dev.txt" }} - - - run: - name: compile and build - command: | - python setup.py build_ext --inplace - - - run: - name: run tests - command: | - python -m pytest - - - run: - name: wheel - command: | - python setup.py bdist_wheel - mkdir -p test-reports/dist - cp dist/*.whl test-reports/dist - mkdir -p test-reports/src - cp -r pandas_streaming test-reports/src - - - store_artifacts: - path: test-reports - destination: test-reports \ No newline at end of file diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index a7a5be1..344f1b0 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -77,7 +77,7 @@ jobs: grep ERROR doc.txt exit 1 fi - if [[ $(grep WARNING doc.txt) ]]; then + if [[ $(grep WARNING doc.txt | grep -v 'std:term:y') ]]; then echo "Documentation produces warnings." grep WARNING doc.txt exit 1 diff --git a/.github/workflows/rstcheck.yml b/.github/workflows/rstcheck.yml deleted file mode 100644 index 44e2a48..0000000 --- a/.github/workflows/rstcheck.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: RST Check - -on: [push, pull_request] - -jobs: - build_wheels: - name: rstcheck ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-latest] - - steps: - - uses: actions/checkout@v3 - - - uses: actions/setup-python@v4 - with: - python-version: '3.11' - - - name: Install requirements - run: python -m pip install -r requirements.txt - - - name: Install rstcheck - run: python -m pip install sphinx tomli rstcheck[toml,sphinx] - - - name: rstcheck - run: rstcheck -r _doc pandas_streaming diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 040a297..e25444d 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -24,7 +24,7 @@ jobs: - script: pip install -r requirements-dev.txt displayName: 'Install Requirements dev' - script: | - ruff . + ruff check . displayName: 'Ruff' - script: | black --diff . @@ -76,11 +76,8 @@ jobs: - script: pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn displayName: 'Install scikit-learn nightly' - script: | - ruff . + ruff check . displayName: 'Ruff' - - script: | - rstcheck -r ./_doc ./pandas_streaming - displayName: 'rstcheck' - script: | black --diff . displayName: 'Black' @@ -117,11 +114,8 @@ jobs: - script: pip install -r requirements-dev.txt displayName: 'Install Requirements dev' - script: | - ruff . + ruff check . displayName: 'Ruff' - - script: | - rstcheck -r ./_doc ./pandas_streaming - displayName: 'rstcheck' - script: | black --diff . displayName: 'Black' diff --git a/pandas_streaming/df/connex_split.py b/pandas_streaming/df/connex_split.py index 1df251f..ce9a3a2 100644 --- a/pandas_streaming/df/connex_split.py +++ b/pandas_streaming/df/connex_split.py @@ -2,7 +2,6 @@ from logging import getLogger import pandas import numpy -from sklearn.model_selection import train_test_split from .dataframe_helpers import dataframe_shuffle logger = getLogger("pandas-streaming") @@ -61,6 +60,8 @@ def train_test_split_weights( raise ValueError( f"test_size={test_size} or train_size={train_size} cannot be null (1)." ) + from sklearn.model_selection import train_test_split + return train_test_split( df, test_size=test_size, train_size=train_size, random_state=random_state ) diff --git a/pandas_streaming/df/dataframe.py b/pandas_streaming/df/dataframe.py index 1cc87a2..cc03ab3 100644 --- a/pandas_streaming/df/dataframe.py +++ b/pandas_streaming/df/dataframe.py @@ -640,10 +640,10 @@ def _reservoir_sampling( if len(indices) < n: indices.append((i, ir)) else: - x = nrandom.random() # pylint: disable=E1101 + x = nrandom.random() if x * n < (seen - n): k = nrandom.randint(0, len(indices) - 1) - indices[k] = (i, ir) # pylint: disable=E1126 + indices[k] = (i, ir) indices = set(indices) def reservoir_iterate(sdf, indices, chunksize): diff --git a/pandas_streaming/df/dataframe_helpers.py b/pandas_streaming/df/dataframe_helpers.py index b9e58c7..748b5ec 100644 --- a/pandas_streaming/df/dataframe_helpers.py +++ b/pandas_streaming/df/dataframe_helpers.py @@ -25,11 +25,9 @@ def numpy_types(): numpy.uint16, numpy.uint32, numpy.uint64, - numpy.float_, numpy.float16, numpy.float32, numpy.float64, - numpy.complex_, numpy.complex64, numpy.complex128, ] @@ -155,13 +153,13 @@ def hash_floatl(c): } # pylint: disable=R1721 for c in cols: t = coltype[c] - if t == int: + if t == int: # noqa: E721 df[c] = df[c].apply(hash_intl) elif t == numpy.int64: df[c] = df[c].apply(lambda x: numpy.int64(hash_intl(x))) - elif t == float: + elif t == float: # noqa: E721 df[c] = df[c].apply(hash_floatl) - elif t == object: + elif t == object: # noqa: E721 df[c] = df[c].apply(hash_strl) else: raise NotImplementedError( # pragma: no cover diff --git a/pandas_streaming/df/dataframe_io_helpers.py b/pandas_streaming/df/dataframe_io_helpers.py index 8c00ba2..5cf135e 100644 --- a/pandas_streaming/df/dataframe_io_helpers.py +++ b/pandas_streaming/df/dataframe_io_helpers.py @@ -5,7 +5,6 @@ from ujson import dumps except ImportError: # pragma: no cover from json import dumps -import ijson class JsonPerRowsStream: @@ -257,6 +256,8 @@ def enumerate_json_items( else: if hasattr(filename, "seek"): filename.seek(0) + import ijson + parser = ijson.parse(filename) current = None curkey = None diff --git a/pandas_streaming/df/dataframe_split.py b/pandas_streaming/df/dataframe_split.py index 7c2d191..0e068a3 100644 --- a/pandas_streaming/df/dataframe_split.py +++ b/pandas_streaming/df/dataframe_split.py @@ -45,7 +45,7 @@ def sklearn_train_test_split( ) with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=ImportWarning) - from sklearn.model_selection import train_test_split # pylint: disable=C0415 + from sklearn.model_selection import train_test_split opts = ["test_size", "train_size", "random_state", "shuffle", "stratify"] split_ops = {} diff --git a/pyproject.toml b/pyproject.toml index c1472ad..495bbb3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,17 +1,3 @@ -[tool.rstcheck] -report_level = "INFO" -ignore_directives = [ - "autoclass", - "autofunction", - "automodule", - "exreflist", - "gdot", - "image-sg", - "pr", - "runpython", -] -ignore_roles = ["epkg"] - [tool.ruff] # Exclude a variety of commonly ignored directories. @@ -25,11 +11,11 @@ exclude = [ # Same as Black. line-length = 88 -[tool.ruff.mccabe] +[tool.ruff.lint.mccabe] # Unlike Flake8, default to a complexity level of 10. max-complexity = 10 -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] "_doc/examples/plot_first_example.py" = ["E402", "F811"] "_unittests/ut_df/test_dataframe_io_helpers.py" = ["E501"] "pandas_streaming/data/__init__.py" = ["F401"] diff --git a/requirements-dev.txt b/requirements-dev.txt index 679ba5a..0e28c6c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -14,7 +14,6 @@ pycodestyle pylint>=2.14.0 pytest pytest-cov -rstcheck[sphinx,toml] ruff scikit-learn scipy