Skip to content

Commit cf46d41

Browse files
committed
build: use a single compose file with .env file
chore: add .env.debug configuration chore: add .env.debug feat: primitive parquet reader with page pruning add poetry build for libviewer add libviewer to rows refactor: only extract metadata and don't try to calculate offset index ci: update dockerfiles to include the rust toolchain and libviewer chore: pin python to 3.12.11 in libviewer and update lockfile feat: use PageIndexPolicy to optionally read offset index feat: support querying RowsIndex with page pruning build: add libviewer as a dependency to libcommon style: ruff format libcommon changes chore: use query_with_page_pruning from the rows endpoint chore: fix mypy errors style: import Sequence from collections.abc build: don't use libviewer as an editable dependency build: try to configure poetry to properly install libviewer ci: temporarily disable poetry cache style: fix ruff check errors build: relock projects depending on libcommon build: add rust toolchain to more dockerfiles build: copy the entire libviewer directory in dockerfiles because poetry install is called at the build phase build: turn libviewer into an optional dependency due to build difficulties chore: missing api stage from dockerfile ci: install libviewer extra in the libcommon build style: fix ruff check error in parquet utils ci: disable poetry cache feat: raise TooBigRows exceptions if the scan size would exceed a limit feat: implement binary truncation for page pruning reader style: ignore variable shadowing ruff check ci: install libviewer in the worker image feat: pass hf_token to the opendal store chore: remove files_to_index estimation chore: poetry lock worker service chore: remove redundant gitignore entries from libviewer ci: install libviewer in the worker build style: fix mypy ignore chore: cleanup the libviewer python code style: try to please mypy due to missing import style: make token optional test: make the mocking compatible with the page pruning reader in test_first_rows
1 parent 21599e4 commit cf46d41

File tree

35 files changed

+4877
-184
lines changed

35 files changed

+4877
-184
lines changed

.dockerignore

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ assets
1414
**/.pytest_cache
1515
**/.coverage
1616
**/coverage.xml
17-
18-
# rust build artifacts
19-
libs/libviewer/target
17+
**/target
18+
**/*.rs.bk
19+
**/.cargo/registry
20+
**/.cargo/git

.github/workflows/_unit-tests-python.yml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@ on:
88
working-directory:
99
required: true
1010
type: string
11+
poetry-args:
12+
required: false
13+
type: string
14+
default: ""
1115
env:
1216
# required to get access to use a cached poetry venv in "/home/runner/.cache/pypoetry/virtualenvs"
1317
POETRY_VIRTUALENVS_IN_PROJECT: false
@@ -31,16 +35,16 @@ jobs:
3135
uses: actions/setup-python@v5
3236
with:
3337
python-version: ${{ env.python-version }}
34-
cache: "poetry"
35-
cache-dependency-path: |
36-
${{ inputs.working-directory }}/poetry.lock
38+
# cache: "poetry"
39+
# cache-dependency-path: |
40+
# ${{ inputs.working-directory }}/poetry.lock
3741
- name: Install packages
3842
run: sudo apt update; sudo apt install -y ffmpeg libavcodec-extra libsndfile1
3943
- name: Install dependencies
4044
# "poetry env use" is required: https://github.com/actions/setup-python/issues/374#issuecomment-1088938718
4145
run: |
4246
poetry env use "${{ env.python-version }}"
43-
poetry install
47+
poetry install ${{ inputs.poetry-args }}
4448
- name: Launch mongo
4549
env:
4650
MONGO_PORT: ${{ env.mongo-port }}

.github/workflows/l-libcommon.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,5 @@ jobs:
2929
uses: ./.github/workflows/_unit-tests-python.yml
3030
with:
3131
working-directory: libs/libcommon
32+
poetry-args: "--with libviewer"
3233
secrets: inherit

.github/workflows/s-worker.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,5 @@ jobs:
3131
uses: ./.github/workflows/_unit-tests-python.yml
3232
with:
3333
working-directory: services/worker
34+
poetry-args: "--with libviewer"
3435
secrets: inherit

Dockerfile

Lines changed: 58 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,120 +1,140 @@
11
# Multi-stage Dockerfile for all dataset-viewer services and jobs
22
# Build with: docker build --target <service_name> -t <tag> .
33

4+
ARG PYTHON_VERSION=3.12.11
5+
FROM python:${PYTHON_VERSION}-slim AS viewer
6+
7+
# Install Rust and minimal build deps
8+
RUN apt-get update \
9+
&& apt-get install -y --no-install-recommends curl build-essential \
10+
&& rm -rf /var/lib/apt/lists/*
11+
12+
# Install Rust toolchain and maturin
13+
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y \
14+
&& . $HOME/.cargo/env \
15+
&& pip install maturin \
16+
&& rustc --version \
17+
&& cargo --version
18+
# Add cargo bin dir to PATH (so maturin + cargo available globally)
19+
ENV PATH="/root/.cargo/bin:${PATH}"
20+
21+
# Build libviewer
22+
COPY libs/libviewer /src/libs/libviewer
23+
WORKDIR /src/libs/libviewer
24+
RUN maturin build --release --strip --out /tmp/dist
25+
426
# Base stage with shared setup
5-
FROM python:3.12.11-slim AS common
27+
FROM python:${PYTHON_VERSION}-slim AS common
628

729
# System dependencies
830
RUN apt-get update \
931
&& apt-get install -y unzip wget procps htop ffmpeg libavcodec-extra libsndfile1 \
1032
&& rm -rf /var/lib/apt/lists/*
1133

1234
# Common environment variables
35+
ARG POETRY_VERSION=2.1.4
1336
ENV PYTHONFAULTHANDLER=1 \
1437
PYTHONUNBUFFERED=1 \
1538
PYTHONHASHSEED=random \
1639
PIP_NO_CACHE_DIR=1 \
1740
PIP_DISABLE_PIP_VERSION_CHECK=on \
1841
PIP_DEFAULT_TIMEOUT=100 \
1942
POETRY_NO_INTERACTION=1 \
20-
POETRY_VERSION=2.1.4 \
2143
POETRY_VIRTUALENVS_CREATE=false \
2244
PATH="$PATH:/root/.local/bin"
2345

2446
# Install pip and poetry
25-
RUN pip install -U pip && pip install "poetry==$POETRY_VERSION"
47+
RUN pip install -U pip && pip install "poetry==${POETRY_VERSION}"
2648

2749
# Install libcommon's dependencies but not libcommon itself
2850
COPY libs/libcommon/poetry.lock \
2951
libs/libcommon/pyproject.toml \
3052
/src/libs/libcommon/
31-
RUN poetry install --no-cache --no-root --no-directory -P /src/libs/libcommon
32-
33-
# Base image for services including libapi's dependencies
34-
FROM common AS service
35-
COPY libs/libapi/poetry.lock \
36-
libs/libapi/pyproject.toml \
37-
/src/libs/libapi/
38-
RUN poetry install --no-cache --no-root --no-directory -P /src/libs/libapi
53+
WORKDIR /src/libs/libcommon
54+
RUN poetry install --no-cache --no-root
3955

4056
# Below are the actual API services which depend on libapi and libcommon.
41-
# Since the majority of the dependencies are already installed in the `api`
42-
# we let poetry to actually install the `libs` and the specific service.
57+
# Since the majority of the dependencies are already installed in the
58+
# `common` stage, we let poetry handle the rest.
4359

4460
# API service
45-
FROM service AS api
61+
FROM common AS api
4662
COPY libs /src/libs
4763
COPY services/api /src/services/api
48-
RUN poetry install --no-cache -P /src/services/api
49-
WORKDIR /src/services/api/
64+
WORKDIR /src/services/api
65+
RUN poetry install --no-cache
5066
ENTRYPOINT ["poetry", "run", "python", "src/api/main.py"]
5167

5268
# Admin service
53-
FROM service AS admin
69+
FROM common AS admin
5470
COPY libs /src/libs
5571
COPY services/admin /src/services/admin
56-
RUN poetry install --no-cache -P /src/services/admin
57-
WORKDIR /src/services/admin/
72+
WORKDIR /src/services/admin
73+
RUN poetry install --no-cache
5874
ENTRYPOINT ["poetry", "run", "python", "src/admin/main.py"]
5975

6076
# Rows service
61-
FROM service AS rows
77+
FROM common AS rows
78+
COPY --from=viewer /tmp/dist /tmp/dist
79+
RUN pip install /tmp/dist/libviewer-*.whl
6280
COPY libs /src/libs
6381
COPY services/rows /src/services/rows
64-
RUN poetry install --no-cache -P /src/services/rows
65-
WORKDIR /src/services/rows/
82+
WORKDIR /src/services/rows
83+
RUN poetry install --no-cache
6684
ENTRYPOINT ["poetry", "run", "python", "src/rows/main.py"]
6785

6886
# Search service
69-
FROM service AS search
87+
FROM common AS search
7088
COPY libs /src/libs
7189
COPY services/search /src/services/search
72-
RUN poetry install --no-cache -P /src/services/search
73-
WORKDIR /src/services/search/
90+
WORKDIR /src/services/search
91+
RUN poetry install --no-cache
7492
ENTRYPOINT ["poetry", "run", "python", "src/search/main.py"]
7593

7694
# SSE API service
77-
FROM service AS sse-api
95+
FROM common AS sse-api
7896
COPY libs /src/libs
7997
COPY services/sse-api /src/services/sse-api
80-
RUN poetry install --no-cache -P /src/services/sse-api
81-
WORKDIR /src/services/sse-api/
98+
WORKDIR /src/services/sse-api
99+
RUN poetry install --no-cache
82100
ENTRYPOINT ["poetry", "run", "python", "src/sse_api/main.py"]
83101

84102
# Webhook service
85-
FROM service AS webhook
103+
FROM common AS webhook
86104
COPY libs /src/libs
87105
COPY services/webhook /src/services/webhook
88-
RUN poetry install --no-cache -P /src/services/webhook
89-
WORKDIR /src/services/webhook/
106+
WORKDIR /src/services/webhook
107+
RUN poetry install --no-cache
90108
ENTRYPOINT ["poetry", "run", "python", "src/webhook/main.py"]
91109

92110
# Worker service
93111
FROM common AS worker
112+
COPY --from=viewer /tmp/dist /tmp/dist
113+
RUN pip install /tmp/dist/libviewer-*.whl
94114
COPY libs /src/libs
95115
COPY services/worker /src/services/worker
116+
WORKDIR /src/services/worker
96117
# presidio-analyzer > spacy > thinc doesn't ship aarch64 wheels so need to compile
97118
RUN if [ "$(uname -m)" = "aarch64" ]; then \
98119
apt-get update && apt-get install -y build-essential && \
99120
rm -rf /var/lib/apt/lists/*; \
100121
fi
101-
RUN poetry install --no-cache -P /src/services/worker
122+
RUN poetry install --no-cache
102123
RUN python -m spacy download en_core_web_lg
103-
WORKDIR /src/services/worker/
104124
ENTRYPOINT ["poetry", "run", "python", "src/worker/main.py"]
105125

106126
# Cache maintenance job
107127
FROM common AS cache_maintenance
108128
COPY libs /src/libs
109129
COPY jobs/cache_maintenance /src/jobs/cache_maintenance
110-
RUN poetry install --no-cache -P /src/jobs/cache_maintenance
111-
WORKDIR /src/jobs/cache_maintenance/
130+
WORKDIR /src/jobs/cache_maintenance
131+
RUN poetry install --no-cache
112132
ENTRYPOINT ["poetry", "run", "python", "src/cache_maintenance/main.py"]
113133

114134
# MongoDB migration job
115135
FROM common AS mongodb_migration
116136
COPY libs /src/libs
117137
COPY jobs/mongodb_migration /src/jobs/mongodb_migration
118-
RUN poetry install --no-cache -P /src/jobs/mongodb_migration
119-
WORKDIR /src/jobs/mongodb_migration/
120-
ENTRYPOINT ["poetry", "run", "python", "src/mongodb_migration/main.py"]
138+
WORKDIR /src/jobs/mongodb_migration
139+
RUN poetry install --no-cache
140+
ENTRYPOINT ["poetry", "run", "python", "src/mongodb_migration/main.py"]

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ e2e:
4949

5050
.PHONY: install
5151
install:
52+
$(MAKE) -C libs/libviewer install
5253
$(MAKE) -C libs/libcommon install
5354
$(MAKE) -C libs/libapi install
5455
$(MAKE) -C jobs/cache_maintenance install

jobs/mongodb_migration/poetry.lock

Lines changed: 36 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)