Skip to content

Commit 1a70dd0

Browse files
bovlb and Copilot authored
Support find similar search (#122)
* checkpoint * checkpoint * checkpoint * checkpoint * seems to be working * tidy * checkpoint * Basic import of tables * remove files no longer needed * Basic execute working * docs and CI * Fix JSON encoding * doc * Smaller batch size is safer, maybe slower * rename to sql-server * Use WF_AUTH_TOKEN * complete thought * complete thought * revert unintended change * trying to add blobs * fix _blob * make _image and _as_format work * refactor for simpler code, no pre-batch query * Add operations * Update apps/sql-server/fdw/fdw/__init__.py Co-authored-by: Copilot <[email protected]> * PR feedback * use dotenv * checkpoint * Fix merge issues * more cleanup * checkpoint * checkpoint * operations checking * Add some enums for videos * Add access control; fix some issue with descriptor tables * PR feedback * Cleaner pool and schema handling. * Fix suggested by Drew for envar issues * pinpoint fix * Update apps/sql-server/app/app.sh Co-authored-by: Copilot <[email protected]> * Add HTTP server; fix FDW log level * Fix swagger * response to PR feedback * Add check_properties * fix merge problem * Add FIND_SIMILAR * checkpoint * Be consistent about embeddings requirements * descriptor tables have specialized column sets; detecting embeddability * checkpoint * find similar working * checkpoint * Finish general rewrite; use personal branch of multicorn2 * fix comment * Hard-code distances * Better boolean handling * trivial change * More performance reporting * fix type hint * remove extraneous parameter * fix type hint * fix type hints * fix type hint * minor fixes * Remove temporary logging * Update apps/sql-server/fdw/fdw/descriptor.py Co-authored-by: Copilot <[email protected]> * Update apps/sql-server/fdw/fdw/common.py Co-authored-by: Copilot <[email protected]> * Update apps/sql-server/fdw/fdw/descriptor.py Co-authored-by: Copilot <[email protected]> * fix copy-paste error * Update apps/sql-server/fdw/fdw/common.py Co-authored-by: Copilot <[email protected]> * more 
timing * Add explain * tweak EXPLAIN * PR has been merged upstream * Ad docstring on Curry * Fix typo in field name * Enable extra field rejection for TableOptions and ColumnOptions and fix one case --------- Co-authored-by: Copilot <[email protected]>
1 parent dfa96e8 commit 1a70dd0

File tree

13 files changed

+1197
-398
lines changed

13 files changed

+1197
-398
lines changed

apps/crawl-to-rag/Dockerfile

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@ FROM aperturedata/workflows-base
88

99
ENV APP_NAME=workflows-crawl-to-rag
1010

11-
# Needed for text-embeddings
12-
RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
11+
# Install dependencies for embeddings
12+
RUN pip install --no-cache-dir -r /app/embeddings/requirements_cpu.txt
13+
RUN pip install --no-cache-dir -r /app/embeddings/requirements.txt
1314

1415
# copy in the app directories
1516
COPY --from=crawl-website /app /workflows/crawl-website
@@ -28,6 +29,7 @@ RUN pip install --no-cache-dir -r /requirements.txt
2829
COPY --from=rag /requirements.txt /requirements.txt
2930
RUN pip install --no-cache-dir -r /requirements.txt
3031

32+
3133
EXPOSE 8000
3234

3335
COPY app.sh /app/

apps/rag/Dockerfile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
88
COPY requirements.txt /
99
RUN pip install --no-cache-dir -r /requirements.txt
1010

11+
# Install dependencies for embeddings
12+
RUN pip install --no-cache-dir -r /app/embeddings/requirements_cpu.txt
13+
RUN pip install --no-cache-dir -r /app/embeddings/requirements.txt
14+
1115
# We prefer to cache models in the docker image rather than load them
1216
# at run time.
1317
COPY app/llm.py /app/llm.py

apps/sql-server/Dockerfile

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ ENV APP_NAME=workflows-sql-server
55
ENV POSTGRES_VERSION=17
66
ARG MULTICORN_VERSION=3.0
77

8-
# Add PGDG repository and install PostgreSQL 17
8+
# Add PGDG repository and install PostgreSQL
99
RUN apt-get update && apt-get install -y wget gnupg lsb-release \
1010
&& echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" \
1111
> /etc/apt/sources.list.d/pgdg.list \
@@ -23,28 +23,39 @@ RUN echo "listen_addresses = '*'" >> /etc/postgresql/${POSTGRES_VERSION}/main/po
2323
RUN echo "host all all 0.0.0.0/0 md5" >> /etc/postgresql/17/main/pg_hba.conf
2424

2525
# Postgres/Multicorn insists on using the system Python, so we need to disable the virtual environment
26+
# Store current VIRTUAL_ENV and PATH values
27+
ENV OLD_VIRTUAL_ENV="${VIRTUAL_ENV}"
28+
ENV OLD_PATH="${PATH}"
29+
ENV OLD_PYTHONPATH="${PYTHONPATH}"
30+
31+
# Disable virtual environment
2632
ENV VIRTUAL_ENV=
2733
ENV PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
34+
ENV PYTHONPATH=""
2835

2936
# Install multicorn2 Python module into system Python
3037
RUN /usr/bin/python3 -m pip install --no-cache-dir "git+https://github.com/pgsql-io/multicorn2.git"
3138

32-
# Build and install multicorn2 Postgres extension
33-
RUN git clone https://github.com/pgsql-io/multicorn2.git /multicorn2 \
39+
# Build and install multicorn2 Postgres extension
40+
RUN git clone --single-branch --branch main https://github.com/pgsql-io/multicorn2.git /multicorn2 \
3441
&& cd /multicorn2 \
3542
&& make PYTHON=/usr/bin/python3 \
3643
&& make install
3744

38-
RUN pip install --no-cache-dir aperturedb dotenv
45+
RUN pip install --no-cache-dir aperturedb dotenv numpy pydantic
46+
47+
# Install dependencies for embeddings
48+
RUN pip install --no-cache-dir -r /app/embeddings/requirements_cpu.txt
49+
RUN pip install --no-cache-dir -r /app/embeddings/requirements.txt
3950

4051
# Copy and install our FDW into system Python
4152
COPY fdw /fdw
4253
RUN cd /fdw && /usr/bin/python3 -m pip install .
4354

4455
# Restore virtual environment
45-
ENV VIRTUAL_ENV=/opt/venv
46-
ENV PATH="/opt/venv/bin:/opt/venv/lib/python3.10/site-packages:$PATH"
47-
ENV PYTHONPATH="/app:/opt/venv/lib/python3.10/site-packages"
56+
ENV VIRTUAL_ENV=${OLD_VIRTUAL_ENV}
57+
ENV PATH="${OLD_PATH}"
58+
ENV PYTHONPATH="/app:${OLD_PYTHONPATH}"
4859

4960
# Install application requirements
5061
COPY requirements.txt /requirements.txt

apps/sql-server/app/sql/functions.sql

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,4 +102,43 @@ $$ LANGUAGE SQL IMMUTABLE;
102102
CREATE OR REPLACE FUNCTION OPERATIONS(VARIADIC ops jsonb[])
103103
RETURNS jsonb AS $$
104104
SELECT jsonb_agg(op) FROM unnest($1) AS op
105-
$$ LANGUAGE SQL IMMUTABLE;
105+
$$ LANGUAGE SQL IMMUTABLE;
106+
107+
108+
-- Find similar
109+
110+
-- FIND_SIMILAR: validate a similarity-search request and package it as JSONB.
--
-- Parameters (all optional, passed by name or position):
--   text      - query text; one of the three mutually exclusive query modes
--   image     - query image bytes; one of the three mutually exclusive query modes
--   vector    - query vector as JSONB; one of the three mutually exclusive query modes
--   k         - number of neighbours requested; defaults to 10
--   knn_first - boolean flag copied into the result unchanged; defaults to TRUE
--
-- Returns: a JSONB object with the keys 'type' (always 'find_similar'), 'text',
--   'image', 'vector', 'k_neighbors' and 'knn_first'. The two query modes that
--   were not supplied appear in the object as JSON null.
--
-- Raises:
--   'FIND_SIMILAR requires exactly one of text, image, or vector'
--       when zero or more than one of text/image/vector is non-NULL
--   'k must be a positive integer'
--       when k is NULL or <= 0
--
-- NOTE(review): knn_first is not validated, so an explicit NULL is passed through.
-- NOTE(review): image is BYTEA; jsonb_build_object is expected to emit it as a JSON
--   string in the server's bytea text form -- confirm the consumer decodes that form.
CREATE OR REPLACE FUNCTION FIND_SIMILAR(
111+
text TEXT DEFAULT NULL,
112+
image BYTEA DEFAULT NULL,
113+
vector JSONB DEFAULT NULL,
114+
k INT DEFAULT 10,
115+
knn_first BOOLEAN DEFAULT TRUE
116+
) RETURNS JSONB AS $$
117+
DECLARE
118+
mode_count INT;
119+
BEGIN
120+
-- Count how many modes are specified
121+
mode_count := (CASE WHEN text IS NOT NULL THEN 1 ELSE 0 END) +
122+
(CASE WHEN image IS NOT NULL THEN 1 ELSE 0 END) +
123+
(CASE WHEN vector IS NOT NULL THEN 1 ELSE 0 END);
124+
125+
IF mode_count != 1 THEN
126+
RAISE EXCEPTION 'FIND_SIMILAR requires exactly one of text, image, or vector';
127+
END IF;
128+
129+
-- Reject a missing or non-positive neighbour count before building the result
IF k IS NULL OR k <= 0 THEN
130+
RAISE EXCEPTION 'k must be a positive integer';
131+
END IF;
132+
133+
-- Note that the parameter k is emitted under the key 'k_neighbors'
RETURN jsonb_build_object(
134+
'type', 'find_similar',
135+
'text', text,
136+
'image', image,
137+
'vector', vector,
138+
'k_neighbors', k,
139+
'knn_first', knn_first
140+
);
141+
END;
142+
$$ LANGUAGE plpgsql IMMUTABLE;
143+
144+
COMMENT ON FUNCTION FIND_SIMILAR IS 'Find similar items based on one of text, image, or vector.';

0 commit comments

Comments
 (0)