Skip to content

Commit 31baf09

Browse files
marashka and Marat Akhmetov authored
[DOP-23195] Improved full-text search (#255)
* [DOP-23195] update ts
* [DOP-23195] added comments to the vectors
* [DOP-23195] added changelog
* [DOP-23195] changed base docker image

---------

Co-authored-by: Marat Akhmetov <[email protected]>
1 parent f66637b commit 31baf09

File tree

16 files changed

+497
-38
lines changed

16 files changed

+497
-38
lines changed

docker/Dockerfile.scheduler

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
ARG PYTHON_VERSION=3.13
2-
FROM python:$PYTHON_VERSION-slim AS base
2+
FROM python:$PYTHON_VERSION-slim-bookworm AS base
33

44
WORKDIR /app
55
ENV PYTHONPATH=/app \

docker/Dockerfile.server

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
ARG PYTHON_VERSION=3.13
2-
FROM python:$PYTHON_VERSION-slim AS base
2+
FROM python:$PYTHON_VERSION-slim-bookworm AS base
33

44
RUN apt-get update \
55
&& apt-get install -y --no-install-recommends \

docker/Dockerfile.worker

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
ARG PYTHON_VERSION=3.13
2-
FROM python:$PYTHON_VERSION-slim AS base
2+
FROM python:$PYTHON_VERSION-slim-bookworm AS base
33

44
RUN apt-get update && apt-get install -y --no-install-recommends \
55
openjdk-17-jdk-headless \
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Improved full-text search by refining ``tsvector`` generation and adding better tokenization for technical fields such as hostnames, table names, and directory paths.
2+
-- by :github:user:`marashka`
Lines changed: 243 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
1+
# SPDX-FileCopyrightText: 2023-2024 MTS PJSC
2+
# SPDX-License-Identifier: Apache-2.0
3+
"""Update text search
4+
5+
Revision ID: 0012
6+
Revises: 0011
7+
Create Date: 2025-08-10 20:03:02.105470
8+
9+
"""
10+
11+
import sqlalchemy as sa
12+
from alembic import op
13+
from sqlalchemy.dialects import postgresql
14+
15+
# revision identifiers, used by Alembic.
16+
revision = "0012"
17+
down_revision = "0011"
18+
branch_labels = None
19+
depends_on = None
20+
21+
22+
# Characters that glue tokens together in technical values (hostnames, table
# names, directory paths): . / - _ : and backslash.
_TOKEN_SEPARATORS = "./-_:\\"


def _stemmed_sql(expr: str) -> str:
    """Return SQL indexing *expr* with the 'russian' (stemming) configuration."""
    return f"to_tsvector('russian', coalesce({expr}, ''))"


def _exact_sql(expr: str) -> str:
    """Return SQL indexing *expr* with the 'simple' (no stemming) configuration."""
    return f"to_tsvector('simple', coalesce({expr}, ''))"


def _split_sql(expr: str) -> str:
    """Return SQL indexing *expr* after turning every separator into a space.

    PostgreSQL ``translate(string, from, to)`` DELETES any character of
    ``from`` that has no counterpart in ``to``, so the replacement string is
    generated with exactly one space per separator. A shorter ``to`` would
    strip the separators and merge neighbouring tokens instead of splitting
    them.
    """
    blanks = " " * len(_TOKEN_SEPARATORS)
    return (
        "to_tsvector('simple', "
        f"translate(coalesce({expr}, ''), '{_TOKEN_SEPARATORS}', '{blanks}'))"
    )


def _name_sql_parts() -> list[str]:
    """Return the three tsvector expressions built from the ``name`` column."""
    return [_stemmed_sql("name"), _exact_sql("name"), _split_sql("name")]


def _add_search_vector(table_name: str, parts: list[str]) -> None:
    """Add a stored, generated, non-null ``search_vector`` column to *table_name*.

    *parts* are tsvector SQL expressions; they are concatenated with ``||``
    in the given order.
    """
    op.add_column(
        table_name,
        sa.Column(
            "search_vector",
            postgresql.TSVECTOR(),
            sa.Computed("\n|| ".join(parts), persisted=True),
            nullable=False,
        ),
    )


def upgrade() -> None:
    """Rebuild the ``search_vector`` columns of connection, group, queue and transfer.

    The existing columns (and the GIN indexes on connection and transfer) are
    dropped, then re-created as generated columns that combine:

    * the name, indexed with the 'russian' configuration (stemming);
    * the name, indexed with the 'simple' configuration (exact tokens);
    * the name with separator characters replaced by spaces (partial tokens);
    * for connection: the JSON ``host`` value, exact and split;
    * for transfer: ``table_name`` and ``directory_path`` of both
      ``source_params`` and ``target_params``, exact and split.
    """
    op.drop_index(op.f("idx_connection_search_vector"), table_name="connection", postgresql_using="gin")
    op.drop_column("connection", "search_vector")
    op.drop_column("group", "search_vector")
    op.drop_index(op.f("idx_transfer_search_vector"), table_name="transfer", postgresql_using="gin")
    op.drop_column("transfer", "search_vector")
    op.drop_column("queue", "search_vector")

    host = "data->>'host'"
    _add_search_vector(
        "connection",
        [*_name_sql_parts(), _exact_sql(host), _split_sql(host)],
    )
    op.create_index(
        "idx_connection_search_vector",
        "connection",
        ["search_vector"],
        unique=False,
        postgresql_using="gin",
    )

    _add_search_vector("group", _name_sql_parts())

    _add_search_vector("queue", _name_sql_parts())

    transfer_parts = _name_sql_parts()
    for key in ("table_name", "directory_path"):
        source = f"source_params->>'{key}'"
        target = f"target_params->>'{key}'"
        # Same order as before: exact source/target first, then split source/target.
        transfer_parts += [
            _exact_sql(source),
            _exact_sql(target),
            _split_sql(source),
            _split_sql(target),
        ]
    _add_search_vector("transfer", transfer_parts)
    op.create_index("idx_transfer_search_vector", "transfer", ["search_vector"], unique=False, postgresql_using="gin")
174+
175+
176+
def downgrade() -> None:
    """Restore the previous 'english'-based ``search_vector`` columns.

    Drops the columns (and the GIN indexes on transfer and connection) created
    by ``upgrade()``, then re-adds each column with its earlier generated
    expression and re-creates the two indexes.
    """

    def legacy_column(expression: str) -> sa.Column:
        # Every restored column shares the same shape; only the SQL differs.
        return sa.Column(
            "search_vector",
            postgresql.TSVECTOR(),
            sa.Computed(expression, persisted=True),
            autoincrement=False,
            nullable=False,
        )

    name_only_sql = "to_tsvector('english'::regconfig, (name)::text)"
    # NOTE(review): as written here, translate() maps './' onto a single ' ',
    # which replaces '.' and removes '/'. Confirm this matches the expression
    # created by the previous revision.
    transfer_sql = "to_tsvector('english'::regconfig, (((((((((((((((((((name)::text || ' '::text) || COALESCE(json_extract_path_text(source_params, VARIADIC ARRAY['table_name'::text]), ''::text)) || ' '::text) || COALESCE(json_extract_path_text(target_params, VARIADIC ARRAY['table_name'::text]), ''::text)) || ' '::text) || COALESCE(json_extract_path_text(source_params, VARIADIC ARRAY['directory_path'::text]), ''::text)) || ' '::text) || COALESCE(json_extract_path_text(target_params, VARIADIC ARRAY['directory_path'::text]), ''::text)) || ' '::text) || translate((name)::text, './'::text, ' '::text)) || ' '::text) || COALESCE(translate(json_extract_path_text(source_params, VARIADIC ARRAY['table_name'::text]), './'::text, ' '::text), ''::text)) || ' '::text) || COALESCE(translate(json_extract_path_text(target_params, VARIADIC ARRAY['table_name'::text]), './'::text, ' '::text), ''::text)) || ' '::text) || COALESCE(translate(json_extract_path_text(source_params, VARIADIC ARRAY['directory_path'::text]), './'::text, ' '::text), ''::text)) || ' '::text) || COALESCE(translate(json_extract_path_text(target_params, VARIADIC ARRAY['directory_path'::text]), './'::text, ' '::text), ''::text)))"
    connection_sql = "to_tsvector('english'::regconfig, (((((name)::text || ' '::text) || COALESCE(json_extract_path_text(data, VARIADIC ARRAY['host'::text]), ''::text)) || ' '::text) || COALESCE(translate(json_extract_path_text(data, VARIADIC ARRAY['host'::text]), '.'::text, ' '::text), ''::text)))"

    # Remove everything upgrade() created.
    op.drop_index("idx_transfer_search_vector", table_name="transfer", postgresql_using="gin")
    op.drop_column("transfer", "search_vector")
    op.drop_column("group", "search_vector")
    op.drop_index("idx_connection_search_vector", table_name="connection", postgresql_using="gin")
    op.drop_column("connection", "search_vector")
    op.drop_column("queue", "search_vector")

    # Re-create the earlier columns and indexes.
    op.add_column("transfer", legacy_column(transfer_sql))
    op.create_index(
        op.f("idx_transfer_search_vector"),
        "transfer",
        ["search_vector"],
        unique=False,
        postgresql_using="gin",
    )
    op.add_column("group", legacy_column(name_only_sql))
    op.add_column("connection", legacy_column(connection_sql))
    op.create_index(
        op.f("idx_connection_search_vector"),
        "connection",
        ["search_vector"],
        unique=False,
        postgresql_using="gin",
    )
    op.add_column("queue", legacy_column(name_only_sql))

syncmaster/db/models/connection.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,25 @@ class Connection(Base, ResourceMixin, TimestampMixin):
4141
TSVECTOR,
4242
Computed(
4343
"""
44-
to_tsvector(
45-
'english'::regconfig,
46-
name || ' ' ||
47-
COALESCE(json_extract_path_text(data, 'host'), '') || ' ' ||
48-
COALESCE(translate(json_extract_path_text(data, 'host'), '.', ' '), '')
44+
-- === NAME FIELD ===
45+
-- Russian stemming for better morphological matching of regular words
46+
to_tsvector('russian', coalesce(name, ''))
47+
-- Simple dictionary (no stemming) for exact token match
48+
|| to_tsvector('simple', coalesce(name, ''))
49+
-- Simple dictionary with translate(): split by . / - _ : \
50+
-- (used when 'name' contains technical fields)
51+
|| to_tsvector(
52+
'simple',
53+
translate(coalesce(name, ''), './-_:\\', ' ')
54+
)
55+
56+
-- === HOST FIELD (from JSON) ===
57+
-- Simple dictionary (no stemming) for exact match
58+
|| to_tsvector('simple', coalesce(data->>'host', ''))
59+
-- Simple dictionary with translate(): split by . / - _ : \\ for partial token matching
60+
|| to_tsvector(
61+
'simple',
62+
translate(coalesce(data->>'host', ''), './-_:\\', ' ')
4963
)
5064
""",
5165
persisted=True,

syncmaster/db/models/group.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,13 +77,26 @@ class Group(Base, TimestampMixin):
7777

7878
owner: Mapped[User] = relationship(User)
7979
queue: Mapped[Queue] = relationship(back_populates="group", cascade="all, delete-orphan")
80-
8180
search_vector: Mapped[str] = mapped_column(
8281
TSVECTOR,
83-
Computed("to_tsvector('english'::regconfig, name)", persisted=True),
82+
Computed(
83+
"""
84+
-- === NAME FIELD ===
85+
-- Russian stemming for better morphological matching of regular words
86+
to_tsvector('russian', coalesce(name, ''))
87+
-- Simple dictionary (no stemming) for exact token match
88+
|| to_tsvector('simple', coalesce(name, ''))
89+
-- Simple dictionary with translate(): split by . / - _ : \
90+
-- (used when 'name' contains technical fields)
91+
|| to_tsvector(
92+
'simple',
93+
translate(coalesce(name, ''), './-_:\\', ' ')
94+
)
95+
""",
96+
persisted=True,
97+
),
8498
nullable=False,
8599
deferred=True,
86-
doc="Full-text search vector",
87100
)
88101

89102
def __repr__(self) -> str:

syncmaster/db/models/queue.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,24 @@ class Queue(Base, ResourceMixin, TimestampMixin):
2525

2626
search_vector: Mapped[str] = mapped_column(
2727
TSVECTOR,
28-
Computed("to_tsvector('english'::regconfig, name)", persisted=True),
28+
Computed(
29+
"""
30+
-- === NAME FIELD ===
31+
-- Russian stemming for better morphological matching of regular words
32+
to_tsvector('russian', coalesce(name, ''))
33+
-- Simple dictionary (no stemming) for exact token match
34+
|| to_tsvector('simple', coalesce(name, ''))
35+
-- Simple dictionary with translate(): split by . / - _ : \
36+
-- (used when 'name' contains technical fields)
37+
|| to_tsvector(
38+
'simple',
39+
translate(coalesce(name, ''), './-_:\\', ' ')
40+
)
41+
""",
42+
persisted=True,
43+
),
2944
nullable=False,
3045
deferred=True,
31-
doc="Full-text search vector",
3246
)
3347

3448
def __repr__(self):

0 commit comments

Comments
 (0)