Skip to content

Commit 13c30f6

Browse files
feat(sdk): Add creation of indexes and indexing of messages (#5505)
Integrate matrix-sdk-search into matrix-sdk. When a room is joined, a corresponding index is created. When a message is received via sync or via a back-pagination, it is added to the corresponding room's index. Signed-off-by: Shrey Patel [email protected]
1 parent 0e70a2f commit 13c30f6

File tree

15 files changed

+459
-164
lines changed

15 files changed

+459
-164
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/matrix-sdk-search/src/error.rs

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,6 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15-
//! The event cache is an abstraction layer, sitting between the Rust SDK and a
16-
//! final client, that acts as a global observer of all the rooms, gathering and
17-
//! inferring some extra useful information about each room. In particular, this
18-
//! doesn't require subscribing to a specific room to get access to this
19-
//! information.
20-
//!
21-
//! It's intended to be fast, robust and easy to maintain, having learned from
22-
//! previous endeavours at implementing middle to high level features elsewhere
23-
//! in the SDK, notably in the UI's Timeline object.
24-
//!
25-
//! See the [github issue](https://github.com/matrix-org/matrix-rust-sdk/issues/3058) for more
26-
//! details about the historical reasons that led us to start writing this.
27-
2815
use tantivy::{
2916
directory::error::OpenDirectoryError as TantivyOpenDirectoryError,
3017
query::QueryParserError as TantivyQueryParserError,

crates/matrix-sdk-search/src/index.rs

Lines changed: 39 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -12,22 +12,9 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15-
//! The event cache is an abstraction layer, sitting between the Rust SDK and a
16-
//! final client, that acts as a global observer of all the rooms, gathering and
17-
//! inferring some extra useful information about each room. In particular, this
18-
//! doesn't require subscribing to a specific room to get access to this
19-
//! information.
20-
//!
21-
//! It's intended to be fast, robust and easy to maintain, having learned from
22-
//! previous endeavours at implementing middle to high level features elsewhere
23-
//! in the SDK, notably in the UI's Timeline object.
24-
//!
25-
//! See the [github issue](https://github.com/matrix-org/matrix-rust-sdk/issues/3058) for more
26-
//! details about the historical reasons that led us to start writing this.
27-
2815
use std::{fmt, fs, path::Path, sync::Arc};
2916

30-
use ruma::{OwnedEventId, OwnedRoomId, RoomId, events::AnyMessageLikeEvent};
17+
use ruma::{OwnedEventId, OwnedRoomId, RoomId, events::AnySyncMessageLikeEvent};
3118
use tantivy::{
3219
Index, IndexReader, TantivyDocument,
3320
collector::TopDocs,
@@ -44,6 +31,11 @@ use crate::{
4431
writer::SearchIndexWriter,
4532
};
4633

34+
/// An enum representing the operations that can be applied to a [`RoomIndex`].
35+
pub(crate) enum RoomIndexOperation {
36+
Add(TantivyDocument),
37+
}
38+
4739
/// A struct that holds all data pertaining to a particular room's
4840
/// message index.
4941
pub struct RoomIndex {
@@ -91,9 +83,9 @@ impl RoomIndex {
9183
RoomIndex::new_with(index, schema, room_id)
9284
}
9385

94-
/// Create new [`RoomIndex`] which stores the index in RAM.
86+
/// Create new [`RoomIndex`] which stores the index in memory.
9587
/// Intended for testing.
96-
pub fn new_in_ram(room_id: &RoomId) -> Result<RoomIndex, IndexError> {
88+
pub fn new_in_memory(room_id: &RoomId) -> Result<RoomIndex, IndexError> {
9789
let schema = RoomMessageSchema::new();
9890
let index = Index::create_in_ram(schema.as_tantivy_schema());
9991
RoomIndex::new_with(index, schema, room_id)
@@ -130,10 +122,14 @@ impl RoomIndex {
130122
RoomIndex::new_with(index, schema, room_id)
131123
}
132124

133-
/// Add [`AnyMessageLikeEvent`] to [`RoomIndex`]
134-
pub fn add_event(&mut self, event: AnyMessageLikeEvent) -> Result<(), IndexError> {
135-
let doc = self.schema.make_doc(event)?;
136-
self.writer.add_document(doc)?; // TODO: This is blocking. Handle it.
125+
/// Handle [`AnySyncMessageLikeEvent`]
126+
///
127+
/// This will add/remove/edit an event in the index based on the
128+
/// event type.
129+
pub fn handle_event(&mut self, event: AnySyncMessageLikeEvent) -> Result<(), IndexError> {
130+
match self.schema.handle_event(event)? {
131+
RoomIndexOperation::Add(document) => self.writer.add_document(document)?,
132+
};
137133
Ok(())
138134
}
139135

@@ -193,74 +189,77 @@ mod tests {
193189
use std::{collections::HashSet, error::Error};
194190

195191
use matrix_sdk_test::event_factory::EventFactory;
196-
use ruma::{event_id, owned_event_id, room_id, user_id};
192+
use ruma::{event_id, room_id, user_id};
197193

198194
use crate::index::RoomIndex;
199195

200196
#[test]
201-
fn test_make_index_in_ram() {
197+
fn test_make_index_in_memory() {
202198
let room_id = room_id!("!room_id:localhost");
203-
let index = RoomIndex::new_in_ram(room_id);
199+
let index = RoomIndex::new_in_memory(room_id);
204200

205201
index.expect("failed to make index in ram: {index:?}");
206202
}
207203

208204
#[test]
209-
fn test_add_event() {
205+
fn test_handle_event() {
210206
let room_id = room_id!("!room_id:localhost");
211207
let mut index =
212-
RoomIndex::new_in_ram(room_id).expect("failed to make index in ram: {index:?}");
208+
RoomIndex::new_in_memory(room_id).expect("failed to make index in ram: {index:?}");
213209

214210
let event = EventFactory::new()
215211
.text_msg("event message")
216212
.event_id(event_id!("$event_id:localhost"))
217213
.room(room_id)
218214
.sender(user_id!("@user_id:localhost"))
219-
.into_any_message_like_event();
215+
.into_any_sync_message_like_event();
220216

221-
index.add_event(event).expect("failed to add event: {res:?}");
217+
index.handle_event(event).expect("failed to add event: {res:?}");
222218
}
223219

224220
#[test]
225221
fn test_search_populated_index() -> Result<(), Box<dyn Error>> {
226222
let room_id = room_id!("!room_id:localhost");
227223
let mut index =
228-
RoomIndex::new_in_ram(room_id).expect("failed to make index in ram: {index:?}");
224+
RoomIndex::new_in_memory(room_id).expect("failed to make index in ram: {index:?}");
225+
226+
let event_id_1 = event_id!("$event_id_1:localhost");
227+
let event_id_2 = event_id!("$event_id_2:localhost");
228+
let event_id_3 = event_id!("$event_id_3:localhost");
229229

230-
index.add_event(
230+
index.handle_event(
231231
EventFactory::new()
232232
.text_msg("This is a sentence")
233-
.event_id(event_id!("$event_id_1:localhost"))
233+
.event_id(event_id_1)
234234
.room(room_id)
235235
.sender(user_id!("@user_id:localhost"))
236-
.into_any_message_like_event(),
236+
.into_any_sync_message_like_event(),
237237
)?;
238238

239-
index.add_event(
239+
index.handle_event(
240240
EventFactory::new()
241241
.text_msg("All new words")
242-
.event_id(event_id!("$event_id_2:localhost"))
242+
.event_id(event_id_2)
243243
.room(room_id)
244244
.sender(user_id!("@user_id:localhost"))
245-
.into_any_message_like_event(),
245+
.into_any_sync_message_like_event(),
246246
)?;
247247

248-
index.add_event(
248+
index.handle_event(
249249
EventFactory::new()
250250
.text_msg("A similar sentence")
251-
.event_id(event_id!("$event_id_3:localhost"))
251+
.event_id(event_id_3)
252252
.room(room_id)
253253
.sender(user_id!("@user_id:localhost"))
254-
.into_any_message_like_event(),
254+
.into_any_sync_message_like_event(),
255255
)?;
256256

257257
index.commit_and_reload()?;
258258

259259
let result = index.search("sentence", 10).expect("search failed with: {result:?}");
260260
let result: HashSet<_> = result.iter().collect();
261261

262-
let true_value =
263-
[owned_event_id!("$event_id_1:localhost"), owned_event_id!("$event_id_3:localhost")];
262+
let true_value = [event_id_1.to_owned(), event_id_3.to_owned()];
264263
let true_value: HashSet<_> = true_value.iter().collect();
265264

266265
assert_eq!(result, true_value, "search result not correct: {result:?}");
@@ -272,7 +271,7 @@ mod tests {
272271
fn test_search_empty_index() -> Result<(), Box<dyn Error>> {
273272
let room_id = room_id!("!room_id:localhost");
274273
let mut index =
275-
RoomIndex::new_in_ram(room_id).expect("failed to make index in ram: {index:?}");
274+
RoomIndex::new_in_memory(room_id).expect("failed to make index in ram: {index:?}");
276275

277276
index.commit_and_reload()?;
278277

crates/matrix-sdk-search/src/schema.rs

Lines changed: 47 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -12,39 +12,29 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15-
//! The event cache is an abstraction layer, sitting between the Rust SDK and a
16-
//! final client, that acts as a global observer of all the rooms, gathering and
17-
//! inferring some extra useful information about each room. In particular, this
18-
//! doesn't require subscribing to a specific room to get access to this
19-
//! information.
20-
//!
21-
//! It's intended to be fast, robust and easy to maintain, having learned from
22-
//! previous endeavours at implementing middle to high level features elsewhere
23-
//! in the SDK, notably in the UI's Timeline object.
24-
//!
25-
//! See the [github issue](https://github.com/matrix-org/matrix-rust-sdk/issues/3058) for more
26-
//! details about the historical reasons that led us to start writing this.
27-
28-
use ruma::{
29-
MilliSecondsSinceUnixEpoch, OwnedEventId, OwnedUserId,
30-
events::{
31-
AnyMessageLikeEvent, MessageLikeEvent, MessageLikeEventContent, RedactContent,
32-
RedactedMessageLikeEventContent, room::message::MessageType,
33-
},
15+
use ruma::events::{
16+
AnySyncMessageLikeEvent, MessageLikeEventContent, RedactContent,
17+
RedactedMessageLikeEventContent, SyncMessageLikeEvent, room::message::MessageType,
3418
};
3519
use tantivy::{
3620
DateTime, TantivyDocument, doc,
3721
schema::{DateOptions, DateTimePrecision, Field, INDEXED, STORED, STRING, Schema, TEXT},
3822
};
3923

40-
use crate::error::{IndexError, IndexSchemaError};
24+
use crate::{
25+
error::{IndexError, IndexSchemaError},
26+
index::RoomIndexOperation,
27+
};
4128

4229
pub(crate) trait MatrixSearchIndexSchema {
4330
fn new() -> Self;
4431
fn default_search_fields(&self) -> Vec<Field>;
4532
fn primary_key(&self) -> Field;
4633
fn as_tantivy_schema(&self) -> Schema;
47-
fn make_doc(&self, event: AnyMessageLikeEvent) -> Result<TantivyDocument, IndexError>;
34+
fn handle_event(
35+
&self,
36+
event: AnySyncMessageLikeEvent,
37+
) -> Result<RoomIndexOperation, IndexError>;
4838
}
4939

5040
#[derive(Debug, Clone)]
@@ -58,48 +48,31 @@ pub(crate) struct RoomMessageSchema {
5848
}
5949

6050
impl RoomMessageSchema {
61-
fn parse_event<C: MessageLikeEventContent + RedactContent, F>(
51+
/// Given a [`SyncMessageLikeEvent`] and a function to convert the
52+
/// content into a String to be indexed, return a [`TantivyDocument`] to
53+
/// index.
54+
fn make_doc<C: MessageLikeEventContent + RedactContent, F>(
6255
&self,
63-
event: MessageLikeEvent<C>,
64-
get_body: F,
65-
) -> Result<(OwnedEventId, String, MilliSecondsSinceUnixEpoch, OwnedUserId), IndexError>
56+
event: SyncMessageLikeEvent<C>,
57+
get_body_from_content: F,
58+
) -> Result<TantivyDocument, IndexError>
6659
where
6760
<C as RedactContent>::Redacted: RedactedMessageLikeEventContent,
6861
F: FnOnce(&C) -> Result<String, IndexError>,
6962
{
7063
let unredacted = event.as_original().ok_or(IndexError::CannotIndexRedactedMessage)?;
7164

72-
let body = get_body(&unredacted.content)?;
65+
let body = get_body_from_content(&unredacted.content)?;
7366

74-
Ok((
75-
unredacted.event_id.clone(),
76-
body,
77-
unredacted.origin_server_ts,
78-
unredacted.sender.clone(),
67+
Ok(doc!(
68+
self.event_id_field => unredacted.event_id.to_string(),
69+
self.body_field => body,
70+
self.date_field =>
71+
DateTime::from_timestamp_millis(
72+
unredacted.origin_server_ts.get().into()),
73+
self.sender_field => unredacted.sender.to_string(),
7974
))
8075
}
81-
82-
fn parse_any_event(
83-
&self,
84-
event: AnyMessageLikeEvent,
85-
) -> Result<(OwnedEventId, String, MilliSecondsSinceUnixEpoch, OwnedUserId), IndexError> {
86-
match event {
87-
// old m.room.message behaviour
88-
AnyMessageLikeEvent::RoomMessage(event) => {
89-
self.parse_event(event, |content| match &content.msgtype {
90-
MessageType::Text(content) => Ok(content.body.clone()),
91-
_ => Err(IndexError::MessageTypeNotSupported),
92-
})
93-
}
94-
95-
// new m.message behaviour
96-
AnyMessageLikeEvent::Message(event) => self.parse_event(event, |content| {
97-
content.text.find_plain().ok_or(IndexError::EmptyMessage).map(|v| v.to_owned())
98-
}),
99-
100-
_ => Err(IndexError::MessageTypeNotSupported),
101-
}
102-
}
10376
}
10477

10578
impl MatrixSearchIndexSchema for RoomMessageSchema {
@@ -140,17 +113,28 @@ impl MatrixSearchIndexSchema for RoomMessageSchema {
140113
self.inner.clone()
141114
}
142115

143-
fn make_doc(&self, event: AnyMessageLikeEvent) -> Result<TantivyDocument, IndexError> {
144-
let (event_id, body, timestamp, sender) = self.parse_any_event(event)?;
116+
fn handle_event(
117+
&self,
118+
event: AnySyncMessageLikeEvent,
119+
) -> Result<RoomIndexOperation, IndexError> {
120+
match event {
121+
// m.room.message behaviour
122+
AnySyncMessageLikeEvent::RoomMessage(event) => self
123+
.make_doc(event, |content| match &content.msgtype {
124+
MessageType::Text(content) => Ok(content.body.clone()),
125+
_ => Err(IndexError::MessageTypeNotSupported),
126+
})
127+
.map(RoomIndexOperation::Add),
145128

146-
Ok(doc!(
147-
self.event_id_field => event_id.to_string(),
148-
self.body_field => body,
149-
self.date_field =>
150-
DateTime::from_timestamp_millis(
151-
timestamp.get().into()),
152-
self.sender_field => sender.to_string(),
153-
))
129+
// new MSC1767 (extensible events) m.message behaviour
130+
AnySyncMessageLikeEvent::Message(event) => self
131+
.make_doc(event, |content| {
132+
content.text.find_plain().ok_or(IndexError::EmptyMessage).map(|v| v.to_owned())
133+
})
134+
.map(RoomIndexOperation::Add),
135+
136+
_ => Err(IndexError::MessageTypeNotSupported),
137+
}
154138
}
155139
}
156140

crates/matrix-sdk-search/src/writer.rs

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,6 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15-
//! The event cache is an abstraction layer, sitting between the Rust SDK and a
16-
//! final client, that acts as a global observer of all the rooms, gathering and
17-
//! inferring some extra useful information about each room. In particular, this
18-
//! doesn't require subscribing to a specific room to get access to this
19-
//! information.
20-
//!
21-
//! It's intended to be fast, robust and easy to maintain, having learned from
22-
//! previous endeavours at implementing middle to high level features elsewhere
23-
//! in the SDK, notably in the UI's Timeline object.
24-
//!
25-
//! See the [github issue](https://github.com/matrix-org/matrix-rust-sdk/issues/3058) for more
26-
//! details about the historical reasons that led us to start writing this.
27-
2815
use tantivy::{IndexWriter, TantivyDocument, TantivyError};
2916

3017
use crate::{OpStamp, error::IndexError};

crates/matrix-sdk/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ docsrs = ["e2e-encryption", "sqlite", "indexeddb", "sso-login", "qrcode"]
7171
# Add support for inline media galleries via msgtypes
7272
unstable-msc4274 = ["ruma/unstable-msc4274", "matrix-sdk-base/unstable-msc4274"]
7373

74+
experimental-search = ["matrix-sdk-search"]
75+
7476
[dependencies]
7577
anyhow = { workspace = true, optional = true }
7678
anymap2 = "0.13.0"
@@ -99,6 +101,7 @@ matrix-sdk-base.workspace = true
99101
matrix-sdk-common.workspace = true
100102
matrix-sdk-ffi-macros = { workspace = true, optional = true }
101103
matrix-sdk-indexeddb = { workspace = true, optional = true }
104+
matrix-sdk-search = { workspace = true, optional = true }
102105
matrix-sdk-sqlite = { workspace = true, optional = true }
103106
matrix-sdk-test = { workspace = true, optional = true }
104107
mime.workspace = true

0 commit comments

Comments
 (0)