Skip to content
This repository was archived by the owner on Apr 4, 2023. It is now read-only.

Commit fc0e738

Browse files
committed
Fix hard-deletion of an external id that was soft-deleted
1 parent 97fb64e commit fc0e738

File tree

3 files changed

+116
-154
lines changed

3 files changed

+116
-154
lines changed

milli/src/external_documents_ids.rs

Lines changed: 0 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -47,30 +47,6 @@ impl<'a> ExternalDocumentsIds<'a> {
4747
}
4848
}
4949

50-
pub fn delete_ids<A: AsRef<[u8]>>(&mut self, other: fst::Set<A>) -> fst::Result<()> {
51-
let other = fst::Map::from(other.into_fst());
52-
let union_op = self.soft.op().add(&other).r#union();
53-
54-
let mut iter = union_op.into_stream();
55-
let mut new_soft_builder = fst::MapBuilder::memory();
56-
while let Some((external_id, docids)) = iter.next() {
57-
if docids.iter().any(|v| v.index == 1) {
58-
// If the `other` set returns a value here it means
59-
// that it must be marked as deleted.
60-
new_soft_builder.insert(external_id, DELETED_ID)?;
61-
} else {
62-
let value = docids.iter().find(|v| v.index == 0).unwrap().value;
63-
new_soft_builder.insert(external_id, value)?;
64-
}
65-
}
66-
67-
drop(iter);
68-
69-
// We save this new map as the new soft map.
70-
self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?;
71-
self.merge_soft_into_hard()
72-
}
73-
7450
/// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they
7551
/// don't contain any soft deleted document id.
7652
pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> {
@@ -173,76 +149,3 @@ impl Default for ExternalDocumentsIds<'static> {
173149
fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option<u64> {
174150
indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value)
175151
}
176-
177-
#[cfg(test)]
178-
mod tests {
179-
use super::*;
180-
181-
#[test]
182-
fn simple_insert_delete_ids() {
183-
let mut external_documents_ids = ExternalDocumentsIds::default();
184-
185-
let new_ids = fst::Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3), ("d", 4)]).unwrap();
186-
external_documents_ids.insert_ids(&new_ids).unwrap();
187-
188-
assert_eq!(external_documents_ids.get("a"), Some(1));
189-
assert_eq!(external_documents_ids.get("b"), Some(2));
190-
assert_eq!(external_documents_ids.get("c"), Some(3));
191-
assert_eq!(external_documents_ids.get("d"), Some(4));
192-
193-
let new_ids = fst::Map::from_iter(vec![("e", 5), ("f", 6), ("g", 7)]).unwrap();
194-
external_documents_ids.insert_ids(&new_ids).unwrap();
195-
196-
assert_eq!(external_documents_ids.get("a"), Some(1));
197-
assert_eq!(external_documents_ids.get("b"), Some(2));
198-
assert_eq!(external_documents_ids.get("c"), Some(3));
199-
assert_eq!(external_documents_ids.get("d"), Some(4));
200-
assert_eq!(external_documents_ids.get("e"), Some(5));
201-
assert_eq!(external_documents_ids.get("f"), Some(6));
202-
assert_eq!(external_documents_ids.get("g"), Some(7));
203-
204-
let del_ids = fst::Set::from_iter(vec!["a", "c", "f"]).unwrap();
205-
external_documents_ids.delete_ids(del_ids).unwrap();
206-
207-
assert_eq!(external_documents_ids.get("a"), None);
208-
assert_eq!(external_documents_ids.get("b"), Some(2));
209-
assert_eq!(external_documents_ids.get("c"), None);
210-
assert_eq!(external_documents_ids.get("d"), Some(4));
211-
assert_eq!(external_documents_ids.get("e"), Some(5));
212-
assert_eq!(external_documents_ids.get("f"), None);
213-
assert_eq!(external_documents_ids.get("g"), Some(7));
214-
215-
let new_ids = fst::Map::from_iter(vec![("a", 5), ("b", 6), ("h", 8)]).unwrap();
216-
external_documents_ids.insert_ids(&new_ids).unwrap();
217-
218-
assert_eq!(external_documents_ids.get("a"), Some(5));
219-
assert_eq!(external_documents_ids.get("b"), Some(6));
220-
assert_eq!(external_documents_ids.get("c"), None);
221-
assert_eq!(external_documents_ids.get("d"), Some(4));
222-
assert_eq!(external_documents_ids.get("e"), Some(5));
223-
assert_eq!(external_documents_ids.get("f"), None);
224-
assert_eq!(external_documents_ids.get("g"), Some(7));
225-
assert_eq!(external_documents_ids.get("h"), Some(8));
226-
}
227-
228-
#[test]
229-
fn strange_delete_insert_ids() {
230-
let mut external_documents_ids = ExternalDocumentsIds::default();
231-
232-
let new_ids =
233-
fst::Map::from_iter(vec![("1", 0), ("123", 1), ("30", 2), ("456", 3)]).unwrap();
234-
external_documents_ids.insert_ids(&new_ids).unwrap();
235-
assert_eq!(external_documents_ids.get("1"), Some(0));
236-
assert_eq!(external_documents_ids.get("123"), Some(1));
237-
assert_eq!(external_documents_ids.get("30"), Some(2));
238-
assert_eq!(external_documents_ids.get("456"), Some(3));
239-
240-
let deleted_ids = fst::Set::from_iter(vec!["30"]).unwrap();
241-
external_documents_ids.delete_ids(deleted_ids).unwrap();
242-
assert_eq!(external_documents_ids.get("30"), None);
243-
244-
let new_ids = fst::Map::from_iter(vec![("30", 2)]).unwrap();
245-
external_documents_ids.insert_ids(&new_ids).unwrap();
246-
assert_eq!(external_documents_ids.get("30"), Some(2));
247-
}
248-
}

milli/src/index.rs

Lines changed: 95 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1181,6 +1181,7 @@ impl Index {
11811181

11821182
#[cfg(test)]
11831183
pub(crate) mod tests {
1184+
use std::collections::HashSet;
11841185
use std::ops::Deref;
11851186

11861187
use big_s::S;
@@ -1195,7 +1196,7 @@ pub(crate) mod tests {
11951196
self, DeleteDocuments, DeletionStrategy, IndexDocuments, IndexDocumentsConfig,
11961197
IndexDocumentsMethod, IndexerConfig, Settings,
11971198
};
1198-
use crate::{db_snap, obkv_to_json, Index};
1199+
use crate::{db_snap, obkv_to_json, Index, Search, SearchResult};
11991200

12001201
pub(crate) struct TempIndex {
12011202
pub inner: Index,
@@ -2188,4 +2189,97 @@ pub(crate) mod tests {
21882189
"###);
21892190
db_snap!(index, soft_deleted_documents_ids, 2, @"[]");
21902191
}
2192+
2193+
#[test]
2194+
fn bug_3021_fourth() {
2195+
// https://github.com/meilisearch/meilisearch/issues/3021
2196+
let mut index = TempIndex::new();
2197+
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
2198+
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
2199+
2200+
index
2201+
.update_settings(|settings| {
2202+
settings.set_primary_key("primary_key".to_owned());
2203+
})
2204+
.unwrap();
2205+
2206+
index
2207+
.add_documents(documents!([
2208+
{ "primary_key": 11 },
2209+
{ "primary_key": 4 },
2210+
]))
2211+
.unwrap();
2212+
2213+
db_snap!(index, documents_ids, @"[0, 1, ]");
2214+
db_snap!(index, external_documents_ids, @r###"
2215+
soft:
2216+
hard:
2217+
11 0
2218+
4 1
2219+
"###);
2220+
db_snap!(index, soft_deleted_documents_ids, @"[]");
2221+
2222+
index
2223+
.add_documents(documents!([
2224+
{ "primary_key": 4, "a": 0 },
2225+
{ "primary_key": 1 },
2226+
]))
2227+
.unwrap();
2228+
2229+
db_snap!(index, documents_ids, @"[0, 2, 3, ]");
2230+
db_snap!(index, external_documents_ids, @r###"
2231+
soft:
2232+
hard:
2233+
1 3
2234+
11 0
2235+
4 2
2236+
"###);
2237+
db_snap!(index, soft_deleted_documents_ids, @"[1, ]");
2238+
2239+
let mut wtxn = index.write_txn().unwrap();
2240+
let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
2241+
delete.strategy(DeletionStrategy::AlwaysHard);
2242+
delete.execute().unwrap();
2243+
wtxn.commit().unwrap();
2244+
2245+
db_snap!(index, documents_ids, @"[0, 2, 3, ]");
2246+
db_snap!(index, external_documents_ids, @r###"
2247+
soft:
2248+
hard:
2249+
1 3
2250+
11 0
2251+
4 2
2252+
"###);
2253+
db_snap!(index, soft_deleted_documents_ids, @"[]");
2254+
2255+
index
2256+
.add_documents(documents!([
2257+
{ "primary_key": 4, "a": 1 },
2258+
{ "primary_key": 1, "a": 0 },
2259+
]))
2260+
.unwrap();
2261+
2262+
db_snap!(index, documents_ids, @"[0, 1, 4, ]");
2263+
db_snap!(index, external_documents_ids, @r###"
2264+
soft:
2265+
hard:
2266+
1 4
2267+
11 0
2268+
4 1
2269+
"###);
2270+
db_snap!(index, soft_deleted_documents_ids, @"[2, 3, ]");
2271+
2272+
let rtxn = index.read_txn().unwrap();
2273+
let search = Search::new(&rtxn, &index);
2274+
let SearchResult { matching_words: _, candidates: _, mut documents_ids } =
2275+
search.execute().unwrap();
2276+
let primary_key_id = index.fields_ids_map(&rtxn).unwrap().id("primary_key").unwrap();
2277+
documents_ids.sort_unstable();
2278+
let docs = index.documents(&rtxn, documents_ids).unwrap();
2279+
let mut all_ids = HashSet::new();
2280+
for (_docid, obkv) in docs {
2281+
let id = obkv.get(primary_key_id).unwrap();
2282+
assert!(all_ids.insert(id));
2283+
}
2284+
}
21912285
}

milli/src/update/delete_documents.rs

Lines changed: 21 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,14 @@ use heed::types::{ByteSlice, DecodeIgnore, Str};
66
use heed::Database;
77
use roaring::RoaringBitmap;
88
use serde::{Deserialize, Serialize};
9-
use serde_json::Value;
109
use time::OffsetDateTime;
1110

1211
use super::facet::delete::FacetsDelete;
1312
use super::ClearDocuments;
14-
use crate::error::{InternalError, UserError};
13+
use crate::error::InternalError;
1514
use crate::facet::FacetType;
1615
use crate::heed_codec::facet::FieldDocIdFacetCodec;
1716
use crate::heed_codec::CboRoaringBitmapCodec;
18-
use crate::index::{db_name, main_key};
1917
use crate::{
2018
ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec,
2119
SmallString32, BEU32,
@@ -186,6 +184,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
186184

187185
soft_deleted_docids |= &self.to_delete_docids;
188186

187+
// We always soft-delete the documents, even if they will be permanently
188+
// deleted immediately after.
189+
self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?;
190+
189191
// decide for a hard or soft deletion depending on the strategy
190192
let soft_deletion = match self.strategy {
191193
DeletionStrategy::Dynamic => {
@@ -214,31 +216,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
214216

215217
if soft_deletion {
216218
// Keep the soft-deleted in the DB
217-
self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?;
218219
return Ok(DetailedDocumentDeletionResult {
219220
deleted_documents: self.to_delete_docids.len(),
220221
remaining_documents: documents_ids.len(),
221222
soft_deletion_used: true,
222223
});
223224
}
224225

225-
// Erase soft-deleted from DB
226226
self.to_delete_docids = soft_deleted_docids;
227-
// and we can reset the soft deleted bitmap
228-
self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;
229-
230-
let primary_key =
231-
self.index.primary_key(self.wtxn)?.ok_or(InternalError::DatabaseMissingEntry {
232-
db_name: db_name::MAIN,
233-
key: Some(main_key::PRIMARY_KEY_KEY),
234-
})?;
235-
236-
// Since we already checked if the DB was empty, if we can't find the primary key, then
237-
// something is wrong, and we must return an error.
238-
let id_field = match fields_ids_map.id(primary_key) {
239-
Some(field) => field,
240-
None => return Err(UserError::MissingPrimaryKey.into()),
241-
};
242227

243228
let Index {
244229
env: _env,
@@ -262,33 +247,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
262247
documents,
263248
} = self.index;
264249

265-
// Retrieve the words and the external documents ids contained in the documents.
250+
// Retrieve the words contained in the documents.
266251
let mut words = Vec::new();
267-
let mut external_ids = Vec::new();
268252
for docid in &self.to_delete_docids {
269-
// We create an iterator to be able to get the content and delete the document
270-
// content itself. It's faster to acquire a cursor to get and delete,
271-
// as we avoid traversing the LMDB B-Tree two times but only once.
272-
let key = BEU32::new(docid);
273-
let mut iter = documents.range_mut(self.wtxn, &(key..=key))?;
274-
if let Some((_key, obkv)) = iter.next().transpose()? {
275-
if let Some(content) = obkv.get(id_field) {
276-
let external_id = match serde_json::from_slice(content).unwrap() {
277-
Value::String(string) => SmallString32::from(string.as_str()),
278-
Value::Number(number) => SmallString32::from(number.to_string()),
279-
document_id => {
280-
return Err(UserError::InvalidDocumentId { document_id }.into())
281-
}
282-
};
283-
external_ids.push(external_id);
284-
}
285-
// safety: we don't keep references from inside the LMDB database.
286-
unsafe { iter.del_current()? };
287-
}
288-
drop(iter);
253+
documents.delete(self.wtxn, &BEU32::new(docid))?;
289254

290-
// We iterate through the words positions of the document id,
291-
// retrieve the word and delete the positions.
255+
// We iterate through the words positions of the document id, retrieve the word and delete the positions.
256+
// We create an iterator to be able to get the content and delete the key-value itself.
257+
// It's faster to acquire a cursor to get and delete, as we avoid traversing the LMDB B-Tree two times but only once.
292258
let mut iter = docid_word_positions.prefix_iter_mut(self.wtxn, &(docid, ""))?;
293259
while let Some(result) = iter.next() {
294260
let ((_docid, word), _positions) = result?;
@@ -298,17 +264,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
298264
unsafe { iter.del_current()? };
299265
}
300266
}
301-
302-
// We create the FST map of the external ids that we must delete.
303-
external_ids.sort_unstable();
304-
let external_ids_to_delete = fst::Set::from_iter(external_ids)?;
305-
306267
// We acquire the current external documents ids map...
268+
// Note that its soft-deleted document ids field will be equal to the `to_delete_docids`
307269
let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
308-
// ...and remove the to-delete external ids.
309-
new_external_documents_ids.delete_ids(external_ids_to_delete)?;
310-
311-
// We write the new external ids into the main database.
270+
// We then remove the soft-deleted docids from it
271+
new_external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
272+
// and write it back to the main database.
312273
let new_external_documents_ids = new_external_documents_ids.into_static();
313274
self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?;
314275

@@ -545,6 +506,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
545506
&self.to_delete_docids,
546507
)?;
547508

509+
self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;
510+
548511
Ok(DetailedDocumentDeletionResult {
549512
deleted_documents: self.to_delete_docids.len(),
550513
remaining_documents: documents_ids.len(),
@@ -1125,14 +1088,16 @@ mod tests {
11251088
id
11261089
);
11271090
}
1091+
wtxn.commit().unwrap();
1092+
1093+
let rtxn = index.read_txn().unwrap();
11281094

11291095
// get internal docids from deleted external document ids
1130-
let results = index.external_documents_ids(&wtxn).unwrap();
1096+
let results = index.external_documents_ids(&rtxn).unwrap();
11311097
for id in deleted_external_ids {
11321098
assert!(results.get(id).is_none(), "The document {} was supposed to be deleted", id);
11331099
}
1134-
1135-
wtxn.commit().unwrap();
1100+
drop(rtxn);
11361101

11371102
db_snap!(index, soft_deleted_documents_ids, deletion_strategy);
11381103
}

0 commit comments

Comments
 (0)