Skip to content

Commit 7996d81

Browse files
authored
RUST-246 Batch large inserts (#106)
1 parent 7c0b5e8 commit 7996d81

File tree

6 files changed

+499
-6
lines changed

6 files changed

+499
-6
lines changed

src/bson_util.rs

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,3 +107,146 @@ pub(crate) fn serialize_batch_size<S: Serializer>(
107107
)),
108108
}
109109
}
110+
111+
pub fn doc_size_bytes(doc: &Document) -> usize {
112+
//
113+
// * i32 length prefix (4 bytes)
114+
// * for each element:
115+
// * type (1 byte)
116+
// * number of UTF-8 bytes in key
117+
// * null terminator for the key (1 byte)
118+
// * size of the value
119+
// * null terminator (1 byte)
120+
4 + doc
121+
.into_iter()
122+
.map(|(key, val)| 1 + key.len() + 1 + size_bytes(val))
123+
.sum::<usize>()
124+
+ 1
125+
}
126+
127+
/// Returns the number of bytes the serialized form of `val` occupies within a
/// BSON document, excluding the element's type tag and key (those are counted
/// by the caller, e.g. `doc_size_bytes`).
pub fn size_bytes(val: &Bson) -> usize {
    match val {
        // f64 is serialized as 8 bytes.
        Bson::FloatingPoint(_) => 8,
        //
        // * length prefix (4 bytes)
        // * number of UTF-8 bytes
        // * null terminator (1 byte)
        Bson::String(s) => 4 + s.len() + 1,
        // An array is serialized as a document with the keys "0", "1", "2", etc., so the size of
        // an array is:
        //
        // * length prefix (4 bytes)
        // * for each element:
        //   * type (1 byte)
        //   * number of decimal digits in key
        //   * null terminator for the key (1 byte)
        //   * size of value
        // * null terminator (1 byte)
        Bson::Array(arr) => {
            4 + arr
                .iter()
                .enumerate()
                .map(|(i, val)| 1 + num_decimal_digits(i) + 1 + size_bytes(val))
                .sum::<usize>()
                + 1
        }
        Bson::Document(doc) => doc_size_bytes(doc),
        Bson::Boolean(_) => 1,
        // Null carries no value bytes; only the type tag (counted by the caller).
        Bson::Null => 0,
        // for $pattern and $opts:
        // * number of UTF-8 bytes
        // * null terminator (1 byte)
        Bson::RegExp(pattern, opts) => pattern.len() + 1 + opts.len() + 1,
        //
        // * length prefix (4 bytes)
        // * number of UTF-8 bytes
        // * null terminator (1 byte)
        Bson::JavaScriptCode(code) => 4 + code.len() + 1,
        //
        // * i32 length prefix for the whole code-with-scope value (4 bytes)
        // * i32 length prefix for code (4 bytes)
        // * number of UTF-8 bytes in code
        // * null terminator for code (1 byte)
        // * length of scope document
        Bson::JavaScriptCodeWithScope(code, scope) => {
            4 + 4 + code.len() + 1 + doc_size_bytes(scope)
        }
        Bson::I32(_) => 4,
        Bson::I64(_) => 8,
        // Timestamps are serialized as a single u64 (8 bytes).
        Bson::TimeStamp(_) => 8,
        //
        // * i32 length prefix (4 bytes)
        // * subtype (1 byte)
        // * number of bytes
        Bson::Binary(_, bytes) => 4 + 1 + bytes.len(),
        // ObjectId is a fixed 12-byte value.
        Bson::ObjectId(_) => 12,
        // Serialized as a 64-bit millisecond timestamp (8 bytes).
        Bson::UtcDatetime(_) => 8,
        // A symbol is serialized like a string:
        // * i32 length prefix (4 bytes)
        // * number of UTF-8 bytes
        // * null terminator (1 byte)
        Bson::Symbol(s) => 4 + 1 + s.len(),
    }
}
191+
192+
/// Returns the number of decimal digits in `n`, i.e. the number of bytes the
/// base-10 string representation of `n` occupies. Used to size the implicit
/// "0", "1", "2", ... keys of a serialized BSON array.
fn num_decimal_digits(n: usize) -> usize {
    let mut digits = 1;
    let mut power_of_ten: usize = 10;

    // `n` has more than `digits` digits exactly when n >= 10^digits.
    // (The original `power_of_ten < n` condition undercounted exact powers of
    // ten: it reported 1 digit for 10 and 2 digits for 100.)
    while power_of_ten <= n {
        digits += 1;

        power_of_ten = match power_of_ten.checked_mul(10) {
            Some(next) => next,
            // Overflow means n >= the largest representable power of ten, and
            // since n < 10 * that power, `digits` is already correct.
            None => break,
        };
    }

    digits
}
207+
208+
#[cfg(test)]
mod test {
    use bson::{bson, doc, oid::ObjectId, spec::BinarySubtype, Bson};
    use chrono::{DateTime, NaiveDateTime, Utc};

    use super::doc_size_bytes;

    /// Verifies `doc_size_bytes` against the ground truth: the byte length of
    /// the document actually serialized by `bson::encode_document`. The
    /// fixture deliberately exercises every `Bson` variant once.
    #[test]
    fn doc_size_bytes_eq_serialized_size_bytes() {
        let doc = doc! {
            "double": -12.3,
            "string": "foo",
            "array": ["foobar", -7, Bson::Null, Bson::TimeStamp(1278), false],
            "document": {
                "x": 1,
                "yyz": "Rush is one of the greatest bands of all time",
            },
            "bool": true,
            "null": Bson::Null,
            "regex": Bson::RegExp("foobar".into(), "i".into()),
            "code": Bson::JavaScriptCode("foo(x) { return x + 1; }".into()),
            "code with scope": Bson::JavaScriptCodeWithScope(
                "foo(x) { return x + y; }".into(),
                doc! { "y": -17 },
            ),
            "i32": 12i32,
            "i64": -126i64,
            "timestamp": Bson::TimeStamp(1223334444),
            "binary": Bson::Binary(BinarySubtype::Generic, vec![3, 222, 11]),
            "objectid": ObjectId::with_bytes([1; 12]),
            "datetime": DateTime::from_utc(
                NaiveDateTime::from_timestamp(4444333221, 0),
                Utc,
            ),
            "symbol": Bson::Symbol("foobar".into()),
        };

        let size_bytes = doc_size_bytes(&doc);

        let mut serialized_bytes = Vec::new();
        bson::encode_document(&mut serialized_bytes, &doc).unwrap();

        assert_eq!(size_bytes, serialized_bytes.len());
    }
}

src/coll/batch.rs

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
// Truncates `all` to the largest prefix whose cumulative size (per `get_size`) fits within
// `max_batch_size`, returning the leftover suffix. The first element is always retained in
// `all`, even when it alone exceeds the limit, so progress is guaranteed.
pub(crate) fn split_off_batch<T>(
    all: &mut Vec<T>,
    max_batch_size: usize,
    get_size: impl Fn(&T) -> usize,
) -> Vec<T> {
    // Seed the running total with the first element, which is unconditionally kept.
    let mut accumulated = match all.first() {
        Some(first) => get_size(first),
        None => return Vec::new(),
    };

    // Find the first index at which adding the element would exceed the budget.
    let boundary = (1..all.len()).find(|&idx| {
        let elem_size = get_size(&all[idx]);

        if accumulated + elem_size > max_batch_size {
            true
        } else {
            accumulated += elem_size;
            false
        }
    });

    match boundary {
        Some(idx) => all.split_off(idx),
        None => Vec::new(),
    }
}
26+
27+
#[cfg(test)]
mod test {
    use super::split_off_batch;

    // An empty input produces an empty remainder and is left untouched.
    #[test]
    fn split_empty_batch() {
        let mut all: Vec<i32> = Vec::new();

        assert!(split_off_batch(&mut all, 10, |_| 1).is_empty());
    }

    // When the whole input fits within the budget, nothing is split off.
    #[test]
    fn split_single_batch() {
        let mut all = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];

        assert!(split_off_batch(&mut all, 10, |_| 1).is_empty());
    }

    // When the input exceeds the budget, `all` keeps the fitting prefix and
    // the suffix is returned.
    #[test]
    fn split_multi_batch() {
        let mut all = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
        let rest = split_off_batch(&mut all, 3, |_| 1);

        assert_eq!(all, vec![1, 2, 3]);
        assert_eq!(rest, vec![4, 5, 6, 7, 8, 9, 10]);
    }

    // Repeated splitting consumes the input in budget-sized chunks; the final
    // undersized chunk (here `[10]`) remains in `all` when the loop ends.
    #[test]
    fn split_batches_until_empty() {
        let mut batches = Vec::new();
        let mut all = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];

        loop {
            let batch = split_off_batch(&mut all, 3, |_| 1);
            if batch.is_empty() {
                break;
            }
            // Swap the remainder into `all` and collect the completed batch.
            batches.push(std::mem::replace(&mut all, batch));
        }

        assert_eq!(all, vec![10]);
        assert_eq!(batches.len(), 3);
        assert_eq!(batches[0], vec![1, 2, 3]);
        assert_eq!(batches[1], vec![4, 5, 6]);
        assert_eq!(batches[2], vec![7, 8, 9]);
    }

    // The first element is always kept even when it alone exceeds the budget,
    // guaranteeing forward progress.
    #[test]
    fn split_batch_with_too_large_element() {
        let mut all = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
        let rest = split_off_batch(&mut all, 3, |_| 5);

        assert_eq!(all, vec![1]);
        assert_eq!(rest, vec![2, 3, 4, 5, 6, 7, 8, 9, 10]);
    }
}

src/coll/mod.rs

Lines changed: 78 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
mod batch;
12
pub mod options;
23

34
use std::{fmt, sync::Arc};
@@ -9,7 +10,7 @@ use self::options::*;
910
use crate::{
1011
bson_util,
1112
concern::{ReadConcern, WriteConcern},
12-
error::{convert_bulk_errors, ErrorKind, Result},
13+
error::{convert_bulk_errors, BulkWriteError, BulkWriteFailure, ErrorKind, Result},
1314
operation::{
1415
Aggregate,
1516
Count,
@@ -28,6 +29,11 @@ use crate::{
2829
Database,
2930
};
3031

32+
/// Maximum size in bytes of an insert batch.
33+
/// This is intentionally less than the actual max document size, which is 16*1024*1024 bytes, to
34+
/// allow for overhead in the command document.
35+
const MAX_INSERT_DOCS_BYTES: usize = 16 * 1000 * 1000;
36+
3137
/// `Collection` is the client-side abstraction of a MongoDB Collection. It can be used to
3238
/// perform collection-level operations such as CRUD operations. A `Collection` can be obtained
3339
/// through a [`Database`](struct.Database.html) by calling either
@@ -382,8 +388,77 @@ impl Collection {
382388
let mut options = options.into();
383389
resolve_options!(self, options, [write_concern]);
384390

385-
let insert = Insert::new(self.namespace(), docs.into_iter().collect(), options);
386-
self.client().execute_operation(&insert, None)
391+
let mut docs: Vec<Document> = docs.into_iter().collect();
392+
393+
if docs.is_empty() {
394+
return Err(ErrorKind::ArgumentError {
395+
message: "No documents provided to insert_many".to_string(),
396+
}
397+
.into());
398+
}
399+
400+
let ordered = options.as_ref().and_then(|o| o.ordered).unwrap_or(true);
401+
402+
let mut cumulative_failure: Option<BulkWriteFailure> = None;
403+
let mut cumulative_result: Option<InsertManyResult> = None;
404+
405+
let mut n_attempted = 0;
406+
407+
while !docs.is_empty() {
408+
let mut remaining_docs =
409+
batch::split_off_batch(&mut docs, MAX_INSERT_DOCS_BYTES, bson_util::doc_size_bytes);
410+
std::mem::swap(&mut docs, &mut remaining_docs);
411+
let current_batch = remaining_docs;
412+
413+
let current_batch_size = current_batch.len();
414+
n_attempted += current_batch_size;
415+
416+
let insert = Insert::new(self.namespace(), current_batch, options.clone());
417+
match self.client().execute_operation(&insert, None) {
418+
Ok(result) => {
419+
if cumulative_failure.is_none() {
420+
let cumulative_result =
421+
cumulative_result.get_or_insert_with(InsertManyResult::new);
422+
for (index, id) in result.inserted_ids {
423+
cumulative_result
424+
.inserted_ids
425+
.insert(index + n_attempted - current_batch_size, id);
426+
}
427+
}
428+
}
429+
Err(e) => match e.kind.as_ref() {
430+
ErrorKind::BulkWriteError(failure) => {
431+
let failure_ref =
432+
cumulative_failure.get_or_insert_with(BulkWriteFailure::new);
433+
if let Some(ref write_errors) = failure.write_errors {
434+
failure_ref
435+
.write_errors
436+
.get_or_insert_with(Default::default)
437+
.extend(write_errors.iter().map(|error| BulkWriteError {
438+
index: error.index + n_attempted - current_batch_size,
439+
..error.clone()
440+
}));
441+
}
442+
if let Some(ref write_concern_error) = failure.write_concern_error {
443+
failure_ref.write_concern_error = Some(write_concern_error.clone());
444+
}
445+
446+
if ordered {
447+
return Err(ErrorKind::BulkWriteError(
448+
cumulative_failure.unwrap_or_else(BulkWriteFailure::new),
449+
)
450+
.into());
451+
}
452+
}
453+
_ => return Err(e),
454+
},
455+
}
456+
}
457+
458+
match cumulative_failure {
459+
Some(failure) => Err(ErrorKind::BulkWriteError(failure).into()),
460+
None => Ok(cumulative_result.unwrap_or_else(InsertManyResult::new)),
461+
}
387462
}
388463

389464
/// Inserts `doc` into the collection.

src/error.rs

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,7 @@ pub struct WriteError {
305305
#[derive(Debug, PartialEq, Clone, Deserialize)]
306306
pub struct BulkWriteError {
307307
/// Index into the list of operations that this error corresponds to.
308-
pub index: i32,
308+
pub index: usize,
309309

310310
/// Identifies the type of write concern error.
311311
pub code: i32,
@@ -332,6 +332,15 @@ pub struct BulkWriteFailure {
332332
pub write_concern_error: Option<WriteConcernError>,
333333
}
334334

335+
impl BulkWriteFailure {
336+
pub(crate) fn new() -> Self {
337+
BulkWriteFailure {
338+
write_errors: None,
339+
write_concern_error: None,
340+
}
341+
}
342+
}
343+
335344
/// An error that occurred when trying to execute a write operation.
336345
#[derive(Clone, Debug)]
337346
pub enum WriteFailure {

src/results.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,14 @@ pub struct InsertManyResult {
3232
pub inserted_ids: HashMap<usize, Bson>,
3333
}
3434

35+
impl InsertManyResult {
36+
pub(crate) fn new() -> Self {
37+
InsertManyResult {
38+
inserted_ids: HashMap::new(),
39+
}
40+
}
41+
}
42+
3543
/// The result of a [`Collection::update_one`](../struct.Collection.html#method.update_one) or
3644
/// [`Collection::update_many`](../struct.Collection.html#method.update_many) operation.
3745
#[derive(Debug)]

0 commit comments

Comments
 (0)