Commit adf87dd

fix: ingestion flow and generic flattening
1 parent e645168 commit adf87dd

3 files changed: +195 -63 lines changed

src/handlers/http/modal/utils/ingest_utils.rs

Lines changed: 157 additions & 45 deletions

@@ -23,13 +23,12 @@ use anyhow::anyhow;
 use arrow_schema::Field;
 use bytes::Bytes;
 use chrono::{DateTime, NaiveDateTime, Utc};
-use itertools::Itertools;
 use serde_json::Value;
 
 use crate::{
     event::{
+        self,
         format::{self, EventFormat},
-        Event,
     },
     handlers::{
         http::{ingest::PostError, kinesis},
@@ -73,61 +72,174 @@ pub async fn push_logs(
     let custom_partition = STREAM_INFO.get_custom_partition(stream_name)?;
     let schema_version = STREAM_INFO.get_schema_version(stream_name)?;
     let body_val: Value = serde_json::from_slice(body)?;
-    let data = convert_array_to_object(
-        body_val,
-        time_partition.as_ref(),
-        time_partition_limit,
-        custom_partition.as_ref(),
-        schema_version,
-    )?;
 
-    for value in data {
-        let origin_size = serde_json::to_vec(&value).unwrap().len() as u64; // string length need not be the same as byte length
-        let parsed_timestamp = match time_partition.as_ref() {
-            Some(time_partition) => get_parsed_timestamp(&value, time_partition)?,
-            _ => Utc::now().naive_utc(),
-        };
-        let custom_partition_values = match custom_partition.as_ref() {
-            Some(custom_partition) => {
-                let custom_partitions = custom_partition.split(',').collect_vec();
-                get_custom_partition_values(&value, &custom_partitions)
+    let size: usize = body.len();
+    let mut parsed_timestamp = Utc::now().naive_utc();
+    if time_partition.is_none() {
+        if custom_partition.is_none() {
+            let size = size as u64;
+            create_process_record_batch(
+                stream_name,
+                req,
+                body_val,
+                static_schema_flag.as_ref(),
+                None,
+                parsed_timestamp,
+                &HashMap::new(),
+                size,
+                schema_version,
+            )
+            .await?;
+        } else {
+            let data = convert_array_to_object(
+                body_val,
+                None,
+                None,
+                custom_partition.as_ref(),
+                schema_version,
+            )?;
+            let custom_partition = custom_partition.unwrap();
+            let custom_partition_list = custom_partition.split(',').collect::<Vec<&str>>();
+
+            for value in data {
+                let custom_partition_values =
+                    get_custom_partition_values(&value, &custom_partition_list);
+
+                let size = value.to_string().into_bytes().len() as u64;
+                create_process_record_batch(
+                    stream_name,
+                    req,
+                    value,
+                    static_schema_flag.as_ref(),
+                    None,
+                    parsed_timestamp,
+                    &custom_partition_values,
+                    size,
+                    schema_version,
+                )
+                .await?;
             }
-            None => HashMap::new(),
-        };
-        let schema = STREAM_INFO
-            .read()
-            .unwrap()
-            .get(stream_name)
-            .ok_or(PostError::StreamNotFound(stream_name.to_owned()))?
-            .schema
-            .clone();
-        let (rb, is_first_event) = into_event_batch(
-            req,
-            &value,
-            schema,
-            static_schema_flag.as_ref(),
+        }
+    } else if custom_partition.is_none() {
+        let data = convert_array_to_object(
+            body_val,
+            time_partition.as_ref(),
+            time_partition_limit,
+            None,
+            schema_version,
+        )?;
+        for value in data {
+            parsed_timestamp = get_parsed_timestamp(&value, time_partition.as_ref().unwrap())?;
+            let size = value.to_string().into_bytes().len() as u64;
+            create_process_record_batch(
+                stream_name,
+                req,
+                value,
+                static_schema_flag.as_ref(),
+                time_partition.as_ref(),
+                parsed_timestamp,
+                &HashMap::new(),
+                size,
+                schema_version,
+            )
+            .await?;
+        }
+    } else {
+        let data = convert_array_to_object(
+            body_val,
             time_partition.as_ref(),
+            time_partition_limit,
+            custom_partition.as_ref(),
             schema_version,
         )?;
+        let custom_partition = custom_partition.unwrap();
+        let custom_partition_list = custom_partition.split(',').collect::<Vec<&str>>();
 
-        Event {
-            rb,
-            stream_name: stream_name.to_owned(),
-            origin_format: "json",
-            origin_size,
-            is_first_event,
-            parsed_timestamp,
-            time_partition: time_partition.clone(),
-            custom_partition_values,
-            stream_type: StreamType::UserDefined,
+        for value in data {
+            let custom_partition_values =
+                get_custom_partition_values(&value, &custom_partition_list);
+
+            parsed_timestamp = get_parsed_timestamp(&value, time_partition.as_ref().unwrap())?;
+            let size = value.to_string().into_bytes().len() as u64;
+            create_process_record_batch(
+                stream_name,
+                req,
+                value,
+                static_schema_flag.as_ref(),
+                time_partition.as_ref(),
+                parsed_timestamp,
+                &custom_partition_values,
+                size,
+                schema_version,
+            )
+            .await?;
         }
-        .process()
-        .await?;
     }
 
     Ok(())
 }
 
+#[allow(clippy::too_many_arguments)]
+pub async fn create_process_record_batch(
+    stream_name: &str,
+    req: &HttpRequest,
+    value: Value,
+    static_schema_flag: Option<&String>,
+    time_partition: Option<&String>,
+    parsed_timestamp: NaiveDateTime,
+    custom_partition_values: &HashMap<String, String>,
+    origin_size: u64,
+    schema_version: SchemaVersion,
+) -> Result<(), PostError> {
+    let (rb, is_first_event) = get_stream_schema(
+        stream_name,
+        req,
+        &value,
+        static_schema_flag,
+        time_partition,
+        schema_version,
+    )?;
+    event::Event {
+        rb,
+        stream_name: stream_name.to_owned(),
+        origin_format: "json",
+        origin_size,
+        is_first_event,
+        parsed_timestamp,
+        time_partition: time_partition.cloned(),
+        custom_partition_values: custom_partition_values.clone(),
+        stream_type: StreamType::UserDefined,
+    }
+    .process()
+    .await?;
+
+    Ok(())
+}
+
+pub fn get_stream_schema(
+    stream_name: &str,
+    req: &HttpRequest,
+    body: &Value,
+    static_schema_flag: Option<&String>,
+    time_partition: Option<&String>,
+    schema_version: SchemaVersion,
+) -> Result<(arrow_array::RecordBatch, bool), PostError> {
+    let hash_map = STREAM_INFO.read().unwrap();
+    let schema = hash_map
+        .get(stream_name)
+        .ok_or(PostError::StreamNotFound(stream_name.to_owned()))?
+        .schema
+        .clone();
+    into_event_batch(
+        req,
+        body,
+        schema,
+        static_schema_flag,
+        time_partition,
+        schema_version,
+    )
+}
+
 pub fn into_event_batch(
     req: &HttpRequest,
     body: &Value,
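
What this hunk amounts to: `push_logs` now branches on which partitions the stream has configured instead of always flattening the body first, and the per-event plumbing moves into the new `create_process_record_batch` and `get_stream_schema` helpers. A minimal sketch of the four-way dispatch, using plain string options as hypothetical stand-ins for the stream metadata (the names below are illustrative, not the crate's API):

// Sketch of the new dispatch in push_logs: the pair
// (time_partition, custom_partition) selects one of four ingestion paths.
fn ingestion_path(time_partition: Option<&str>, custom_partition: Option<&str>) -> &'static str {
    match (time_partition, custom_partition) {
        // Fast path: the raw body goes to create_process_record_batch in one
        // batch, sized by body.len(), with no partition-driven flattening.
        (None, None) => "whole body, single batch",
        // Custom partitions only: flatten, then read partition values per row.
        (None, Some(_)) => "per-row batches + custom partition values",
        // Time partition only: flatten, then parse the timestamp per row.
        (Some(_), None) => "per-row batches + parsed timestamp",
        // Both: flatten, then derive both per row.
        (Some(_), Some(_)) => "per-row batches + timestamp + partition values",
    }
}

fn main() {
    assert_eq!(ingestion_path(None, None), "whole body, single batch");
    assert_eq!(
        ingestion_path(Some("event_time"), Some("device,region")),
        "per-row batches + timestamp + partition values"
    );
}

Note also that per-event `origin_size` is now `value.to_string().into_bytes().len()` rather than `serde_json::to_vec(&value).unwrap().len()`, and the unpartitioned fast path charges the whole request `body.len()` to a single batch.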

src/utils/json/flatten.rs

Lines changed: 17 additions & 15 deletions

@@ -19,7 +19,6 @@
 use std::collections::BTreeMap;
 use std::num::NonZeroU32;
 
-use anyhow::anyhow;
 use chrono::{DateTime, Duration, Utc};
 use serde_json::map::Map;
 use serde_json::value::Value;
@@ -50,6 +49,8 @@ pub enum JsonFlattenError {
     ExpectedObjectInArray,
     #[error("Found non-object element while flattening array of objects")]
     NonObjectInArray,
+    #[error("heavily nested, cannot flatten this JSON")]
+    HeavilyNestedJson,
 }
 
 // Recursively flattens JSON objects and arrays, e.g. with the separator `.`, starting from the TOP
@@ -283,23 +284,25 @@ pub fn flatten_array_objects(
 /// 3. `[{"a": [{"b": 1}, {"c": 2}]}]` ~> `[{"a": {"b": 1)}}, {"a": {"c": 2)}}]`
 /// 4. `{"a": [{"b": 1}, {"c": 2}], "d": {"e": 4}}` ~> `[{"a": {"b":1}, "d": {"e":4}}, {"a": {"c":2}, "d": {"e":4}}]`
 /// 5. `{"a":{"b":{"c":{"d":{"e":["a","b"]}}}}}` ~> returns error - heavily nested, cannot flatten this JSON
-fn flattening_helper(value: &Value) -> Result<Vec<Value>, anyhow::Error> {
+pub fn generic_flattening(value: &Value) -> Result<Vec<Value>, JsonFlattenError> {
     if has_more_than_four_levels(value, 1) {
-        return Err(anyhow!("heavily nested, cannot flatten this JSON"));
+        return Err(JsonFlattenError::HeavilyNestedJson);
     }
 
     match value {
         Value::Array(arr) => Ok(arr
             .iter()
-            .flat_map(|flatten_item| flattening_helper(flatten_item).unwrap_or_default())
+            .flat_map(|flatten_item| generic_flattening(flatten_item).unwrap_or_default())
             .collect()),
         Value::Object(map) => {
             let results = map
                 .iter()
                 .fold(vec![Map::new()], |results, (key, val)| match val {
                     Value::Array(arr) => arr
                         .iter()
-                        .flat_map(|flatten_item| flattening_helper(flatten_item).unwrap_or_default())
+                        .flat_map(|flatten_item| {
+                            generic_flattening(flatten_item).unwrap_or_default()
+                        })
                         .flat_map(|flattened_item| {
                             results.iter().map(move |result| {
                                 let mut new_obj = result.clone();
@@ -308,7 +311,7 @@ fn flattening_helper(value: &Value) -> Result<Vec<Value>, anyhow::Error> {
                             })
                         })
                         .collect(),
-                    Value::Object(_) => flattening_helper(val)
+                    Value::Object(_) => generic_flattening(val)
                         .unwrap_or_default()
                         .iter()
                         .flat_map(|nested_result| {
@@ -355,20 +358,19 @@ fn has_more_than_four_levels(value: &Value, current_level: usize) -> bool {
 }
 
 // Converts a Vector of values into a `Value::Array`, as long as all of them are objects
-pub fn generic_flattening(json: Value) -> Result<Value, JsonFlattenError> {
-    let mut flattened = Vec::new();
-    for item in flattening_helper(&json).unwrap_or_default() {
+pub fn convert_to_array(flattened: Vec<Value>) -> Result<Value, JsonFlattenError> {
+    let mut result = Vec::new();
+    for item in flattened {
         let mut map = Map::new();
         let Some(item) = item.as_object() else {
             return Err(JsonFlattenError::ExpectedObjectInArray);
         };
         for (key, value) in item {
            map.insert(key.clone(), value.clone());
         }
-        flattened.push(Value::Object(map));
+        result.push(Value::Object(map));
     }
-
-    Ok(Value::Array(flattened))
+    Ok(Value::Array(result))
 }
 
 #[cfg(test)]
@@ -649,13 +651,13 @@ mod tests {
     #[test]
     fn flatten_json_success() {
         let value = json!({"a":{"b":{"e":["a","b"]}}});
-        let expected = json!([{"a":{"b":{"e":"a"}}},{"a":{"b":{"e":"b"}}}]);
-        assert_eq!(generic_flattening(value).unwrap(), expected);
+        let expected = vec![json!({"a":{"b":{"e":"a"}}}), json!({"a":{"b":{"e":"b"}}})];
+        assert_eq!(generic_flattening(&value).unwrap(), expected);
     }
 
     #[test]
     fn flatten_json_error() {
         let value = json!({"a":{"b":{"c":{"d":{"e":["a","b"]}}}}});
-        assert!(generic_flattening(value).is_err());
+        assert!(generic_flattening(&value).is_err());
     }
 }
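
The renamed `generic_flattening` (formerly `flattening_helper`) now returns the expanded `Vec<Value>` directly and reports failure through the typed `JsonFlattenError::HeavilyNestedJson` instead of `anyhow!`, while the re-wrapping into a `Value::Array` lives in the separate `convert_to_array`. A hedged usage sketch of the two steps, with the intermediate values hardcoded from `flatten_json_success` above so it runs standalone:

use serde_json::{json, Value};

fn main() {
    // Input from flatten_json_success above.
    let input = json!({"a": {"b": {"e": ["a", "b"]}}});
    println!("input: {input}");

    // Step 1: generic_flattening(&input) expands the two-element array into
    // one object per element; per the test, it yields this Vec<Value>.
    let flattened: Vec<Value> = vec![
        json!({"a": {"b": {"e": "a"}}}),
        json!({"a": {"b": {"e": "b"}}}),
    ];

    // Step 2: convert_to_array(flattened) re-wraps the objects as a single
    // Value::Array; Value::Array(..) models that step here.
    assert_eq!(
        Value::Array(flattened),
        json!([{"a": {"b": {"e": "a"}}}, {"a": {"b": {"e": "b"}}}])
    );
}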

src/utils/json/mod.rs

Lines changed: 21 additions & 3 deletions

@@ -37,7 +37,9 @@ pub fn flatten_json_body(
     validation_required: bool,
 ) -> Result<Value, anyhow::Error> {
     let mut nested_value = if schema_version == SchemaVersion::V1 {
-        flatten::generic_flattening(body)?
+        flatten::generic_flattening(&body)
+            .map(flatten::convert_to_array)
+            .unwrap_or(Ok(body))?
     } else {
         body
     };
@@ -105,7 +107,15 @@ mod tests {
         let value = json!({"a":{"b":{"e":["a","b"]}}});
         let expected = json!([{"a_b_e": "a"}, {"a_b_e": "b"}]);
         assert_eq!(
-            flatten_json_body(value, None, None, None, crate::metadata::SchemaVersion::V1, false).unwrap(),
+            flatten_json_body(
+                value,
+                None,
+                None,
+                None,
+                crate::metadata::SchemaVersion::V1,
+                false
+            )
+            .unwrap(),
             expected
         );
     }
@@ -115,7 +125,15 @@ mod tests {
         let value = json!({"a":{"b":{"c":{"d":{"e":["a","b"]}}}}});
         let expected = json!({"a_b_c_d_e": ["a","b"]});
         assert_eq!(
-            flatten_json_body(value, None, None, None,crate::metadata::SchemaVersion::V1, false).unwrap(),
+            flatten_json_body(
+                value,
+                None,
+                None,
+                None,
+                crate::metadata::SchemaVersion::V1,
+                false
+            )
+            .unwrap(),
            expected
         );
     }
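
The behavioural fix in `flatten_json_body`: under `SchemaVersion::V1`, a failure in `generic_flattening` (the heavily nested case) no longer aborts the request; `unwrap_or(Ok(body))` falls back to the original body, which the second test above now expects to flatten to `{"a_b_c_d_e": ["a","b"]}` via the regular path. A minimal sketch of that fallback shape, with a hypothetical depth check standing in for `has_more_than_four_levels` and plain `Value`s standing in for `convert_to_array`:

use serde_json::{json, Value};

// Hypothetical stand-in for generic_flattening: refuse values nested more
// than four levels deep, otherwise pass the value through untouched.
fn try_flatten(body: &Value) -> Result<Vec<Value>, &'static str> {
    fn depth(v: &Value) -> usize {
        match v {
            Value::Object(m) => 1 + m.values().map(depth).max().unwrap_or(0),
            Value::Array(a) => a.iter().map(depth).max().unwrap_or(0),
            _ => 0,
        }
    }
    if depth(body) > 4 {
        return Err("heavily nested, cannot flatten this JSON");
    }
    Ok(vec![body.clone()])
}

// Mirrors the .map(..).unwrap_or(Ok(body))? shape from the diff, simplified
// to plain Values: on error the original body is kept instead of failing.
fn flatten_or_fallback(body: Value) -> Value {
    try_flatten(&body)
        .map(Value::Array) // stand-in for flatten::convert_to_array
        .unwrap_or(body)
}

fn main() {
    let deep = json!({"a":{"b":{"c":{"d":{"e":["a","b"]}}}}});
    // Five levels deep: the sketch keeps the body untouched rather than erroring.
    assert_eq!(flatten_or_fallback(deep.clone()), deep);
}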
