Skip to content

Commit 445c6b1

Browse files
authored
Sets default for store_source to false. (#1136)
Setting store_source to true makes a lot of sense... However, we already set each individual field's store attribute in the schema to true. Having both set to true is redundant and needlessly inflates the index size. Closes #1132
1 parent bf2d790 commit 445c6b1

File tree

3 files changed

+36
-11
lines changed

3 files changed

+36
-11
lines changed

docs/guides/tutorial-hdfs-logs.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ doc_mapping:
6969
type: text
7070
tokenizer: raw # Text field referenced as tag must have the `raw` tokenizer.
7171
tag_fields: [tenant_id]
72+
store_source: false
7273

7374
indexing_settings:
7475
timestamp_field: timestamp
@@ -178,7 +179,7 @@ Now that we have indexed our dataset and can do a local search, let's show how e
178179
First, let's download the Searcher's configuration files:
179180

180181
```bash
181-
for i in {1..3}; do
182+
for i in {1..3}; do
182183
curl -o searcher-$i.yaml https://raw.githubusercontent.com/quickwit-oss/quickwit/main/config/tutorials/hdfs-logs/searcher-$i.yaml
183184
done
184185
```
@@ -227,7 +228,7 @@ Let's do some cleanup by deleting the index:
227228
```
228229

229230

230-
Congratz! You finished this tutorial!
231+
Congratz! You finished this tutorial!
231232

232233

233234
To continue your Quickwit journey, check out the [tutorial for distributed search](tutorial-hdfs-logs-distributed-search-aws-s3.md) or dig into the [search REST API](../reference/rest-api.md) or [query language](../reference/query-language.md).

quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ use tantivy::Document;
3232
use tracing::info;
3333

3434
use super::field_mapping_entry::{DocParsingError, FieldPath};
35-
use super::{default_as_true, FieldMappingEntry, FieldMappingType};
35+
use super::{FieldMappingEntry, FieldMappingType};
3636
use crate::query_builder::build_query;
3737
use crate::sort_by::{SortBy, SortOrder};
3838
use crate::{DocMapper, QueryParserError, SOURCE_FIELD_NAME};
@@ -45,7 +45,7 @@ const RAW_TOKENIZER_NAME: &str = "raw";
4545
#[derive(Default, Serialize, Deserialize, Clone)]
4646
pub struct DefaultDocMapperBuilder {
4747
/// Stores the original source document when set to true.
48-
#[serde(default = "default_as_true")]
48+
#[serde(default)]
4949
pub store_source: bool,
5050
/// Name of the fields that are searched by default, unless overridden.
5151
pub default_search_fields: Vec<String>,
@@ -87,7 +87,7 @@ impl DefaultDocMapperBuilder {
8787
/// Create a new `DefaultDocMapperBuilder` for tests.
8888
pub fn new() -> Self {
8989
Self {
90-
store_source: true,
90+
store_source: false,
9191
default_search_fields: vec![],
9292
timestamp_field: None,
9393
sort_by: None,
@@ -755,12 +755,13 @@ mod tests {
755755
}
756756

757757
#[test]
758-
fn test_parse_document_with_tag_fields() -> anyhow::Result<()> {
758+
fn test_parse_document_with_tag_fields() {
759759
let doc_mapper = r#"{
760760
"type": "default",
761761
"default_search_fields": [],
762762
"timestamp_field": null,
763763
"tag_fields": ["city"],
764+
"store_source": true,
764765
"field_mappings": [
765766
{
766767
"name": "city",
@@ -776,14 +777,16 @@ mod tests {
776777
]
777778
}"#;
778779

779-
let builder = serde_json::from_str::<DefaultDocMapperBuilder>(doc_mapper)?;
780-
let doc_mapper = builder.build()?;
780+
let builder = serde_json::from_str::<DefaultDocMapperBuilder>(doc_mapper).unwrap();
781+
let doc_mapper = builder.build().unwrap();
781782
let schema = doc_mapper.schema();
782783
const JSON_DOC_VALUE: &str = r#"{
783784
"city": "tokio",
784785
"image": "YWJj"
785786
}"#;
786-
let document = doc_mapper.doc_from_json(JSON_DOC_VALUE.to_string())?;
787+
let document = doc_mapper
788+
.doc_from_json(JSON_DOC_VALUE.to_string())
789+
.unwrap();
787790

788791
// 2 properties, + 1 value for "_source"
789792
assert_eq!(document.len(), 3);
@@ -811,7 +814,6 @@ mod tests {
811814
assert!(is_value_in_expected_values);
812815
}
813816
});
814-
Ok(())
815817
}
816818

817819
#[test]
@@ -1036,4 +1038,26 @@ mod tests {
10361038
assert_eq!(builder.build().unwrap_err().to_string(), expected_msg);
10371039
Ok(())
10381040
}
1041+
1042+
// See #1132
1043+
#[test]
1044+
fn test_by_default_store_source_is_false_and_fields_are_stored_individually() {
1045+
let doc_mapper = r#"{
1046+
"default_search_fields": [],
1047+
"field_mappings": [
1048+
{
1049+
"name": "my-field",
1050+
"type": "u64",
1051+
"indexed": true
1052+
}
1053+
]
1054+
}"#;
1055+
let builder = serde_json::from_str::<DefaultDocMapperBuilder>(doc_mapper).unwrap();
1056+
let default_doc_mapper = builder.build().unwrap();
1057+
assert!(!default_doc_mapper.store_source);
1058+
let schema = default_doc_mapper.schema();
1059+
let field = schema.get_field("my-field").unwrap();
1060+
let field_entry = schema.get_field_entry(field);
1061+
assert!(field_entry.is_stored());
1062+
}
10391063
}

quickwit-doc-mapper/src/doc_mapper.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ mod tests {
113113
}
114114

115115
#[test]
116-
fn test_sedeserialize_doc_mapper() -> anyhow::Result<()> {
116+
fn test_serdeserialize_doc_mapper() -> anyhow::Result<()> {
117117
let deserialized_default_doc_mapper =
118118
serde_json::from_str::<Box<dyn DocMapper>>(JSON_DEFAULT_DOC_MAPPER)?;
119119
let expected_default_doc_mapper = DefaultDocMapperBuilder::new().build()?;

0 commit comments

Comments
 (0)