Skip to content

Commit 1217ed6

Browse files
authored
fix: preserve exact text for unquoted string values (#62) (Fixes #59, #60, #61)
Fixes #59, #60, #61
1 parent c22f0ae commit 1217ed6

File tree

2 files changed

+282
-106
lines changed

2 files changed

+282
-106
lines changed

src/decode/parser.rs

Lines changed: 205 additions & 92 deletions
Original file line number | Diff line number | Diff line change
@@ -149,9 +149,34 @@ impl<'a> Parser<'a> {
149149
self.advance()?;
150150
self.parse_object_with_initial_key(key, depth)
151151
} else {
152+
let first_text = self.scanner.last_token_text().to_string();
152153
let val = *i;
153154
self.advance()?;
154-
Ok(serde_json::Number::from(val).into())
155+
// Check if followed by more value tokens on the same line
156+
match &self.current_token {
157+
Token::String(..)
158+
| Token::Integer(..)
159+
| Token::Number(..)
160+
| Token::Bool(..)
161+
| Token::Null => {
162+
let mut accumulated = first_text;
163+
while let Token::String(..)
164+
| Token::Integer(..)
165+
| Token::Number(..)
166+
| Token::Bool(..)
167+
| Token::Null = &self.current_token
168+
{
169+
let ws = self.scanner.last_whitespace_count().max(1);
170+
for _ in 0..ws {
171+
accumulated.push(' ');
172+
}
173+
accumulated.push_str(self.scanner.last_token_text());
174+
self.advance()?;
175+
}
176+
Ok(Value::String(accumulated))
177+
}
178+
_ => Ok(serde_json::Number::from(val).into()),
179+
}
155180
}
156181
}
157182
Token::Number(n) => {
@@ -161,17 +186,45 @@ impl<'a> Parser<'a> {
161186
self.advance()?;
162187
self.parse_object_with_initial_key(key, depth)
163188
} else {
189+
let first_text = self.scanner.last_token_text().to_string();
164190
let val = *n;
165191
self.advance()?;
166-
// Normalize floats that are actually integers
167-
if val.is_finite() && val.fract() == 0.0 && val.abs() <= i64::MAX as f64 {
168-
Ok(serde_json::Number::from(val as i64).into())
169-
} else {
170-
Ok(serde_json::Number::from_f64(val)
171-
.ok_or_else(|| {
172-
ToonError::InvalidInput(format!("Invalid number: {val}"))
173-
})?
174-
.into())
192+
// Check if followed by more value tokens on the same line
193+
match &self.current_token {
194+
Token::String(..)
195+
| Token::Integer(..)
196+
| Token::Number(..)
197+
| Token::Bool(..)
198+
| Token::Null => {
199+
let mut accumulated = first_text;
200+
while let Token::String(..)
201+
| Token::Integer(..)
202+
| Token::Number(..)
203+
| Token::Bool(..)
204+
| Token::Null = &self.current_token
205+
{
206+
let ws = self.scanner.last_whitespace_count().max(1);
207+
for _ in 0..ws {
208+
accumulated.push(' ');
209+
}
210+
accumulated.push_str(self.scanner.last_token_text());
211+
self.advance()?;
212+
}
213+
Ok(Value::String(accumulated))
214+
}
215+
_ => {
216+
// Normalize floats that are actually integers
217+
if val.is_finite() && val.fract() == 0.0 && val.abs() <= i64::MAX as f64
218+
{
219+
Ok(serde_json::Number::from(val as i64).into())
220+
} else {
221+
Ok(serde_json::Number::from_f64(val)
222+
.ok_or_else(|| {
223+
ToonError::InvalidInput(format!("Invalid number: {val}"))
224+
})?
225+
.into())
226+
}
227+
}
175228
}
176229
}
177230
}
@@ -197,13 +250,22 @@ impl<'a> Parser<'a> {
197250
));
198251
}
199252

200-
// Root-level string value - join consecutive tokens
253+
if matches!(self.current_token, Token::Newline | Token::Eof) {
254+
return Ok(Value::String(first));
255+
}
256+
// Root-level string value - join consecutive tokens with exact spacing
201257
let mut accumulated = first;
202-
while let Token::String(next, _) = &self.current_token {
203-
if !accumulated.is_empty() {
258+
while let Token::String(..)
259+
| Token::Integer(..)
260+
| Token::Number(..)
261+
| Token::Bool(..)
262+
| Token::Null = &self.current_token
263+
{
264+
let ws = self.scanner.last_whitespace_count().max(1);
265+
for _ in 0..ws {
204266
accumulated.push(' ');
205267
}
206-
accumulated.push_str(next);
268+
accumulated.push_str(self.scanner.last_token_text());
207269
self.advance()?;
208270
}
209271
Ok(Value::String(accumulated))
@@ -433,9 +495,10 @@ impl<'a> Parser<'a> {
433495
self.parse_value_with_depth(depth + 1)
434496
} else {
435497
// Check if there's more content after the current token
436-
let (rest, had_space) = self.scanner.read_rest_of_line_with_space_info();
498+
let token_text = self.scanner.last_token_text().to_string();
499+
let (rest, space_count) = self.scanner.read_rest_of_line_with_space_info();
437500

438-
let result = if rest.is_empty() {
501+
let result = if rest.is_empty() && space_count == 0 {
439502
// Single token - convert directly to avoid redundant parsing
440503
match &self.current_token {
441504
Token::String(s, _) => Ok(Value::String(s.clone())),
@@ -457,28 +520,24 @@ impl<'a> Parser<'a> {
457520
_ => Err(self.parse_error_with_context("Unexpected token after colon")),
458521
}
459522
} else {
460-
// Multi-token value - reconstruct and re-parse as complete string
461-
let mut value_str = String::new();
462-
463-
match &self.current_token {
464-
Token::String(s, true) => {
465-
// Quoted strings need quotes preserved for re-parsing
466-
value_str.push('"');
467-
value_str.push_str(&crate::utils::escape_string(s));
468-
value_str.push('"');
523+
// Multi-token value - reconstruct using original token text and re-parse
524+
let mut value_str = match &self.current_token {
525+
Token::String(_, true) => {
526+
// Quoted strings: use last_token_text which includes quotes
527+
token_text.clone()
469528
}
470-
Token::String(s, false) => value_str.push_str(s),
471-
Token::Integer(i) => value_str.push_str(&i.to_string()),
472-
Token::Number(n) => value_str.push_str(&n.to_string()),
473-
Token::Bool(b) => value_str.push_str(if *b { "true" } else { "false" }),
474-
Token::Null => value_str.push_str("null"),
529+
Token::String(_, false)
530+
| Token::Integer(_)
531+
| Token::Number(_)
532+
| Token::Bool(_)
533+
| Token::Null => token_text.clone(),
475534
_ => {
476535
return Err(self.parse_error_with_context("Unexpected token after colon"));
477536
}
478-
}
537+
};
479538

480-
// Only add space if there was whitespace in the original input
481-
if had_space {
539+
// Preserve exact spacing from the original input
540+
for _ in 0..space_count {
482541
value_str.push(' ');
483542
}
484543
value_str.push_str(&rest);
@@ -1112,71 +1171,65 @@ impl<'a> Parser<'a> {
11121171
}
11131172

11141173
fn parse_tabular_field_value(&mut self) -> ToonResult<Value> {
1115-
match &self.current_token {
1116-
Token::Null => {
1117-
self.advance()?;
1118-
Ok(Value::Null)
1119-
}
1120-
Token::Bool(b) => {
1121-
let val = *b;
1122-
self.advance()?;
1123-
Ok(Value::Bool(val))
1124-
}
1125-
Token::Integer(i) => {
1126-
let val = *i;
1127-
self.advance()?;
1128-
// If followed by string tokens, treat the whole value as a string
1129-
if let Token::String(..) = &self.current_token {
1130-
let mut accumulated = val.to_string();
1131-
while let Token::String(next, _) = &self.current_token {
1132-
accumulated.push(' ');
1133-
accumulated.push_str(next);
1134-
self.advance()?;
1135-
}
1136-
Ok(Value::String(accumulated))
1137-
} else {
1138-
Ok(Number::from(val).into())
1139-
}
1140-
}
1141-
Token::Number(n) => {
1142-
let val = *n;
1143-
self.advance()?;
1144-
// If followed by string tokens, treat the whole value as a string
1145-
if let Token::String(..) = &self.current_token {
1146-
let mut accumulated = val.to_string();
1147-
while let Token::String(next, _) = &self.current_token {
1148-
accumulated.push(' ');
1149-
accumulated.push_str(next);
1150-
self.advance()?;
1174+
// Get the original text of the current token
1175+
let token_text = self.scanner.last_token_text().to_string();
1176+
1177+
// Read remaining text until delimiter/newline/EOF
1178+
let (rest, space_count) = self.scanner.read_until_delimiter_with_space_info();
1179+
1180+
if rest.is_empty() && space_count == 0 {
1181+
// Single token — handle as primitive directly
1182+
let result = match &self.current_token {
1183+
Token::Null => Ok(Value::Null),
1184+
Token::Bool(b) => Ok(Value::Bool(*b)),
1185+
Token::Integer(i) => Ok(Number::from(*i).into()),
1186+
Token::Number(n) => {
1187+
let val = *n;
1188+
if val.is_finite() && val.fract() == 0.0 && val.abs() <= i64::MAX as f64 {
1189+
Ok(Number::from(val as i64).into())
1190+
} else {
1191+
Ok(Number::from_f64(val)
1192+
.ok_or_else(|| {
1193+
ToonError::InvalidInput(format!("Invalid number: {val}"))
1194+
})?
1195+
.into())
11511196
}
1152-
Ok(Value::String(accumulated))
1153-
} else if val.is_finite() && val.fract() == 0.0 && val.abs() <= i64::MAX as f64 {
1154-
Ok(Number::from(val as i64).into())
1155-
} else {
1156-
Ok(Number::from_f64(val)
1157-
.ok_or_else(|| ToonError::InvalidInput(format!("Invalid number: {val}")))?
1158-
.into())
11591197
}
1198+
Token::String(s, _) => Ok(Value::String(s.clone())),
1199+
_ => Err(self.parse_error_with_context(format!(
1200+
"Expected primitive value, found {:?}",
1201+
self.current_token
1202+
))),
1203+
};
1204+
self.advance()?;
1205+
result
1206+
} else {
1207+
// Multiple tokens — combine original text + spaces + rest, then type-infer
1208+
let mut value_str = token_text;
1209+
for _ in 0..space_count {
1210+
value_str.push(' ');
11601211
}
1161-
Token::String(s, _) => {
1162-
// Tabular fields can have multiple string tokens joined with spaces
1163-
let mut accumulated = s.clone();
1164-
self.advance()?;
1212+
value_str.push_str(&rest);
11651213

1166-
while let Token::String(next, _) = &self.current_token {
1167-
if !accumulated.is_empty() {
1168-
accumulated.push(' ');
1214+
let token = self.scanner.parse_value_string(&value_str)?;
1215+
// Rescan so current_token is positioned at the next delimiter/newline
1216+
self.current_token = self.scanner.scan_token()?;
1217+
match token {
1218+
Token::String(s, _) => Ok(Value::String(s)),
1219+
Token::Integer(i) => Ok(Number::from(i).into()),
1220+
Token::Number(n) => {
1221+
if n.is_finite() && n.fract() == 0.0 && n.abs() <= i64::MAX as f64 {
1222+
Ok(Number::from(n as i64).into())
1223+
} else {
1224+
Ok(Number::from_f64(n)
1225+
.ok_or_else(|| ToonError::InvalidInput(format!("Invalid number: {n}")))?
1226+
.into())
11691227
}
1170-
accumulated.push_str(next);
1171-
self.advance()?;
11721228
}
1173-
1174-
Ok(Value::String(accumulated))
1229+
Token::Bool(b) => Ok(Value::Bool(b)),
1230+
Token::Null => Ok(Value::Null),
1231+
_ => Err(ToonError::InvalidInput("Unexpected token type".to_string())),
11751232
}
1176-
_ => Err(self.parse_error_with_context(format!(
1177-
"Expected primitive value, found {:?}",
1178-
self.current_token
1179-
))),
11801233
}
11811234
}
11821235

@@ -1695,7 +1748,7 @@ hello: 0(f)"#;
16951748
// Issue #56: Array elements starting with a number should be parsed as string
16961749
// when followed by non-numeric text
16971750
let result = parse("version1[1]: 1.0 something").unwrap();
1698-
assert_eq!(result["version1"], json!(["1 something"]));
1751+
assert_eq!(result["version1"], json!(["1.0 something"]));
16991752

17001753
let result = parse("data[1]: 42 units").unwrap();
17011754
assert_eq!(result["data"], json!(["42 units"]));
@@ -1707,4 +1760,64 @@ hello: 0(f)"#;
17071760
let result = parse("nums[1]: 2.75").unwrap();
17081761
assert_eq!(result["nums"], json!([2.75]));
17091762
}
1763+
1764+
#[test]
1765+
fn test_issue_59_multiple_spaces_preserved() {
1766+
// Issue #59: Multiple spaces between words should be preserved
1767+
// Field value context
1768+
let result = parse("key: a b").unwrap();
1769+
assert_eq!(result["key"], json!("a b"));
1770+
1771+
// Tabular cell context
1772+
let result = parse("data[2]: a b, c d").unwrap();
1773+
assert_eq!(result["data"], json!(["a b", "c d"]));
1774+
1775+
// Root-level value
1776+
let result = parse("a b").unwrap();
1777+
assert_eq!(result, json!("a b"));
1778+
}
1779+
1780+
#[test]
1781+
fn test_issue_60_mixed_type_tokens_as_string() {
1782+
// Issue #60: "1 null" and "a 1" should parse as strings in tabular rows
1783+
// Tabular cell context
1784+
let result = parse("data[2]: 1 null, a 1").unwrap();
1785+
assert_eq!(result["data"], json!(["1 null", "a 1"]));
1786+
1787+
// Root-level value
1788+
let result = parse("1 null").unwrap();
1789+
assert_eq!(result, json!("1 null"));
1790+
1791+
let result = parse("a 1").unwrap();
1792+
assert_eq!(result, json!("a 1"));
1793+
1794+
// Field value context
1795+
let result = parse("key: 1 null").unwrap();
1796+
assert_eq!(result["key"], json!("1 null"));
1797+
1798+
let result = parse("key: a 1").unwrap();
1799+
assert_eq!(result["key"], json!("a 1"));
1800+
}
1801+
1802+
#[test]
1803+
fn test_issue_61_number_format_preserved() {
1804+
// Issue #61: "1.0 b" should preserve "1.0", not become "1 b"
1805+
// Tabular cell context
1806+
let result = parse("data[2]: 1.0 b, 1e1 b").unwrap();
1807+
assert_eq!(result["data"], json!(["1.0 b", "1e1 b"]));
1808+
1809+
// Field value context
1810+
let result = parse("key: 1.0 b").unwrap();
1811+
assert_eq!(result["key"], json!("1.0 b"));
1812+
1813+
let result = parse("key: 1e1 b").unwrap();
1814+
assert_eq!(result["key"], json!("1e1 b"));
1815+
1816+
// Root-level value
1817+
let result = parse("1.0 b").unwrap();
1818+
assert_eq!(result, json!("1.0 b"));
1819+
1820+
let result = parse("1e1 b").unwrap();
1821+
assert_eq!(result, json!("1e1 b"));
1822+
}
17101823
}

0 commit comments

Comments
 (0)