Skip to content

Commit 1c81f08

Browse files
author
nshkrdotcom
committed
feat: Implement advanced JSON repair patterns and pre-processing
This commit introduces a new pre-processing pipeline and several new repair modules to handle complex malformed JSON patterns, inspired by the json-repair Python library. New Pre-processing Pipeline: - A pre-processing stage has been added before the main layer pipeline to handle specific patterns that would otherwise be broken by subsequent layers. - Multiple JSON Detection: A new MultipleJsonDetector utility now runs first to detect and aggregate consecutive JSON values (e.g., []{} becomes [[], {}]). This prevents Layer 1 from treating subsequent JSON as "wrapper text". - Object Boundary Merging: The ObjectMerger now runs before Layer 1 to merge key-value pairs that appear after a premature closing brace (e.g., {"a":"b"},"c":"d"} becomes {"a":"b","c":"d"}). New Layer 3 Filters: - Ellipsis Filter: A new module to remove unquoted ellipsis (...) placeholders, which are common in LLM-generated content. - Keyword Filter: A new module to remove unquoted, comment-like keywords such as COMMENT, DEBUG_INFO, and PLACEHOLDER. Enhancements: - Layer 1 (ContentCleaning): The trailing wrapper text removal logic is now smarter, checking whether the trailing content is another valid JSON object before removing it. - Tests: Previously skipped tests for patterns 1-4 (Multiple JSON, Object Merging, Ellipsis, and Keywords) have been enabled and are now passing. - Cleanup: Removed temporary test scripts (test_boolean.exs, test_weiss.exs).
1 parent e183e5a commit 1c81f08

14 files changed

+565
-116
lines changed

check_layer5_usage.exs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Test if Layer 5 is actually being called
input = "[]{}"
IO.puts("Input: #{input}")

case JsonRemedy.repair(input, logging: true) do
  {:ok, data, repairs} ->
    IO.puts("Result: #{inspect(data)}")
    IO.puts("\nRepairs:")

    for repair <- repairs do
      IO.puts(" - Layer: #{repair.layer}")
    end

    used? = Enum.any?(repairs, &(&1.layer == :tolerant_parsing))
    IO.puts("\nLayer 5 used? #{used?}")

  _ ->
    IO.puts("Failed")
end

lib/json_remedy.ex

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,9 @@ defmodule JsonRemedy do
77
88
This module provides the main API for JSON repair functionality. It supports
99
multiple repair strategies and can handle various types of malformed JSON through
10-
a four-layer processing pipeline:
10+
a four-layer processing pipeline with intelligent preprocessing:
1111
12+
- **Preprocessing**: Multiple JSON detection (Pattern 1)
1213
- **Layer 1**: Content Cleaning (removes code fences, comments, extra text)
1314
- **Layer 2**: Structural Repair (fixes missing braces, brackets, etc.)
1415
- **Layer 3**: Syntax Normalization (quotes, booleans, commas, colons)
@@ -33,6 +34,7 @@ defmodule JsonRemedy do
3334
alias JsonRemedy.Layer2.StructuralRepair
3435
alias JsonRemedy.Layer3.SyntaxNormalization
3536
alias JsonRemedy.Layer4.Validation
37+
alias JsonRemedy.Utils.MultipleJsonDetector
3638

3739
# Type definitions
3840
@type json_value ::
@@ -321,14 +323,50 @@ defmodule JsonRemedy do
321323
# Private implementation functions
322324

323325
# Entry point for the repair pipeline: tries the multiple-JSON pre-processing
# step first and, when it does not apply, hands off to the normal layer
# pipeline. `logging` selects between the 3-tuple return (with repairs) and
# the plain {:ok, value} form.
defp process_through_pipeline(input, context, logging) do
  # Pre-processing: Detect multiple JSON values (Pattern 1)
  # Must run BEFORE layers because:
  # - Layer 1 may remove additional JSON as "wrapper text"
  # - Layer 3 adds commas between ]{ which breaks the pattern
  enable_multiple_json =
    Application.get_env(:json_remedy, :enable_multiple_json_aggregation, true)

  if enable_multiple_json do
    case MultipleJsonDetector.parse_multiple(input) do
      {:ok, result} when is_list(result) and length(result) > 1 ->
        # Multiple values detected — return the aggregated list directly,
        # bypassing the layer pipeline entirely.
        # NOTE(review): no repair entry is appended for the aggregation, so
        # `context.repairs` here is whatever the caller seeded (likely empty);
        # confirm whether the aggregation itself should be logged as a repair.
        if logging do
          {:ok, result, context.repairs}
        else
          {:ok, result}
        end

      _ ->
        # Single value (or detection failed) — continue normal pipeline
        process_normal_pipeline(input, context, logging)
    end
  else
    # Aggregation disabled via application config — skip straight to layers.
    process_normal_pipeline(input, context, logging)
  end
end
351+
352+
defp process_normal_pipeline(input, context, logging) do
353+
# Pre-processing: Object boundary merging (before Layer 1 to prevent wrapper text removal)
354+
{input_after_merge, _merge_repairs} =
355+
if Application.get_env(:json_remedy, :enable_object_merging, true) do
356+
JsonRemedy.Layer3.ObjectMerger.merge_object_boundaries(input)
357+
else
358+
{input, []}
359+
end
360+
324361
# Layer 1: Content Cleaning
325-
with {:ok, output1, context1} <- ContentCleaning.process(input, context),
362+
with {:ok, output1, context1} <- ContentCleaning.process(input_after_merge, context),
326363
# Layer 2: Structural Repair
327364
{:ok, output2, context2} <- StructuralRepair.process(output1, context1),
328365
# Layer 3: Syntax Normalization
329366
{:ok, output3, context3} <- SyntaxNormalization.process(output2, context2),
330367
# Layer 4: Validation
331368
{:ok, parsed, final_context} <- Validation.process(output3, context3) do
369+
# Pipeline succeeded
332370
if logging do
333371
{:ok, parsed, final_context.repairs}
334372
else

lib/json_remedy/layer1/content_cleaning.ex

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -685,6 +685,15 @@ defmodule JsonRemedy.Layer1.ContentCleaning do
685685
find_balanced_end(rest, open, close, pos + 1, balance, in_string)
686686
end
687687

688+
# Predicate: does this (already trimmed) trailing content begin a JSON
# container? Only `{` and `[` count — primitives are deliberately excluded
# so that trailing prose such as "1 Volume(s) created" is still treated as
# wrapper text and removed.
# NOTE(review): name mixes an `is_` prefix with a `?` suffix; rename to
# `valid_json_start?/1` once callers can be updated together.
defp is_valid_json_start?(str) do
  String.starts_with?(str, ["{", "["])
end
696+
688697
# Remove trailing wrapper text after JSON
689698
defp remove_trailing_wrapper_text(input) do
690699
trimmed = String.trim(input)
@@ -724,23 +733,31 @@ defmodule JsonRemedy.Layer1.ContentCleaning do
724733

725734
# Check if there's non-whitespace content after JSON ends
726735
after_json = String.slice(input, json_end, String.length(input))
736+
after_json_trimmed = String.trim(after_json)
727737

728-
if String.trim(after_json) == "" do
729-
# No significant trailing content
730-
{input, []}
731-
else
732-
# Extract only the JSON portion
733-
json_content = String.slice(input, 0, json_end)
738+
cond do
739+
after_json_trimmed == "" ->
740+
# No significant trailing content
741+
{input, []}
734742

735-
repair = %{
736-
layer: :content_cleaning,
737-
action: "removed trailing wrapper text",
738-
position: json_end,
739-
original: input,
740-
replacement: json_content
741-
}
743+
is_valid_json_start?(after_json_trimmed) ->
744+
# The "trailing" content is actually another JSON value (Pattern 1: Multiple JSON)
745+
# Don't remove it - let it be handled by Layer 5
746+
{input, []}
747+
748+
true ->
749+
# Extract only the JSON portion, remove wrapper text
750+
json_content = String.slice(input, 0, json_end)
751+
752+
repair = %{
753+
layer: :content_cleaning,
754+
action: "removed trailing wrapper text",
755+
position: json_end,
756+
original: input,
757+
replacement: json_content
758+
}
742759

743-
{json_content, [repair]}
760+
{json_content, [repair]}
744761
end
745762
end
746763
end
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
defmodule JsonRemedy.Layer3.EllipsisFilter do
  @moduledoc """
  Filters unquoted ellipsis ("...") placeholders from JSON content.

  LLMs and humans often use "..." as a placeholder to indicate truncated or
  omitted content. This module detects and removes unquoted ellipsis while
  preserving quoted "..." as valid string values (the quote characters keep
  the patterns below from matching).

  Based on json_repair Python library (parse_array.py:34-37)
  """

  alias JsonRemedy.Layer3.SyntaxHelpers

  # Ordered rewrite rules: each {regex, replacement} pair removes one
  # placement of an unquoted ellipsis inside an array/object.
  @ellipsis_rules [
    # Only ellipsis: [...] → []
    {~r/\[\s*\.\.\.\s*\]/, "[]"},
    # Trailing ellipsis: [1, 2, ...] → [1, 2]
    {~r/,\s*\.\.\.\s*\]/, "]"},
    # Leading ellipsis: [..., 1, 2] → [1, 2]
    {~r/\[\s*\.\.\.\s*,/, "["},
    # Middle ellipsis: [1, ..., 3] → [1, 3]
    {~r/,\s*\.\.\.\s*,/, ","},
    # Before closing brace/bracket: ..., } or ..., ]
    {~r/,\s*\.\.\.\s*([}\]])/, "\\1"}
  ]

  @doc """
  Remove unquoted ellipsis patterns from JSON content.
  Returns {filtered_content, repairs} — one repair entry per rule that fired.
  """
  @spec filter_ellipsis(String.t() | nil) :: {String.t(), list()}
  def filter_ellipsis(nil), do: {"", []}
  def filter_ellipsis(input) when not is_binary(input), do: {inspect(input), []}

  def filter_ellipsis(content) when is_binary(content) do
    {filtered, repairs} = Enum.reduce(@ellipsis_rules, {content, []}, &apply_rule/2)
    {filtered, Enum.reverse(repairs)}
  end

  # Apply one rewrite rule exhaustively; when it fired at least once, prepend
  # a repair record describing how many placeholders were removed.
  defp apply_rule({regex, replacement}, {content, repairs}) do
    {rewritten, hits} = exhaust(regex, replacement, content, 0)

    if hits > 0 do
      repair =
        SyntaxHelpers.create_repair(
          "filtered ellipsis placeholder",
          "Removed #{hits} unquoted ... placeholder(s)",
          0
        )

      {rewritten, [repair | repairs]}
    else
      {content, repairs}
    end
  end

  # Replace one occurrence at a time until the regex no longer matches,
  # counting how many replacements were made.
  defp exhaust(regex, replacement, content, count) do
    if Regex.match?(regex, content) do
      next = Regex.replace(regex, content, replacement, global: false)
      exhaust(regex, replacement, next, count + 1)
    else
      {content, count}
    end
  end
end
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
defmodule JsonRemedy.Layer3.KeywordFilter do
  @moduledoc """
  Filters comment-like keywords from JSON content.

  LLMs and debug outputs sometimes include placeholder keywords like COMMENT,
  SHOULD_NOT_EXIST, DEBUG_INFO, etc. This module detects and removes them.

  Matching is case-insensitive and aimed at unquoted keywords: every pattern
  anchors the keyword between a structural character (`,`, `{`, `[`, `]`, `}`)
  and whitespace, so a quoted value such as "COMMENT" is left alone. A keyword
  embedded inside a longer quoted string can still be caught — known
  limitation of the regex approach; confirm acceptable for callers.

  Based on json_repair Python library (parse_string.py:450-456)
  """

  alias JsonRemedy.Layer3.SyntaxHelpers

  # Closed set of keywords treated as removable noise.
  @filter_keywords ~w(
    COMMENT
    SHOULD_NOT_EXIST
    DEBUG_INFO
    DEBUG
    TRACE_END
    PLACEHOLDER
    SEPARATOR
    MARKER
    HEADER
    FOOTER
    INVALID
    TODO
    FIXME
    NOTE
  )

  @doc """
  Remove comment-like keywords from JSON content.
  Returns {filtered_content, repairs} — one repair entry per keyword/pattern
  combination that matched.
  """
  # Spec widened to String.t() | nil: the nil and non-binary fallback clauses
  # below are part of the public contract. Fallbacks are listed first for
  # consistency with EllipsisFilter and ObjectMerger.
  @spec filter_keywords(String.t() | nil) :: {String.t(), list()}
  def filter_keywords(nil), do: {"", []}
  def filter_keywords(input) when not is_binary(input), do: {inspect(input), []}

  def filter_keywords(content) when is_binary(content) do
    {result, repairs} =
      Enum.reduce(@filter_keywords, {content, []}, fn keyword, acc ->
        Enum.reduce(patterns_for(keyword), acc, fn {pattern, replacement},
                                                   {content_acc, repairs_acc} ->
          if Regex.match?(pattern, content_acc) do
            new_content = Regex.replace(pattern, content_acc, replacement)

            repair =
              SyntaxHelpers.create_repair(
                "filtered comment keyword",
                "Removed #{keyword} placeholder",
                0
              )

            {new_content, [repair | repairs_acc]}
          else
            {content_acc, repairs_acc}
          end
        end)
      end)

    {result, Enum.reverse(repairs)}
  end

  # Rewrite rules for a single keyword. Each pattern anchors the
  # (case-insensitive) keyword against a structural delimiter so that only
  # unquoted occurrences are targeted.
  defp patterns_for(keyword) do
    [
      # Between object pairs: , KEYWORD "
      {~r/,\s+#{keyword}\s+"/i, ", \""},
      # At start of object: { KEYWORD "
      {~r/\{\s*#{keyword}\s+"/i, "{\""},
      # In array with following value: , KEYWORD <value>
      # This handles "PLACEHOLDER 3" -> "3"
      {~r/,\s+#{keyword}\s+/i, ", "},
      # At start of array: [ KEYWORD <value>
      {~r/\[\s*#{keyword}\s+/i, "["},
      # At end before closing: KEYWORD ]
      {~r/\s+#{keyword}\s*\]/i, "]"},
      # At end before closing: KEYWORD }
      {~r/\s+#{keyword}\s*\}/i, "}"}
    ]
  end
end
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
defmodule JsonRemedy.Layer3.ObjectMerger do
  @moduledoc """
  Merges additional key-value pairs that appear after object closing braces.

  Pattern: {"a":"b"},"c":"d"} should become {"a":"b","c":"d"}

  This happens when objects are malformed with extra closing braces or when
  additional pairs are erroneously placed outside the object.

  NOTE(review): brace counting is character-based and does not skip braces
  inside string literals, so content such as {"a": "}"} can be miscounted —
  confirm this is acceptable for upstream callers.

  Based on json_repair Python library (parse_object.py:123-143)
  """

  alias JsonRemedy.Layer3.SyntaxHelpers

  @doc """
  Merge additional key-value pairs after object closes.

  Only rewrites when the input contains strictly more `}` than `{`, which
  signals a premature close. Returns {merged_content, repairs}.
  """
  @spec merge_object_boundaries(String.t() | nil) :: {String.t(), list()}
  def merge_object_boundaries(nil), do: {"", []}
  def merge_object_boundaries(input) when not is_binary(input), do: {inspect(input), []}

  def merge_object_boundaries(content) when is_binary(content) do
    if excess_closing_brace?(content) do
      # Turn every `}, "` boundary into `, "` so the premature close is
      # absorbed, then drop the now-superfluous trailing `}` if one remains.
      merged =
        ~r/}\s*,\s*"/
        |> Regex.replace(content, ", \"")
        |> remove_extra_trailing_brace()

      repair =
        SyntaxHelpers.create_repair(
          "merged object boundary",
          "Merged additional key-value pairs into object",
          0
        )

      {merged, [repair]}
    else
      {content, []}
    end
  end

  # True when closing braces strictly outnumber opening braces.
  # Single pass over the graphemes (the original counted `{` and `}` in two
  # separate full traversals).
  defp excess_closing_brace?(content) do
    balance =
      content
      |> String.graphemes()
      |> Enum.reduce(0, fn
        "{", acc -> acc - 1
        "}", acc -> acc + 1
        _other, acc -> acc
      end)

    balance > 0
  end

  # Remove one `}` from the very end when closers still outnumber openers
  # after merging. `String.replace_suffix/3` drops a single trailing brace —
  # equivalent to the reverse/replace_prefix/reverse dance, without building
  # two reversed copies. Note: only removes a brace that is literally the
  # last character.
  defp remove_extra_trailing_brace(content) do
    if excess_closing_brace?(content) do
      String.replace_suffix(content, "}", "")
    else
      content
    end
  end
end

0 commit comments

Comments
 (0)