|
1 | 1 | # frozen_string_literal: true |
2 | 2 |
|
| 3 | +require "json" |
| 4 | + |
3 | 5 | module DiscourseAi |
4 | 6 | module Utils |
5 | 7 | class BestEffortJsonParser |
6 | | - def self.extract_key(helper_response, schema_type, schema_key) |
7 | | - schema_type = schema_type.to_sym |
8 | | - schema_key = schema_key.to_sym |
| 8 | + class << self |
| 9 | + def extract_key(helper_response, schema_type, schema_key) |
| 10 | + return helper_response unless helper_response.is_a?(String) |
9 | 11 |
|
10 | | - return helper_response unless helper_response.is_a?(String) |
| 12 | + schema_type = schema_type.to_sym |
| 13 | + schema_key = schema_key&.to_sym |
| 14 | + cleaned = remove_markdown_fences(helper_response.strip) |
11 | 15 |
|
12 | | - # First attempt: try to parse after removing markdown fences |
13 | | - cleaned = helper_response.strip |
| 16 | + parsed = |
| 17 | + try_parse(cleaned) || try_parse(fix_common_issues(cleaned)) || |
| 18 | + manual_extract(cleaned, schema_key, schema_type) |
14 | 19 |
|
15 | | - # Remove markdown code fences |
16 | | - if cleaned.match?(/^```(?:json)?\s*\n/i) |
17 | | - cleaned = cleaned.gsub(/^```(?:json)?\s*\n/i, "").gsub(/\n```\s*$/, "") |
| 20 | + value = parsed.is_a?(Hash) ? parsed[schema_key.to_s] : parsed |
| 21 | + parsed = cast_value(value, schema_type) |
18 | 22 | end |
19 | 23 |
|
20 | | - # Try standard JSON parse |
21 | | - begin |
22 | | - parsed = JSON.parse(cleaned) |
23 | | - return extract_value(parsed, schema_key, schema_type) |
24 | | - rescue JSON::ParserError |
25 | | - # Continue to next attempt |
| 24 | + private |
| 25 | + |
| 26 | + def remove_markdown_fences(text) |
| 27 | + return text unless text.match?(/^```(?:json)?\s*\n/i) |
| 28 | + |
| 29 | + text.gsub(/^```(?:json)?\s*\n/i, "").gsub(/\n```\s*$/, "") |
26 | 30 | end |
27 | 31 |
|
28 | | - # Second attempt: fix common JSON issues |
29 | | - fixed_json = |
30 | | - cleaned.gsub(/(\w+):/, '"\1":') # Fix unquoted keys |
31 | | - .gsub(/'/, '\"') # Replace single quotes with double quotes |
| 32 | + def fix_common_issues(text) |
| 33 | + text.gsub(/(\w+):/, '"\1":').gsub(/'/, "\"") |
| 34 | + end |
32 | 35 |
|
33 | | - begin |
34 | | - parsed = JSON.parse(fixed_json) |
35 | | - return extract_value(parsed, schema_key, schema_type) |
| 36 | + def try_parse(text) |
| 37 | + JSON.parse(text) |
36 | 38 | rescue JSON::ParserError |
37 | | - # Continue to manual extraction |
| 39 | + nil |
38 | 40 | end |
39 | 41 |
|
40 | | - # Third attempt: manual extraction based on key |
41 | | - if schema_key |
42 | | - key_str = schema_key.to_s |
43 | | - |
44 | | - # Look for the key in various formats |
45 | | - patterns = [ |
46 | | - /"#{key_str}"\s*:\s*"([^"]+)"/, # "key": "value" |
47 | | - /'#{key_str}'\s*:\s*'([^']+)'/, # 'key': 'value' |
48 | | - /#{key_str}\s*:\s*"([^"]+)"/, # key: "value" |
49 | | - /#{key_str}\s*:\s*'([^']+)'/, # key: 'value' |
50 | | - /"#{key_str}"\s*:\s*\[([^\]]+)\]/, # "key": [array] |
51 | | - /'#{key_str}'\s*:\s*\[([^\]]+)\]/, # 'key': [array] |
52 | | - /#{key_str}\s*:\s*\[([^\]]+)\]/, # key: [array] |
53 | | - ] |
54 | | - |
55 | | - # For objects, handle separately to deal with nesting |
56 | | - object_patterns = [ |
57 | | - /"#{key_str}"\s*:\s*\{/, # "key": { |
58 | | - /'#{key_str}'\s*:\s*\{/, # 'key': { |
59 | | - /#{key_str}\s*:\s*\{/, # key: { |
60 | | - ] |
61 | | - |
62 | | - # Try string/array patterns first |
63 | | - patterns.each do |pattern| |
64 | | - if match = helper_response.match(pattern) |
65 | | - value = match[1] |
66 | | - |
67 | | - case schema_type |
68 | | - when :string |
69 | | - return value |
70 | | - when :array |
71 | | - begin |
72 | | - return JSON.parse("[#{value}]") |
73 | | - rescue StandardError |
74 | | - # Try to split by comma and clean up |
75 | | - items = value.split(",").map { |item| item.strip.gsub(/^['"]|['"]$/, "") } |
76 | | - return items |
77 | | - end |
78 | | - end |
79 | | - end |
| 42 | + def manual_extract(text, key, schema_type) |
| 43 | + return default_for(schema_type) unless key |
| 44 | + |
| 45 | + case schema_type |
| 46 | + when :object |
| 47 | + extract_object(text, key.to_s) |
| 48 | + when :array, :string |
| 49 | + extract_scalar(text, key.to_s, schema_type) |
| 50 | + else |
| 51 | + default_for(schema_type) |
80 | 52 | end |
| 53 | + end |
81 | 54 |
|
82 | | - # Try object patterns |
83 | | - if schema_type == :object |
84 | | - object_patterns.each do |pattern| |
85 | | - if match = helper_response.match(pattern) |
86 | | - # Find the starting brace position after the key |
87 | | - start_pos = match.end(0) - 1 # Position of the opening brace |
88 | | - if start_pos >= 0 && helper_response[start_pos] == "{" |
89 | | - # Extract the full object by counting braces |
90 | | - brace_count = 0 |
91 | | - end_pos = start_pos |
92 | | - |
93 | | - helper_response[start_pos..-1].each_char.with_index do |char, idx| |
94 | | - if char == "{" |
95 | | - brace_count += 1 |
96 | | - elsif char == "}" |
97 | | - brace_count -= 1 |
98 | | - if brace_count == 0 |
99 | | - end_pos = start_pos + idx |
100 | | - break |
101 | | - end |
102 | | - end |
103 | | - end |
104 | | - |
105 | | - if brace_count == 0 |
106 | | - object_str = helper_response[start_pos..end_pos] |
107 | | - begin |
108 | | - return JSON.parse(object_str) |
109 | | - rescue StandardError |
110 | | - # Try to fix and parse |
111 | | - fixed = object_str.gsub(/(\w+):/, '"\1":').gsub(/'/, '"') |
112 | | - begin |
113 | | - return JSON.parse(fixed) |
114 | | - rescue StandardError |
115 | | - return {} |
116 | | - end |
117 | | - end |
118 | | - end |
119 | | - end |
120 | | - end |
| 55 | + def extract_scalar(text, key, schema_type) |
| 56 | + patterns = |
| 57 | + if schema_type == :array |
| 58 | + [ |
| 59 | + /"#{key}"\s*:\s*\[([^\]]+)\]/, |
| 60 | + /'#{key}'\s*:\s*\[([^\]]+)\]/, |
| 61 | + /#{key}\s*:\s*\[([^\]]+)\]/, |
| 62 | + ] |
| 63 | + else |
| 64 | + [ |
| 65 | + /"#{key}"\s*:\s*"([^"]+)"/, |
| 66 | + /'#{key}'\s*:\s*'([^']+)'/, |
| 67 | + /#{key}\s*:\s*"([^"]+)"/, |
| 68 | + /#{key}\s*:\s*'([^']+)'/, |
| 69 | + ] |
121 | 70 | end |
| 71 | + |
| 72 | + patterns.each do |pattern| |
| 73 | + match = text.match(pattern) |
| 74 | + next unless match |
| 75 | + |
| 76 | + value = match[1] |
| 77 | + return schema_type == :array ? parse_array(value) : value |
122 | 78 | end |
| 79 | + |
| 80 | + default_for(schema_type) |
123 | 81 | end |
124 | 82 |
|
125 | | - case schema_type |
126 | | - when :array |
127 | | - [] |
128 | | - when :object |
129 | | - {} |
130 | | - else |
131 | | - "" |
| 83 | + def parse_array(value) |
| 84 | + JSON.parse("[#{value}]") |
| 85 | + rescue JSON::ParserError |
| 86 | + value.split(",").map { |item| item.strip.gsub(/^['"]|['"]$/, "") } |
| 87 | + end |
| 88 | + |
| 89 | + def extract_object(text, key) |
| 90 | + pattern = /("#{key}"|'#{key}'|#{key})\s*:\s*\{/ |
| 91 | + match = text.match(pattern) or return {} |
| 92 | + |
| 93 | + start = match.end(0) - 1 |
| 94 | + return {} unless text[start] == "{" |
| 95 | + |
| 96 | + end_pos = find_matching_brace(text, start) |
| 97 | + return {} unless end_pos |
| 98 | + |
| 99 | + obj_str = text[start..end_pos] |
| 100 | + try_parse(obj_str) || try_parse(fix_common_issues(obj_str)) || {} |
132 | 101 | end |
133 | | - end |
134 | 102 |
|
135 | | - def self.extract_value(parsed, schema_key, schema_type) |
136 | | - return parsed unless parsed.is_a?(Hash) && schema_key |
| 103 | + def find_matching_brace(text, start_pos) |
| 104 | + brace_count = 0 |
137 | 105 |
|
138 | | - value = parsed[schema_key.to_s] |
| 106 | + text[start_pos..-1].each_char.with_index do |char, idx| |
| 107 | + brace_count += 1 if char == "{" |
| 108 | + if char == "}" |
| 109 | + brace_count -= 1 |
| 110 | + return start_pos + idx if brace_count.zero? |
| 111 | + end |
| 112 | + end |
| 113 | + nil |
| 114 | + end |
| 115 | + |
| 116 | + def cast_value(value, schema_type) |
| 117 | + case schema_type |
| 118 | + when :array |
| 119 | + value.is_a?(Array) ? value : [] |
| 120 | + when :object |
| 121 | + value.is_a?(Hash) ? value : {} |
| 122 | + else |
| 123 | + value.to_s |
| 124 | + end |
| 125 | + end |
139 | 126 |
|
140 | | - case schema_type |
141 | | - when :array |
142 | | - value.is_a?(Array) ? value : [] |
143 | | - when :object |
144 | | - value.is_a?(Hash) ? value : {} |
145 | | - else |
146 | | - value.to_s |
| 127 | + def default_for(schema_type) |
| 128 | + schema_type == :array ? [] : schema_type == :object ? {} : "" |
147 | 129 | end |
148 | 130 | end |
149 | 131 | end |
|
0 commit comments