@@ -11,24 +11,14 @@ defmodule JsonRemedy.Layer1.ContentCleaning do
1111 Uses regex-based processing as it's the right tool for these content cleaning tasks.
1212 """
1313
14- @ type repair_action :: % {
15- layer: atom ( ) ,
16- action: String . t ( ) ,
17- position: non_neg_integer ( ) | nil ,
18- original: String . t ( ) | nil ,
19- replacement: String . t ( ) | nil
20- }
21-
22- @ type repair_context :: % {
23- repairs: [ repair_action ( ) ] ,
24- options: keyword ( ) ,
25- metadata: map ( )
26- }
27-
28- @ type layer_result ::
29- { :ok , String . t ( ) , repair_context ( ) }
30- | { :continue , String . t ( ) , repair_context ( ) }
31- | { :error , String . t ( ) }
14+ @ behaviour JsonRemedy.LayerBehaviour
15+
16+ alias JsonRemedy.LayerBehaviour
17+
18+ # Import types from LayerBehaviour
19+ @ type repair_action :: LayerBehaviour . repair_action ( )
20+ @ type repair_context :: LayerBehaviour . repair_context ( )
21+ @ type layer_result :: LayerBehaviour . layer_result ( )
3222
3323 @ doc """
3424 Process input string and apply Layer 1 content cleaning repairs.
@@ -44,8 +34,8 @@ defmodule JsonRemedy.Layer1.ContentCleaning do
4434 input
4535 |> remove_code_fences ( )
4636 |> remove_comments ( )
47- |> extract_json_content ( )
48- |> normalize_encoding ( )
37+ |> extract_json_content_internal ( )
38+ |> normalize_encoding_internal ( )
4939
5040 updated_context = % {
5141 repairs: context . repairs ++ new_repairs ,
@@ -118,9 +108,9 @@ defmodule JsonRemedy.Layer1.ContentCleaning do
118108 @ doc """
119109 Extract JSON from wrapper text (HTML, prose, etc.).
120110 """
121- @ spec extract_json_content ( input :: { String . t ( ) , [ repair_action ( ) ] } ) ::
111+ @ spec extract_json_content_internal ( input :: { String . t ( ) , [ repair_action ( ) ] } ) ::
122112 { String . t ( ) , [ repair_action ( ) ] }
123- def extract_json_content ( { input , existing_repairs } ) do
113+ def extract_json_content_internal ( { input , existing_repairs } ) do
124114 # Try to extract JSON from HTML tags first
125115 { result , html_repairs } = extract_from_html_tags ( input )
126116
@@ -134,9 +124,9 @@ defmodule JsonRemedy.Layer1.ContentCleaning do
134124 @ doc """
135125 Normalize text encoding to UTF-8.
136126 """
137- @ spec normalize_encoding ( input :: { String . t ( ) , [ repair_action ( ) ] } ) ::
127+ @ spec normalize_encoding_internal ( input :: { String . t ( ) , [ repair_action ( ) ] } ) ::
138128 { String . t ( ) , [ repair_action ( ) ] }
139- def normalize_encoding ( { input , existing_repairs } ) do
129+ def normalize_encoding_internal ( { input , existing_repairs } ) do
140130 if String . valid? ( input ) do
141131 { input , existing_repairs }
142132 else
@@ -156,6 +146,113 @@ defmodule JsonRemedy.Layer1.ContentCleaning do
156146 end
157147 end
158148
149+ # LayerBehaviour callback implementations
150+
@doc """
Report whether Layer 1 has anything to work on for `input`.

Returns `true` for a binary that contains at least one of the wrapper
markers checked below (a code fence, `//`, `/*`, `<pre>` or `<code>`), or
for which `long_text_with_content?/1` returns `true`. Any non-binary
argument yields `false`.

Only substring checks are performed here; no parsing is attempted.
"""
@spec supports?(input :: String.t()) :: boolean()
def supports?(input) when is_binary(input) do
  # A single multi-pattern lookup covers every marker this layer reacts to.
  wrapper_markers = ["```", "//", "/*", "<pre>", "<code>"]

  String.contains?(input, wrapper_markers) or long_text_with_content?(input)
end

def supports?(_not_a_binary), do: false
169+
@doc """
Priority value reported by this layer.

Always returns `1`; Layer 1 (Content Cleaning) is intended to be the first
layer in the pipeline.
"""
@spec priority() :: non_neg_integer()
def priority do
  1
end
176+
@doc """
Human-readable label for this layer.

Always returns `"Content Cleaning"`.
"""
@spec name() :: String.t()
def name do
  "Content Cleaning"
end
182+
@doc """
Validate layer configuration and options.

Layer 1 accepts the boolean switches `:remove_comments`,
`:remove_code_fences`, `:extract_from_html` and `:normalize_encoding`.

Returns `:ok` when `options` is a keyword list whose keys are all among
those switches and whose values are all booleans. A key may appear more
than once, as keyword lists permit. Otherwise returns `{:error, message}`:

  * `options` is not a keyword list (including a plain list such as
    `[1, 2]`);
  * an unknown key is present;
  * a known key carries a non-boolean value.
"""
@spec validate_options(options :: keyword()) :: :ok | {:error, String.t()}
def validate_options(options) when is_list(options) do
  valid_keys = [:remove_comments, :remove_code_fences, :extract_from_html, :normalize_encoding]

  # `is_list/1` alone also admits lists like `[1, 2]`, on which `Keyword.keys/1`
  # would raise; report those through the error tuple instead.
  if Keyword.keyword?(options) do
    # Filter by membership rather than `--`: list subtraction removes only one
    # occurrence per element, so a repeated valid key would be flagged as invalid.
    case Enum.reject(Keyword.keys(options), &(&1 in valid_keys)) do
      [] ->
        validate_option_values(options)

      invalid_keys ->
        {:error,
         "Invalid options: #{inspect(invalid_keys)}. Valid options: #{inspect(valid_keys)}"}
    end
  else
    {:error, "Options must be a keyword list"}
  end
end

def validate_options(_), do: {:error, "Options must be a keyword list"}

# Returns `:ok`, or an error tuple describing the first boolean option whose
# value is not a boolean.
defp validate_option_values(options) do
  boolean_options = [
    :remove_comments,
    :remove_code_fences,
    :extract_from_html,
    :normalize_encoding
  ]

  offending =
    Enum.find(options, fn {key, value} ->
      key in boolean_options and not is_boolean(value)
    end)

  case offending do
    nil -> :ok
    {key, value} -> {:error, "Option #{key} must be a boolean, got: #{inspect(value)}"}
  end
end
222+
223+ # Public API functions that match the API contracts
224+
@doc """
Public, string-based entry point for comment removal.

Wraps `input` together with an empty repair list, hands the pair to
`remove_comments/1`, and returns the resulting `{text, repairs}` tuple.
Per the API contract, comment-like content inside strings is meant to be
preserved; that behaviour lives in `remove_comments/1`.
"""
@spec strip_comments(input :: String.t()) :: {String.t(), [repair_action()]}
def strip_comments(input) when is_binary(input), do: remove_comments({input, []})
233+
@doc """
Public, string-based entry point for extracting JSON from wrapper text
(HTML, prose, etc.).

Starts from an empty repair list and delegates to
`extract_json_content_internal/1`, the tuple-based pipeline step, returning
its `{text, repairs}` result.
"""
@spec extract_json_content(input :: String.t()) :: {String.t(), [repair_action()]}
def extract_json_content(input) when is_binary(input),
  do: extract_json_content_internal({input, []})
244+
@doc """
Public, string-based entry point for UTF-8 normalization.

Starts from an empty repair list and delegates to
`normalize_encoding_internal/1`, the tuple-based pipeline step, returning
its `{text, repairs}` result. Input that is already valid UTF-8 comes back
unchanged with no repairs recorded.
"""
@spec normalize_encoding(input :: String.t()) :: {String.t(), [repair_action()]}
def normalize_encoding(input) when is_binary(input),
  do: normalize_encoding_internal({input, []})
255+
159256 # Private helper functions
160257
161258 defp remove_line_comments ( input ) do
@@ -312,6 +409,12 @@ defmodule JsonRemedy.Layer1.ContentCleaning do
312409
313410 # Helper functions for string detection
314411
# Cheap heuristic for "long text that may wrap JSON": true only when the input
# is larger than 100 bytes and its very first character is neither `{` nor `[`.
# Leading whitespace is not skipped.
defp long_text_with_content?("{" <> _rest), do: false
defp long_text_with_content?("[" <> _rest), do: false
defp long_text_with_content?(input), do: byte_size(input) > 100
417+
315418 defp inside_string? ( input , target ) when is_binary ( target ) do
316419 # Find the position of target in input
317420 case String . split ( input , target , parts: 2 ) do
0 commit comments