Handle tabs in input before lexing

emilienlemaire · emilienlemaire · commit 29c567c26f33 · 2026-03-06T10:51:51.000+01:00
diff --git a/src/lsp/cobol_preproc/src_lexer.mll b/src/lsp/cobol_preproc/src_lexer.mll
@@ -174,29 +174,18 @@
 
 }
 
-let tab = '\t'
 let newline = '\r'* '\n'
 let nnl = _ # ['\r' '\n']                             (* anything but newline *)
-let sna = nnl nnl nnl nnl nnl nnl              (* 6 chars; TODO: exclude tabs *)
-let tabs =
-  (tab |
-   nnl tab |
-   nnl nnl tab |
-   nnl nnl nnl tab |
-   nnl nnl nnl nnl tab |
-   nnl nnl nnl nnl nnl tab |
-   nnl nnl nnl nnl nnl nnl tab |
-   nnl nnl nnl nnl nnl nnl nnl tab)
-let spaces = ([' ' '\t']*)
+let sna = nnl nnl nnl nnl nnl nnl                     (* 6 chars *)
+let spaces = ' '*
 let    blank        = [' ' '\009' '\r']
 let nonblank        = nnl # blank
-let    blanks       =(blank | '\t')+
-let    blank_area_A = blank blank blank blanks | '\t'
+let    blanks       = blank+
+let    blank_area_A = blank blank blank blanks
 let nonblank_area_A =(nonblank nnl nnl nnl |
                       blank nonblank nnl nnl |
                       blank blank nonblank nnl |
                       blank blank blank nonblank)
-let nonblank = nonblank # ['\t']    (* now, also exclude tab from blank chars *)
 let separator = [ ',' ';' ]
 let epsilon = ""
 let letter = [ 'a'-'z' 'A'-'Z' ]                      (* TODO: '\128'-'\255'? *)
@@ -265,10 +254,6 @@ rule fixed_line state
       {
         fixed_indicator (Src_lexing.sna state lexbuf) lexbuf
       }
-  | tabs
-      {
-        fixed_nominal_line (Src_lexing.flush_continued state) lexbuf
-      }
   | (nnl* newline)                                  (* blank line (too short) *)
       {
         Src_lexing.new_line (Src_lexing.sna state lexbuf) lexbuf
@@ -279,7 +264,7 @@ rule fixed_line state
       }
 and fixed_indicator state
   = parse
-  | ' ' | '\t' (* second tab *)                                    (* nominal *)
+  | ' '
       {
         fixed_nominal_line (Src_lexing.flush_continued state) lexbuf
       }
@@ -370,7 +355,7 @@ and xopen_or_crt_or_acutrm_followup state
       }
 and cobolx_line state                                 (* COBOLX format (GCOS) *)
   = parse
-  | [' ' '\t']                                                     (* nominal *)
+  | ' '                                                 (* nominal *)
       {
         fixed_nominal_line (Src_lexing.flush_continued state) lexbuf
       }
@@ -603,7 +588,7 @@ and fixed_continue_quoted_ebcdics state
 
 and free_line state
   = parse
-  | blanks | '\t'
+  | blanks
       {
         free_line state lexbuf
       }
diff --git a/src/lsp/cobol_preproc/src_lexing.ml b/src/lsp/cobol_preproc/src_lexing.ml
@@ -20,6 +20,27 @@ open Src_format
 
 (* --- *)
 
+(** [expand_tabs ?tab_stop ?starting_col src] replaces every tab of [src] by
+    the spaces needed to reach the next multiple of [tab_stop] (default 8).
+    Columns start at [starting_col] (default 0) and restart after '\n' or '\r'.
+    @raise Invalid_argument if [tab_stop <= 0] or [starting_col < 0]. *)
+let expand_tabs ?(tab_stop=8) ?(starting_col=0) src =
+  if tab_stop <= 0 || starting_col < 0 then
+    invalid_arg "Src_lexing.expand_tabs";
+  if not (String.contains src '\t') then src else begin
+    let buf = Buffer.create (String.length src) in
+    let col = ref starting_col in
+    String.iter (function
+      | '\t' ->
+          let n = tab_stop - !col mod tab_stop in
+          for _ = 1 to n do Buffer.add_char buf ' ' done;
+          col := !col + n      (* advance by the expanded width, not by one *)
+      | ('\n' | '\r') as c -> Buffer.add_char buf c; col := 0
+      | c -> Buffer.add_char buf c; incr col)
+    src;
+    Buffer.contents buf
+  end
+
 let remove_blanks = Str.global_replace (Str.regexp " ") ""           (* '\t'? *)
 
 (* --- *)
diff --git a/src/lsp/cobol_preproc/src_lexing.mli b/src/lsp/cobol_preproc/src_lexing.mli
@@ -11,6 +11,10 @@
 (*                                                                        *)
 (**************************************************************************)
 
+val expand_tabs: ?tab_stop:int -> ?starting_col:int -> string -> string
+
+(** -- *)
+
 type 'k state
 
 val init_state: 'k Src_format.source_format -> 'k state
diff --git a/src/lsp/cobol_preproc/src_reader.ml b/src/lsp/cobol_preproc/src_reader.ml
@@ -246,10 +246,54 @@ let make make_lexing ?filename ~source_format input =
   Option.iter (Lexing.set_filename lexbuf) filename;
   Plx (Src_lexing.init_state source_format, lexbuf)
 
+(** [from_channel_expanding_tabs ?with_positions ?tab_stop ic] is a lexbuf
+    reading [ic] where each tab is replaced by the spaces needed to reach the
+    next multiple of [tab_stop] (default 8); columns restart after '\n' or
+    '\r'.  @raise Invalid_argument if [tab_stop <= 0]. *)
+let from_channel_expanding_tabs ?with_positions ?(tab_stop = 8) ic : Lexing.lexbuf =
+  if tab_stop <= 0 then invalid_arg "Src_reader.from_channel_expanding_tabs";
+  let read_buf = Bytes.create 4096 in
+  let read_pos = ref 0 in                    (* next unread byte of read_buf *)
+  let read_len = ref 0 in                    (* valid bytes in read_buf *)
+  let col = ref 0 in                         (* current column (0-based) *)
+  let refill buf len =
+    let written = ref 0 in
+    let rec loop () =
+      if !written >= len then ()                        (* lexer buffer full *)
+      else if !read_pos < !read_len then begin
+        (match Bytes.get read_buf !read_pos with
+         | '\t' ->
+             (* May straddle two refills: [col] keeps the progress made. *)
+             let spaces = tab_stop - !col mod tab_stop in
+             let n = min spaces (len - !written) in
+             Bytes.fill buf !written n ' ';
+             written := !written + n;
+             col := !col + n;
+             if n = spaces then incr read_pos
+         | c ->
+             Bytes.set buf !written c;
+             incr written;
+             incr read_pos;
+             if c = '\n' || c = '\r' then col := 0 else incr col);
+        loop ()
+      end else if !written = 0 then begin
+        (* Only read (and possibly block) when nothing is pending delivery. *)
+        let n = input ic read_buf 0 (Bytes.length read_buf) in
+        if n > 0 then begin read_pos := 0; read_len := n; loop () end
+      end
+    in
+    loop ();
+    !written
+  in
+  Lexing.from_function ?with_positions refill
+
 (* --- *)
 
 let from_string = make Lexing.from_string
 let from_channel = make Lexing.from_channel
+(** As [from_channel], with tabs expanded by [from_channel_expanding_tabs]. *)
+let from_channel_no_tabs ?(tab_stop=8) =
+    make (from_channel_expanding_tabs ~tab_stop)
 
 let fill buff ~lookup_len (input: Src_input.t) =
   match input with
@@ -276,9 +320,9 @@ let from ?source_format (input: Src_input.t) =
   let source_format, input = start_reading input ?source_format in
   match input with
   | String { contents; filename } ->
-      from_string ~source_format ~filename contents
+      from_string ~source_format ~filename (Src_lexing.expand_tabs contents)
   | Channel { ic; filename } ->
-      from_channel ~source_format ~filename ic
+      from_channel_no_tabs ~source_format ~filename ic
 
 (* --- *)
 
diff --git a/test/cobol_parsing/tokens.ml b/test/cobol_parsing/tokens.ml
@@ -146,3 +146,54 @@ let%expect_test "token-locations-with-missing-program-id" =
       WORD[x]@<prog.cob:11-20|11-21>
       .@<prog.cob:11-21|11-22>
       EOF@<prog.cob:11-22|11-22> |}];;
+
+let%expect_test "tokens-with-tabs" =
+  Parser_testing.show_parsed_tokens ~source_format:(SF SFFixed)
+    ~parser_options:(Parser_testing.options ~verbose:true ())
+    {|
+       IDENTIFICATION DIVISION.
+       PROGRAM-ID. prog.
+       PROCEDURE DIVISION.
+      		STRING 	W-AGT ";"   W-RUBNUM (J) ";" 
+      		W-RENVOINOTE W-DEST-NOM  ";" 
+      		W-DEST-RUE1  ";"  W-DEST-RUE2  ";" 
+      		W-DEST-CP ";"  W-DEST-VILLE 
+      		";" W-DEST-TEL1 (1:2)  " "
+                   W-DEST-TEL1 (3:2)  " "
+                   W-DEST-TEL1 (5:2)  " "
+                   W-DEST-TEL1 (7:2)  " "
+                   W-DEST-TEL1 (9:2)
+             delimited  by  "   "  into  LARTISAN.
+|};
+    [%expect {|
+      Tks: IDENTIFICATION, DIVISION, .
+      Tks: PROGRAM-ID, ., INFO_WORD[prog], .
+      Incoming: {RECURSIVE}
+      Tks': ., INFO_WORD[prog], .
+      Tks: PROCEDURE, DIVISION, .
+      Outgoing: {RECURSIVE}
+      Tks':
+      Tks: STRING, WORD[W-AGT], ";", WORD[W-RUBNUM], (, WORD[J], ), ";"
+      Tks: WORD_IN_AREA_A[W-RENVOINOTE], WORD[W-DEST-NOM], ";"
+      Tks: WORD_IN_AREA_A[W-DEST-RUE1], ";", WORD[W-DEST-RUE2], ";"
+      Tks: WORD_IN_AREA_A[W-DEST-CP], ";"
+      Tks: WORD[W-DEST-VILLE], ";", WORD[W-DEST-TEL1], (, DIGITS[1], :, DIGITS[2],
+           ), " "
+      Tks: WORD[W-DEST-TEL1], (, DIGITS[3], :, DIGITS[2], ), " "
+      Tks: WORD[W-DEST-TEL1], (, DIGITS[5], :, DIGITS[2], ), " "
+      Tks: WORD[W-DEST-TEL1], (, DIGITS[7], :, DIGITS[2], ), " "
+      Tks: WORD[W-DEST-TEL1]
+      Tks: (, DIGITS[9], :, DIGITS[2], ), DELIMITED, BY, "   ", INTO,
+           WORD[LARTISAN], .
+      Tks: EOF
+      Tks':
+      IDENTIFICATION, DIVISION, ., PROGRAM-ID, ., INFO_WORD[prog], ., PROCEDURE,
+      DIVISION, ., STRING, WORD[W-AGT], ";", WORD[W-RUBNUM], (, WORD[J], ), ";",
+      WORD_IN_AREA_A[W-RENVOINOTE], WORD[W-DEST-NOM], ";",
+      WORD_IN_AREA_A[W-DEST-RUE1], ";", WORD[W-DEST-RUE2], ";",
+      WORD_IN_AREA_A[W-DEST-CP], ";", WORD[W-DEST-VILLE], ";", WORD[W-DEST-TEL1],
+      (, DIGITS[1], :, DIGITS[2], ), " ", WORD[W-DEST-TEL1], (, DIGITS[3], :,
+      DIGITS[2], ), " ", WORD[W-DEST-TEL1], (, DIGITS[5], :, DIGITS[2], ), " ",
+      WORD[W-DEST-TEL1], (, DIGITS[7], :, DIGITS[2], ), " ", WORD[W-DEST-TEL1], (,
+      DIGITS[9], :, DIGITS[2], ), DELIMITED, BY, "   ", INTO, WORD[LARTISAN], .,
+      EOF |}];;
diff --git a/test/output-tests/used_binaries.expected b/test/output-tests/used_binaries.expected
@@ -48,12 +48,12 @@ Considering: import/gnucobol/tests/testsuite.src/used_binaries.at:946:0
 Considering: import/gnucobol/tests/testsuite.src/used_binaries.at:962:0
 Considering: import/gnucobol/tests/testsuite.src/used_binaries.at:991:0
 Considering: import/gnucobol/tests/testsuite.src/used_binaries.at:1024:0
-used_binaries.at-1024-progprep.cob:5.36-5.37:
+used_binaries.at-1024-progprep.cob:5.47-5.48:
    2          IDENTIFICATION   DIVISION.
    3          PROGRAM-ID.      prog.
    4          DATA             DIVISION.
    5 >        WORKING-STORAGE  SECTION.	  	#
-----                                       ^
+----                                        
    6          01 TEST-VAR PIC 9(2) VALUE 'A'.
    7          COPY 'CRUD.CPY'.
 >> Error: Invalid syntax