Skip to content

Commit e4efa73

Browse files
Handle tabs in input before lexing
1 parent 8886ebe commit e4efa73

File tree

6 files changed

+131
-26
lines changed

6 files changed

+131
-26
lines changed

src/lsp/cobol_preproc/src_lexer.mll

Lines changed: 7 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -171,29 +171,18 @@
171171

172172
}
173173

174-
let tab = '\t'
175174
let newline = '\r'* '\n'
176175
let nnl = _ # ['\r' '\n'] (* anything but newline *)
177-
let sna = nnl nnl nnl nnl nnl nnl (* 6 chars; TODO: exclude tabs *)
178-
let tabs =
179-
(tab |
180-
nnl tab |
181-
nnl nnl tab |
182-
nnl nnl nnl tab |
183-
nnl nnl nnl nnl tab |
184-
nnl nnl nnl nnl nnl tab |
185-
nnl nnl nnl nnl nnl nnl tab |
186-
nnl nnl nnl nnl nnl nnl nnl tab)
187-
let spaces = ([' ' '\t']*)
176+
let sna = nnl nnl nnl nnl nnl nnl (* 6 chars *)
177+
let spaces = ' '*
188178
let blank = [' ' '\009' '\r']
189179
let nonblank = nnl # blank
190-
let blanks =(blank | '\t')+
191-
let blank_area_A = blank blank blank blanks | '\t'
180+
let blanks = blank+
181+
let blank_area_A = blank blank blank blanks
192182
let nonblank_area_A =(nonblank nnl nnl nnl |
193183
blank nonblank nnl nnl |
194184
blank blank nonblank nnl |
195185
blank blank blank nonblank)
196-
let nonblank = nonblank # ['\t'] (* now, also exclude tab from blank chars *)
197186
let separator = [ ',' ';' ]
198187
let epsilon = ""
199188
let letter = [ 'a'-'z' 'A'-'Z' ] (* TODO: '\128'-'\255'? *)
@@ -262,10 +251,6 @@ rule fixed_line state
262251
{
263252
fixed_indicator (Src_lexing.sna state lexbuf) lexbuf
264253
}
265-
| tabs
266-
{
267-
fixed_nominal_line (Src_lexing.flush_continued state) lexbuf
268-
}
269254
| (nnl* newline) (* blank line (too short) *)
270255
{
271256
Src_lexing.new_line (Src_lexing.sna state lexbuf) lexbuf
@@ -276,7 +261,7 @@ rule fixed_line state
276261
}
277262
and fixed_indicator state
278263
= parse
279-
| ' ' | '\t' (* second tab *) (* nominal *)
264+
| ' '
280265
{
281266
fixed_nominal_line (Src_lexing.flush_continued state) lexbuf
282267
}
@@ -367,7 +352,7 @@ and xopen_or_crt_or_acutrm_followup state
367352
}
368353
and cobolx_line state (* COBOLX format (GCOS) *)
369354
= parse
370-
| [' ' '\t'] (* nominal *)
355+
| ' ' (* nominal *)
371356
{
372357
fixed_nominal_line (Src_lexing.flush_continued state) lexbuf
373358
}
@@ -600,7 +585,7 @@ and fixed_continue_quoted_ebcdics state
600585

601586
and free_line state
602587
= parse
603-
| blanks | '\t'
588+
| blanks
604589
{
605590
free_line state lexbuf
606591
}

src/lsp/cobol_preproc/src_lexing.ml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,27 @@ open Src_format
2020

2121
(* --- *)
2222

23+
let expand_tabs ?(tab_stop=8) ?(starting_col=0) src =
24+
match String.index_opt src '\t' with
25+
| None -> src
26+
| Some _ ->
27+
let buf = Buffer.create (String.length src) in
28+
let col = ref starting_col in
29+
let spaces = String.make tab_stop ' ' in
30+
String.iter (function
31+
| '\t' ->
32+
let n = (tab_stop - !col mod tab_stop) in
33+
Buffer.add_substring buf spaces 0 n;
34+
col := !col + 1
35+
| ('\n' | '\r') as c ->
36+
Buffer.add_char buf c;
37+
col := 0;
38+
| c ->
39+
Buffer.add_char buf c;
40+
incr col)
41+
src;
42+
Buffer.contents buf
43+
2344
let remove_blanks = Str.global_replace (Str.regexp " ") "" (* '\t'? *)
2445

2546
(* --- *)

src/lsp/cobol_preproc/src_lexing.mli

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@
1111
(* *)
1212
(**************************************************************************)
1313

14+
val expand_tabs: ?tab_stop:int -> ?starting_col:int -> string -> string
15+
16+
(** -- *)
17+
1418
type 'k state
1519

1620
val init_state: 'k Src_format.source_format -> 'k state

src/lsp/cobol_preproc/src_reader.ml

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -242,10 +242,54 @@ let make make_lexing ?filename ~source_format input =
242242
Option.iter (Lexing.set_filename lexbuf) filename;
243243
Plx (Src_lexing.init_state source_format, lexbuf)
244244

245+
let from_channel_expanding_tabs ?with_positions ?(tab_stop = 8) ic : Lexing.lexbuf =
246+
let read_buf = Bytes.create 4096 in
247+
let read_pos = ref 0 in (* current position in read_buf *)
248+
let read_len = ref 0 in (* valid bytes in read_buf *)
249+
let col = ref 0 in (* current column (0-based) *)
250+
let refill buf len =
251+
let written = ref 0 in
252+
let rec loop () =
253+
if !written >= len then () (* lexer buffer full *)
254+
else begin
255+
(* Refill read_buf if exhausted *)
256+
if !read_pos >= !read_len then begin
257+
let n = input ic read_buf 0 (Bytes.length read_buf) in
258+
if n = 0 then () (* EOF *)
259+
else begin read_pos := 0; read_len := n; loop () end
260+
end else
261+
let c = Bytes.get read_buf !read_pos in
262+
if c = '\t' then begin
263+
let spaces = tab_stop - (!col mod tab_stop) in
264+
let n = min spaces (len - !written) in
265+
Bytes.fill buf !written n ' ';
266+
written := !written + n;
267+
col := !col + n;
268+
if n = spaces then (* fully expanded this tab *)
269+
incr read_pos
270+
(* else: partially expanded; we'll resume next refill *)
271+
end else begin
272+
Bytes.set buf !written c;
273+
incr written;
274+
incr read_pos;
275+
if c = '\n' then col := 0
276+
else col := !col + 1
277+
end;
278+
loop ()
279+
end
280+
in
281+
loop ();
282+
!written
283+
in
284+
Lexing.from_function ?with_positions refill
285+
245286
(* --- *)
246287

247288
let from_string = make Lexing.from_string
248289
let from_channel = make Lexing.from_channel
290+
let from_channel_no_tabs ?(tab_stop=8) =
291+
make (from_channel_expanding_tabs ~tab_stop)
292+
249293

250294
let fill buff ~lookup_len (input: Src_input.t) =
251295
match input with
@@ -272,9 +316,9 @@ let from ?source_format (input: Src_input.t) =
272316
let source_format, input = start_reading input ?source_format in
273317
match input with
274318
| String { contents; filename } ->
275-
from_string ~source_format ~filename contents
319+
from_string ~source_format ~filename (Src_lexing.expand_tabs contents)
276320
| Channel { ic; filename } ->
277-
from_channel ~source_format ~filename ic
321+
from_channel_no_tabs ~source_format ~filename ic
278322

279323
(* --- *)
280324

test/cobol_parsing/tokens.ml

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,3 +146,54 @@ let%expect_test "token-locations-with-missing-program-id" =
146146
WORD[x]@<prog.cob:11-20|11-21>
147147
.@<prog.cob:11-21|11-22>
148148
EOF@<prog.cob:11-22|11-22> |}];;
149+
150+
let%expect_test "tokens-with-tabs" =
151+
Parser_testing.show_parsed_tokens ~source_format:(SF SFFixed)
152+
~parser_options:(Parser_testing.options ~verbose:true ())
153+
{|
154+
IDENTIFICATION DIVISION.
155+
PROGRAM-ID. prog.
156+
PROCEDURE DIVISION.
157+
STRING W-AGT ";" W-RUBNUM (J) ";"
158+
W-RENVOINOTE W-DEST-NOM ";"
159+
W-DEST-RUE1 ";" W-DEST-RUE2 ";"
160+
W-DEST-CP ";" W-DEST-VILLE
161+
";" W-DEST-TEL1 (1:2) " "
162+
W-DEST-TEL1 (3:2) " "
163+
W-DEST-TEL1 (5:2) " "
164+
W-DEST-TEL1 (7:2) " "
165+
W-DEST-TEL1 (9:2)
166+
delimited by " " into LARTISAN.
167+
|};
168+
[%expect {|
169+
Tks: IDENTIFICATION, DIVISION, .
170+
Tks: PROGRAM-ID, ., INFO_WORD[prog], .
171+
Incoming: {RECURSIVE}
172+
Tks': ., INFO_WORD[prog], .
173+
Tks: PROCEDURE, DIVISION, .
174+
Outgoing: {RECURSIVE}
175+
Tks':
176+
Tks: STRING, WORD[W-AGT], ";", WORD[W-RUBNUM], (, WORD[J], ), ";"
177+
Tks: WORD_IN_AREA_A[W-RENVOINOTE], WORD[W-DEST-NOM], ";"
178+
Tks: WORD_IN_AREA_A[W-DEST-RUE1], ";", WORD[W-DEST-RUE2], ";"
179+
Tks: WORD_IN_AREA_A[W-DEST-CP], ";"
180+
Tks: WORD[W-DEST-VILLE], ";", WORD[W-DEST-TEL1], (, DIGITS[1], :, DIGITS[2],
181+
), " "
182+
Tks: WORD[W-DEST-TEL1], (, DIGITS[3], :, DIGITS[2], ), " "
183+
Tks: WORD[W-DEST-TEL1], (, DIGITS[5], :, DIGITS[2], ), " "
184+
Tks: WORD[W-DEST-TEL1], (, DIGITS[7], :, DIGITS[2], ), " "
185+
Tks: WORD[W-DEST-TEL1]
186+
Tks: (, DIGITS[9], :, DIGITS[2], ), DELIMITED, BY, " ", INTO,
187+
WORD[LARTISAN], .
188+
Tks: EOF
189+
Tks':
190+
IDENTIFICATION, DIVISION, ., PROGRAM-ID, ., INFO_WORD[prog], ., PROCEDURE,
191+
DIVISION, ., STRING, WORD[W-AGT], ";", WORD[W-RUBNUM], (, WORD[J], ), ";",
192+
WORD_IN_AREA_A[W-RENVOINOTE], WORD[W-DEST-NOM], ";",
193+
WORD_IN_AREA_A[W-DEST-RUE1], ";", WORD[W-DEST-RUE2], ";",
194+
WORD_IN_AREA_A[W-DEST-CP], ";", WORD[W-DEST-VILLE], ";", WORD[W-DEST-TEL1],
195+
(, DIGITS[1], :, DIGITS[2], ), " ", WORD[W-DEST-TEL1], (, DIGITS[3], :,
196+
DIGITS[2], ), " ", WORD[W-DEST-TEL1], (, DIGITS[5], :, DIGITS[2], ), " ",
197+
WORD[W-DEST-TEL1], (, DIGITS[7], :, DIGITS[2], ), " ", WORD[W-DEST-TEL1], (,
198+
DIGITS[9], :, DIGITS[2], ), DELIMITED, BY, " ", INTO, WORD[LARTISAN], .,
199+
EOF |}];;

test/output-tests/used_binaries.expected

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,12 @@ Considering: import/gnucobol/tests/testsuite.src/used_binaries.at:946:0
4848
Considering: import/gnucobol/tests/testsuite.src/used_binaries.at:962:0
4949
Considering: import/gnucobol/tests/testsuite.src/used_binaries.at:991:0
5050
Considering: import/gnucobol/tests/testsuite.src/used_binaries.at:1024:0
51-
used_binaries.at-1024-progprep.cob:5.36-5.37:
51+
used_binaries.at-1024-progprep.cob:5.47-5.48:
5252
2 IDENTIFICATION DIVISION.
5353
3 PROGRAM-ID. prog.
5454
4 DATA DIVISION.
5555
5 > WORKING-STORAGE SECTION. #
56-
---- ^
56+
----
5757
6 01 TEST-VAR PIC 9(2) VALUE 'A'.
5858
7 COPY 'CRUD.CPY'.
5959
>> Error: Invalid syntax

0 commit comments

Comments
 (0)