Skip to content

Commit 29c567c

Browse files
Handle tabs in input before lexing
1 parent b40beb7 commit 29c567c

File tree

6 files changed

+131
-26
lines changed

6 files changed

+131
-26
lines changed

src/lsp/cobol_preproc/src_lexer.mll

Lines changed: 7 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -174,29 +174,18 @@
174174

175175
}
176176

177-
let tab = '\t'
178177
let newline = '\r'* '\n'
179178
let nnl = _ # ['\r' '\n'] (* anything but newline *)
180-
let sna = nnl nnl nnl nnl nnl nnl (* 6 chars; TODO: exclude tabs *)
181-
let tabs =
182-
(tab |
183-
nnl tab |
184-
nnl nnl tab |
185-
nnl nnl nnl tab |
186-
nnl nnl nnl nnl tab |
187-
nnl nnl nnl nnl nnl tab |
188-
nnl nnl nnl nnl nnl nnl tab |
189-
nnl nnl nnl nnl nnl nnl nnl tab)
190-
let spaces = ([' ' '\t']*)
179+
let sna = nnl nnl nnl nnl nnl nnl (* 6 chars *)
180+
let spaces = ' '*
191181
let blank = [' ' '\009' '\r']
192182
let nonblank = nnl # blank
193-
let blanks =(blank | '\t')+
194-
let blank_area_A = blank blank blank blanks | '\t'
183+
let blanks = blank+
184+
let blank_area_A = blank blank blank blanks
195185
let nonblank_area_A =(nonblank nnl nnl nnl |
196186
blank nonblank nnl nnl |
197187
blank blank nonblank nnl |
198188
blank blank blank nonblank)
199-
let nonblank = nonblank # ['\t'] (* now, also exclude tab from blank chars *)
200189
let separator = [ ',' ';' ]
201190
let epsilon = ""
202191
let letter = [ 'a'-'z' 'A'-'Z' ] (* TODO: '\128'-'\255'? *)
@@ -265,10 +254,6 @@ rule fixed_line state
265254
{
266255
fixed_indicator (Src_lexing.sna state lexbuf) lexbuf
267256
}
268-
| tabs
269-
{
270-
fixed_nominal_line (Src_lexing.flush_continued state) lexbuf
271-
}
272257
| (nnl* newline) (* blank line (too short) *)
273258
{
274259
Src_lexing.new_line (Src_lexing.sna state lexbuf) lexbuf
@@ -279,7 +264,7 @@ rule fixed_line state
279264
}
280265
and fixed_indicator state
281266
= parse
282-
| ' ' | '\t' (* second tab *) (* nominal *)
267+
| ' '
283268
{
284269
fixed_nominal_line (Src_lexing.flush_continued state) lexbuf
285270
}
@@ -370,7 +355,7 @@ and xopen_or_crt_or_acutrm_followup state
370355
}
371356
and cobolx_line state (* COBOLX format (GCOS) *)
372357
= parse
373-
| [' ' '\t'] (* nominal *)
358+
| ' ' (* nominal *)
374359
{
375360
fixed_nominal_line (Src_lexing.flush_continued state) lexbuf
376361
}
@@ -603,7 +588,7 @@ and fixed_continue_quoted_ebcdics state
603588

604589
and free_line state
605590
= parse
606-
| blanks | '\t'
591+
| blanks
607592
{
608593
free_line state lexbuf
609594
}

src/lsp/cobol_preproc/src_lexing.ml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,27 @@ open Src_format
2020

2121
(* --- *)
2222

23+
let expand_tabs ?(tab_stop=8) ?(starting_col=0) src =
24+
match String.index_opt src '\t' with
25+
| None -> src
26+
| Some _ ->
27+
let buf = Buffer.create (String.length src) in
28+
let col = ref starting_col in
29+
let spaces = String.make tab_stop ' ' in
30+
String.iter (function
31+
| '\t' ->
32+
let n = (tab_stop - !col mod tab_stop) in
33+
Buffer.add_substring buf spaces 0 n;
34+
col := !col + 1
35+
| ('\n' | '\r') as c ->
36+
Buffer.add_char buf c;
37+
col := 0;
38+
| c ->
39+
Buffer.add_char buf c;
40+
incr col)
41+
src;
42+
Buffer.contents buf
43+
2344
let remove_blanks = Str.global_replace (Str.regexp " ") "" (* '\t'? *)
2445

2546
(* --- *)

src/lsp/cobol_preproc/src_lexing.mli

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@
1111
(* *)
1212
(**************************************************************************)
1313

14+
val expand_tabs: ?tab_stop:int -> ?starting_col:int -> string -> string
15+
16+
(** -- *)
17+
1418
type 'k state
1519

1620
val init_state: 'k Src_format.source_format -> 'k state

src/lsp/cobol_preproc/src_reader.ml

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -246,10 +246,54 @@ let make make_lexing ?filename ~source_format input =
246246
Option.iter (Lexing.set_filename lexbuf) filename;
247247
Plx (Src_lexing.init_state source_format, lexbuf)
248248

249+
let from_channel_expanding_tabs ?with_positions ?(tab_stop = 8) ic : Lexing.lexbuf =
250+
let read_buf = Bytes.create 4096 in
251+
let read_pos = ref 0 in (* current position in read_buf *)
252+
let read_len = ref 0 in (* valid bytes in read_buf *)
253+
let col = ref 0 in (* current column (0-based) *)
254+
let refill buf len =
255+
let written = ref 0 in
256+
let rec loop () =
257+
if !written >= len then () (* lexer buffer full *)
258+
else begin
259+
(* Refill read_buf if exhausted *)
260+
if !read_pos >= !read_len then begin
261+
let n = input ic read_buf 0 (Bytes.length read_buf) in
262+
if n = 0 then () (* EOF *)
263+
else begin read_pos := 0; read_len := n; loop () end
264+
end else
265+
let c = Bytes.get read_buf !read_pos in
266+
if c = '\t' then begin
267+
let spaces = tab_stop - (!col mod tab_stop) in
268+
let n = min spaces (len - !written) in
269+
Bytes.fill buf !written n ' ';
270+
written := !written + n;
271+
col := !col + n;
272+
if n = spaces then (* fully expanded this tab *)
273+
incr read_pos
274+
(* else: partially expanded; we'll resume next refill *)
275+
end else begin
276+
Bytes.set buf !written c;
277+
incr written;
278+
incr read_pos;
279+
if c = '\n' then col := 0
280+
else col := !col + 1
281+
end;
282+
loop ()
283+
end
284+
in
285+
loop ();
286+
!written
287+
in
288+
Lexing.from_function ?with_positions refill
289+
249290
(* --- *)
250291

251292
let from_string = make Lexing.from_string
252293
let from_channel = make Lexing.from_channel
294+
let from_channel_no_tabs ?(tab_stop=8) =
295+
make (from_channel_expanding_tabs ~tab_stop)
296+
253297

254298
let fill buff ~lookup_len (input: Src_input.t) =
255299
match input with
@@ -276,9 +320,9 @@ let from ?source_format (input: Src_input.t) =
276320
let source_format, input = start_reading input ?source_format in
277321
match input with
278322
| String { contents; filename } ->
279-
from_string ~source_format ~filename contents
323+
from_string ~source_format ~filename (Src_lexing.expand_tabs contents)
280324
| Channel { ic; filename } ->
281-
from_channel ~source_format ~filename ic
325+
from_channel_no_tabs ~source_format ~filename ic
282326

283327
(* --- *)
284328

test/cobol_parsing/tokens.ml

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,3 +146,54 @@ let%expect_test "token-locations-with-missing-program-id" =
146146
WORD[x]@<prog.cob:11-20|11-21>
147147
.@<prog.cob:11-21|11-22>
148148
EOF@<prog.cob:11-22|11-22> |}];;
149+
150+
let%expect_test "tokens-with-tabs" =
151+
Parser_testing.show_parsed_tokens ~source_format:(SF SFFixed)
152+
~parser_options:(Parser_testing.options ~verbose:true ())
153+
{|
154+
IDENTIFICATION DIVISION.
155+
PROGRAM-ID. prog.
156+
PROCEDURE DIVISION.
157+
STRING W-AGT ";" W-RUBNUM (J) ";"
158+
W-RENVOINOTE W-DEST-NOM ";"
159+
W-DEST-RUE1 ";" W-DEST-RUE2 ";"
160+
W-DEST-CP ";" W-DEST-VILLE
161+
";" W-DEST-TEL1 (1:2) " "
162+
W-DEST-TEL1 (3:2) " "
163+
W-DEST-TEL1 (5:2) " "
164+
W-DEST-TEL1 (7:2) " "
165+
W-DEST-TEL1 (9:2)
166+
delimited by " " into LARTISAN.
167+
|};
168+
[%expect {|
169+
Tks: IDENTIFICATION, DIVISION, .
170+
Tks: PROGRAM-ID, ., INFO_WORD[prog], .
171+
Incoming: {RECURSIVE}
172+
Tks': ., INFO_WORD[prog], .
173+
Tks: PROCEDURE, DIVISION, .
174+
Outgoing: {RECURSIVE}
175+
Tks':
176+
Tks: STRING, WORD[W-AGT], ";", WORD[W-RUBNUM], (, WORD[J], ), ";"
177+
Tks: WORD_IN_AREA_A[W-RENVOINOTE], WORD[W-DEST-NOM], ";"
178+
Tks: WORD_IN_AREA_A[W-DEST-RUE1], ";", WORD[W-DEST-RUE2], ";"
179+
Tks: WORD_IN_AREA_A[W-DEST-CP], ";"
180+
Tks: WORD[W-DEST-VILLE], ";", WORD[W-DEST-TEL1], (, DIGITS[1], :, DIGITS[2],
181+
), " "
182+
Tks: WORD[W-DEST-TEL1], (, DIGITS[3], :, DIGITS[2], ), " "
183+
Tks: WORD[W-DEST-TEL1], (, DIGITS[5], :, DIGITS[2], ), " "
184+
Tks: WORD[W-DEST-TEL1], (, DIGITS[7], :, DIGITS[2], ), " "
185+
Tks: WORD[W-DEST-TEL1]
186+
Tks: (, DIGITS[9], :, DIGITS[2], ), DELIMITED, BY, " ", INTO,
187+
WORD[LARTISAN], .
188+
Tks: EOF
189+
Tks':
190+
IDENTIFICATION, DIVISION, ., PROGRAM-ID, ., INFO_WORD[prog], ., PROCEDURE,
191+
DIVISION, ., STRING, WORD[W-AGT], ";", WORD[W-RUBNUM], (, WORD[J], ), ";",
192+
WORD_IN_AREA_A[W-RENVOINOTE], WORD[W-DEST-NOM], ";",
193+
WORD_IN_AREA_A[W-DEST-RUE1], ";", WORD[W-DEST-RUE2], ";",
194+
WORD_IN_AREA_A[W-DEST-CP], ";", WORD[W-DEST-VILLE], ";", WORD[W-DEST-TEL1],
195+
(, DIGITS[1], :, DIGITS[2], ), " ", WORD[W-DEST-TEL1], (, DIGITS[3], :,
196+
DIGITS[2], ), " ", WORD[W-DEST-TEL1], (, DIGITS[5], :, DIGITS[2], ), " ",
197+
WORD[W-DEST-TEL1], (, DIGITS[7], :, DIGITS[2], ), " ", WORD[W-DEST-TEL1], (,
198+
DIGITS[9], :, DIGITS[2], ), DELIMITED, BY, " ", INTO, WORD[LARTISAN], .,
199+
EOF |}];;

test/output-tests/used_binaries.expected

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,12 @@ Considering: import/gnucobol/tests/testsuite.src/used_binaries.at:946:0
4848
Considering: import/gnucobol/tests/testsuite.src/used_binaries.at:962:0
4949
Considering: import/gnucobol/tests/testsuite.src/used_binaries.at:991:0
5050
Considering: import/gnucobol/tests/testsuite.src/used_binaries.at:1024:0
51-
used_binaries.at-1024-progprep.cob:5.36-5.37:
51+
used_binaries.at-1024-progprep.cob:5.47-5.48:
5252
2 IDENTIFICATION DIVISION.
5353
3 PROGRAM-ID. prog.
5454
4 DATA DIVISION.
5555
5 > WORKING-STORAGE SECTION. #
56-
---- ^
56+
----
5757
6 01 TEST-VAR PIC 9(2) VALUE 'A'.
5858
7 COPY 'CRUD.CPY'.
5959
>> Error: Invalid syntax

0 commit comments

Comments
 (0)