Skip to content

Commit 73552dd

Browse files
committed
Refactor frontend and error diagnostic
This patch completely separates the preprocessor functionality from the scanner-less parser into independent units, which allows the compiler to expand and parse nested function-like macros, multi-token object-like macros, and more. Furthermore, the error diagnostics are rewritten so that users can more easily find out where, and on which lexeme, the compiler panics.
1 parent 60dccd5 commit 73552dd

File tree

19 files changed

+2770
-1470
lines changed

19 files changed

+2770
-1470
lines changed

.github/workflows/main.yml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,45 @@ jobs:
5656
make check-sanitizer || exit 1
5757
make check || exit 1
5858
59+
preprocessor-host:
60+
runs-on: ubuntu-24.04
61+
strategy:
62+
matrix:
63+
compiler: [gcc, clang]
64+
architecture: [arm, riscv]
65+
steps:
66+
- name: Checkout code
67+
uses: actions/checkout@v4
68+
- name: Download dependencies
69+
run: |
70+
sudo apt-get update -q -y
71+
sudo apt-get install -q -y graphviz jq
72+
sudo apt-get install -q -y qemu-user
73+
sudo apt-get install -q -y build-essential
74+
- name: Configurate config
75+
run: |
76+
make distclean config ARCH=${{ matrix.architecture }}
77+
- name: Preprocess stage 1 source code
78+
env:
79+
CC: ${{ matrix.compiler }}
80+
run: |
81+
make out/shecc
82+
./out/shecc -E src/main.c > ./out/out.c
83+
- name: Build stage 1 artifact
84+
run: |
85+
./out/shecc --no-libc -o out/shecc-stage1.elf ./out/out.c
86+
chmod a+x ./out/shecc-stage1.elf
87+
- name: Preprocess stage 2 source code
88+
run: |
89+
./out/shecc-stage1.elf -E src/main.c > ./out/out.c
90+
- name: Build stage 2 artifact
91+
run: |
92+
./out/shecc-stage1.elf --no-libc -o out/shecc-stage2.elf ./out/out.c
93+
chmod a+x ./out/shecc-stage2.elf
94+
- name: Test stage 2 artifact
95+
run: |
96+
make check-stage2 || exit 1
97+
5998
coding-style:
6099
runs-on: ubuntu-24.04
61100
steps:

lib/c.c

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,18 @@
1616
#define INT_MAX 0x7fffffff
1717
#define INT_MIN 0x80000000
1818

19+
#define SEEK_SET 0
20+
#define SEEK_CUR 1
21+
#define SEEK_END 2
22+
1923
#if defined(__arm__)
2024
#define __SIZEOF_POINTER__ 4
2125
#define __syscall_exit 1
2226
#define __syscall_read 3
2327
#define __syscall_write 4
2428
#define __syscall_close 6
2529
#define __syscall_open 5
30+
#define __syscall_lseek 19
2631
#define __syscall_mmap2 192
2732
#define __syscall_munmap 91
2833

@@ -34,6 +39,7 @@
3439
#define __syscall_close 57
3540
#define __syscall_open 1024
3641
#define __syscall_openat 56
42+
#define __syscall_lseek 62
3743
#define __syscall_mmap2 222
3844
#define __syscall_munmap 215
3945

@@ -584,6 +590,30 @@ int fputc(int c, FILE *stream)
584590
return c;
585591
}
586592

593+
int fseek(FILE *stream, int offset, int whence)
594+
{
595+
#if defined(__arm__)
596+
__syscall(__syscall_lseek, stream, offset, whence);
597+
#elif defined(__riscv)
598+
/* No need to offset */
599+
__syscall(__syscall_lseek, stream, 0, offset, NULL, whence);
600+
#endif
601+
return 0;
602+
}
603+
604+
/* Report the current file offset of 'stream'.
 *
 * @stream: stream handle; it is handed to the kernel unchanged as the first
 *          syscall argument.
 *
 * Returns the current offset, or -1 if the underlying syscall reports an
 * error (negative return value). Only the low 32 bits of the position are
 * returned because the return type is 'int'.
 */
int ftell(FILE *stream)
{
#if defined(__arm__)
    int pos = __syscall(__syscall_lseek, stream, 0, SEEK_CUR);
    if (pos < 0)
        return -1;
    return pos;
#elif defined(__riscv)
    /* The 5-argument form follows llseek(2), which stores a 64-bit position
     * through the fourth argument; reserve two 32-bit words so the store
     * cannot overrun the local, then return the low word (little-endian
     * layout assumed - TODO confirm for any big-endian target).
     */
    int result[2];
    int rc = __syscall(__syscall_lseek, stream, 0, 0, result, SEEK_CUR);
    if (rc < 0)
        return -1;
    return result[0];
#else
#error "Unsupported ftell support for current platform"
#endif
}
616+
587617
/* Non-portable: Assume page size is 4KiB */
588618
#define PAGESIZE 4096
589619

src/defs.h

Lines changed: 35 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,18 @@
1010

1111
/* definitions */
1212

13+
#define DEBUG_BUILD false
14+
15+
/* Common macro functions */
16+
/* Character classification helpers.
 *
 * Each macro yields non-zero when 'c' belongs to the class and zero
 * otherwise. The argument is fully parenthesized in every expansion so that
 * compound expressions (e.g. a conditional expression) classify correctly.
 * Note that 'c' is still evaluated more than once, so do not pass
 * expressions with side effects such as 'p++'.
 *
 *   is_whitespace - space or horizontal tab
 *   is_newline    - carriage return or line feed
 *   is_alnum      - ASCII letter, decimal digit, or underscore
 *   is_digit      - decimal digit
 *   is_hex        - hexadecimal digit (either letter case)
 */
#define is_whitespace(c) ((c) == ' ' || (c) == '\t')
#define is_newline(c) ((c) == '\r' || (c) == '\n')
#define is_alnum(c)                                              \
    (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= '0' && (c) <= '9') || ((c) == '_'))
#define is_digit(c) ((c) >= '0' && (c) <= '9')
#define is_hex(c) \
    (is_digit(c) || ((c) >= 'a' && (c) <= 'f') || ((c) >= 'A' && (c) <= 'F'))
24+
1325
/* Limitations */
1426
#define MAX_TOKEN_LEN 256
1527
#define MAX_ID_LEN 64
@@ -26,14 +38,12 @@
2638
#define MAX_BB_DOM_SUCC 64
2739
#define MAX_BB_RDOM_SUCC 256
2840
#define MAX_GLOBAL_IR 256
29-
#define MAX_SOURCE 1048576
3041
#define MAX_CODE 262144
3142
#define MAX_DATA 262144
3243
#define MAX_SYMTAB 65536
3344
#define MAX_STRTAB 65536
3445
#define MAX_HEADER 1024
3546
#define MAX_SECTION 1024
36-
#define MAX_ALIASES 128
3747
#define MAX_CONSTANTS 1024
3848
#define MAX_CASES 128
3949
#define MAX_NESTING 128
@@ -46,7 +56,7 @@
4656
#define SMALL_ARENA_SIZE 65536 /* 64 KiB - for small allocations */
4757
#define LARGE_ARENA_SIZE 524288 /* 512 KiB - for instruction arena */
4858
#define DEFAULT_FUNCS_SIZE 64
49-
#define DEFAULT_INCLUSIONS_SIZE 16
59+
#define DEFAULT_SRC_FILE_COUNT 8
5060

5161
/* Arena compaction bitmask flags for selective memory reclamation */
5262
#define COMPACT_ARENA_BLOCK 0x01 /* BLOCK_ARENA - variables/blocks */
@@ -113,6 +123,7 @@ typedef struct {
113123
/* lexer tokens */
114124
typedef enum {
115125
T_start, /* FIXME: Unused, intended for lexer state machine init */
126+
T_eof, /* end-of-file (EOF) */
116127
T_numeric,
117128
T_identifier,
118129
T_comma, /* , */
@@ -161,7 +172,6 @@ typedef enum {
161172
T_question, /* ? */
162173
T_colon, /* : */
163174
T_semicolon, /* ; */
164-
T_eof, /* end-of-file (EOF) */
165175
T_ampersand, /* & */
166176
T_return,
167177
T_if,
@@ -193,38 +203,36 @@ typedef enum {
193203
T_cppd_endif,
194204
T_cppd_ifdef,
195205
T_cppd_ifndef,
196-
T_cppd_pragma
197-
} token_t;
206+
T_cppd_pragma,
207+
/* C pre-processor specific, these kinds
208+
* will be removed after pre-processing is done.
209+
*/
210+
T_newline,
211+
T_backslash,
212+
T_whitespace,
213+
T_tab
214+
} token_kind_t;
198215

199216
/* Source location tracking for better error reporting */
200217
typedef struct {
218+
int pos; /* raw source file position */
219+
int len; /* length of token */
201220
int line;
202221
int column;
203222
char *filename;
204223
} source_location_t;
205224

206-
/* Token structure with metadata for enhanced lexing */
207-
typedef struct token_info {
208-
token_t type;
209-
char value[MAX_TOKEN_LEN];
225+
typedef struct token {
226+
token_kind_t kind;
227+
char *literal;
210228
source_location_t location;
211-
struct token_info *next; /* For freelist management */
212-
} token_info_t;
213-
214-
/* Token freelist for memory reuse */
215-
typedef struct {
216-
token_info_t *freelist;
217-
int allocated_count;
218-
} token_pool_t;
229+
struct token *next;
230+
} token_t;
219231

220-
/* Token buffer for improved lookahead */
221-
#define TOKEN_BUFFER_SIZE 8
222-
typedef struct {
223-
token_info_t *tokens[TOKEN_BUFFER_SIZE];
224-
int head;
225-
int tail;
226-
int count;
227-
} token_buffer_t;
232+
typedef struct token_stream {
233+
token_t *head;
234+
token_t *tail;
235+
} token_stream_t;
228236

229237
/* String pool for identifier deduplication */
230238
typedef struct {
@@ -369,7 +377,7 @@ struct var {
369377
int in_loop;
370378
struct var *base;
371379
int subscript;
372-
struct var *subscripts[64];
380+
struct var *subscripts[128];
373381
int subscripts_idx;
374382
rename_t rename;
375383
ref_block_list_t ref_block_list; /* blocks which kill variable */
@@ -388,25 +396,13 @@ struct var {
388396
int use_count; /* Number of times variable is used */
389397
};
390398

391-
typedef struct {
392-
char name[MAX_VAR_LEN];
393-
bool is_variadic;
394-
int start_source_idx;
395-
var_t param_defs[MAX_PARAMS];
396-
int num_param_defs;
397-
int params[MAX_PARAMS];
398-
int num_params;
399-
bool disabled;
400-
} macro_t;
401-
402399
typedef struct func func_t;
403400

404401
/* block definition */
405402
struct block {
406403
var_list_t locals;
407404
struct block *parent;
408405
func_t *func;
409-
macro_t *macro;
410406
struct block *next;
411407
};
412408

@@ -460,13 +456,6 @@ typedef struct {
460456
type_t *type;
461457
} lvalue_t;
462458

463-
/* alias for #defines */
464-
typedef struct {
465-
char alias[MAX_VAR_LEN];
466-
char value[MAX_VAR_LEN];
467-
bool disabled;
468-
} alias_t;
469-
470459
/* constants for enums */
471460
typedef struct {
472461
char alias[MAX_VAR_LEN];

0 commit comments

Comments
 (0)