Skip to content

Commit b914e5b

Browse files
committed
Refactor frontend and error diagnostic
This patch completely separates the preprocessor functionality from the scanner-less parser into independent units, which allows the compiler to expand and parse nested function-like macros, multi-token object-like macros, and more. Furthermore, the error diagnostics are rewritten to better help users find out where and which lexeme causes the compiler to panic.
1 parent c044948 commit b914e5b

24 files changed

+2541
-1549
lines changed

.github/workflows/main.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,34 @@ jobs:
8888
make check-sanitizer DYNLINK=${{ steps.determine-mode.outputs.DYNLINK }} || exit 1
8989
make check DYNLINK=${{ steps.determine-mode.outputs.DYNLINK }} || exit 1
9090
91+
preprocessor-host:
92+
runs-on: ubuntu-24.04
93+
strategy:
94+
matrix:
95+
compiler: [gcc, clang]
96+
architecture: [arm, riscv]
97+
steps:
98+
- name: Checkout code
99+
uses: actions/checkout@v4
100+
- name: Download dependencies
101+
run: |
102+
sudo apt-get update -q -y
103+
sudo apt-get install -q -y graphviz jq
104+
sudo apt-get install -q -y qemu-user
105+
sudo apt-get install -q -y build-essential
106+
- name: Configurate config
107+
run: |
108+
make distclean config ARCH=${{ matrix.architecture }}
109+
- name: Preprocess stage 1 source code
110+
env:
111+
CC: ${{ matrix.compiler }}
112+
run: |
113+
make out/shecc
114+
./out/shecc -E src/main.c > ./out/out.c
115+
- name: Build stage 1 artifact
116+
run: |
117+
./out/shecc --no-libc -o out/shecc-stage1.elf ./out/out.c
118+
91119
coding-style:
92120
runs-on: ubuntu-24.04
93121
steps:

COMPLIANCE.md

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,10 @@ This document tracks compliance gaps and non-standard behaviors.
3737
- `#define` for object-like and function-like macros
3838
- `#ifdef`, `#ifndef`, `#if`, `#elif`, `#else`, `#endif`
3939
- `#undef` for macro removal
40+
- `#pragma once` (all other `#pragma` directives are accepted but ignored)
4041
- `defined()` operator
4142
- `__VA_ARGS__` for variadic macros
43+
- `__FILE__`, `__LINE__` built-in macros
4244

4345
## Missing Features
4446

@@ -83,15 +85,12 @@ This document tracks compliance gaps and non-standard behaviors.
8385

8486
| Feature | Status | Description |
8587
|---------|--------|-------------|
86-
| `#include` | Parsed only | No file inclusion |
88+
| `#include` | Parsed only | Local file inclusion is supported, but system header files cannot be included |
8789
| Token pasting (`##`) | Missing | Cannot concatenate tokens |
8890
| Stringizing (`#`) | Missing | Cannot convert to string |
89-
| `__FILE__` | Missing | No file name macro |
90-
| `__LINE__` | Missing | No line number macro |
9191
| `__DATE__` | Missing | No compile date |
9292
| `__TIME__` | Missing | No compile time |
9393
| `__STDC__` | Missing | No standard compliance indicator |
94-
| `#pragma` | Ignored | Accepted but no effect |
9594

9695
### Advanced Features
9796

lib/c.c

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,36 @@
99
#include "c.h"
1010
#define INT_BUF_LEN 16
1111

12+
/* Non-zero when c is an ASCII letter ('a'..'z' or 'A'..'Z').
 * The parameter is parenthesized so that expression arguments keep their
 * intended grouping; c is evaluated more than once, so avoid side effects.
 */
#define __is_alpha(c) \
    (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
13+
/* Non-zero when c is an ASCII decimal digit ('0'..'9').
 * The parameter is parenthesized so that expression arguments keep their
 * intended grouping; c is evaluated twice, so avoid side effects.
 */
#define __is_digit(c) ((c) >= '0' && (c) <= '9')
14+
#define __is_hex(c) \
15+
(__is_digit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))
16+
17+
int isdigit(int c)
18+
{
19+
return __is_digit(c);
20+
}
21+
22+
int isalpha(int c)
23+
{
24+
return __is_alpha(c);
25+
}
26+
27+
int isalnum(int c)
28+
{
29+
return __is_alpha(c) || __is_digit(c);
30+
}
31+
32+
int isxdigit(int c)
33+
{
34+
return __is_hex(c);
35+
}
36+
37+
int isblank(int c)
38+
{
39+
return c == ' ' || c == '\t';
40+
}
41+
1242
int strlen(char *str)
1343
{
1444
/* process the string by checking 4 characters (a 32-bit word) at a time */
@@ -543,6 +573,31 @@ int fputc(int c, FILE *stream)
543573
return c;
544574
}
545575

576+
int fseek(FILE *stream, int offset, int whence)
577+
{
578+
#if defined(__arm__)
579+
return __syscall(__syscall_lseek, stream, offset, whence);
580+
#elif defined(__riscv)
581+
/* No need to offset */
582+
return __syscall(__syscall_lseek, stream, 0, offset, NULL, whence);
583+
#else
584+
#error "Unsupported fseek support for current platform"
585+
#endif
586+
}
587+
588+
int ftell(FILE *stream)
589+
{
590+
#if defined(__arm__)
591+
return __syscall(__syscall_lseek, stream, 0, SEEK_CUR);
592+
#elif defined(__riscv)
593+
int result;
594+
__syscall(__syscall_lseek, stream, 0, 0, &result, SEEK_CUR);
595+
return result;
596+
#else
597+
#error "Unsupported ftell support for current platform"
598+
#endif
599+
}
600+
546601
#define CHUNK_SIZE_FREED_MASK 1
547602
#define CHUNK_SIZE_SZ_MASK 0xFFFFFFFE
548603
#define CHUNK_GET_SIZE(size) (size & CHUNK_SIZE_SZ_MASK)

lib/c.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,18 @@
1717
#define INT_MAX 0x7fffffff
1818
#define INT_MIN 0x80000000
1919

20+
#define SEEK_SET 0
21+
#define SEEK_CUR 1
22+
#define SEEK_END 2
23+
2024
#if defined(__arm__)
2125
#define __SIZEOF_POINTER__ 4
2226
#define __syscall_exit 1
2327
#define __syscall_read 3
2428
#define __syscall_write 4
2529
#define __syscall_close 6
2630
#define __syscall_open 5
31+
#define __syscall_lseek 19
2732
#define __syscall_mmap2 192
2833
#define __syscall_munmap 91
2934

@@ -35,6 +40,7 @@
3540
#define __syscall_close 57
3641
#define __syscall_open 1024
3742
#define __syscall_openat 56
43+
#define __syscall_lseek 62
3844
#define __syscall_mmap2 222
3945
#define __syscall_munmap 215
4046

@@ -52,13 +58,23 @@
5258
/* va_list support for variadic functions */
5359
typedef int *va_list;
5460

61+
/* Character predicate functions */
62+
63+
int isdigit(int c);
64+
int isalpha(int c);
65+
int isalnum(int c);
66+
int isxdigit(int c);
67+
int isblank(int c);
68+
5569
/* File I/O */
5670
typedef int FILE;
5771
FILE *fopen(char *filename, char *mode);
5872
int fclose(FILE *stream);
5973
int fgetc(FILE *stream);
6074
char *fgets(char *str, int n, FILE *stream);
6175
int fputc(int c, FILE *stream);
76+
int fseek(FILE *stream, int offset, int whence);
77+
int ftell(FILE *stream);
6278

6379
/* string-related functions */
6480
int strlen(char *str);

src/arm.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ arm_cond_t arm_get_cond(opcode_t op)
104104
case OP_leq:
105105
return __LE;
106106
default:
107-
error("Unsupported condition IR opcode");
107+
fatal("Unsupported condition IR opcode");
108108
}
109109
return __AL;
110110
}
@@ -113,7 +113,7 @@ int arm_extract_bits(int imm, int i_start, int i_end, int d_start, int d_end)
113113
{
114114
if (((d_end - d_start) != (i_end - i_start)) || (i_start > i_end) ||
115115
(d_start > d_end))
116-
error("Invalid bit copy");
116+
fatal("Invalid bit copy");
117117

118118
int v = imm >> i_start;
119119
v &= ((2 << (i_end - i_start)) - 1);
@@ -143,7 +143,7 @@ int __mov(arm_cond_t cond, int io, int opcode, int s, int rn, int rd, int op2)
143143
}
144144
if (op2 > 255)
145145
/* value spans more than 8 bits */
146-
error("Unable to represent value");
146+
fatal("Unable to represent value");
147147
}
148148
return arm_encode(cond, s + (opcode << 1) + (io << 5), rn, rd,
149149
(shift << 8) + (op2 & 255));
@@ -286,7 +286,7 @@ int arm_halfword_transfer(arm_cond_t cond,
286286
}
287287

288288
if (ofs > 255)
289-
error("Halfword offset too large");
289+
fatal("Halfword offset too large");
290290

291291
/* Halfword encoding: split offset into 4-bit high and low parts */
292292
int imm4H = ((ofs >> 4) & 0xF) << 8;

src/defs.h

Lines changed: 26 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010

1111
/* definitions */
1212

13+
/* Common macro functions */
14+
/* Non-zero when c is a line terminator character (CR or LF).
 * The parameter is parenthesized so that expression arguments keep their
 * intended grouping; c is evaluated twice, so avoid side effects.
 */
#define is_newline(c) ((c) == '\r' || (c) == '\n')
15+
1316
/* Limitations */
1417
#define MAX_TOKEN_LEN 256
1518
#define MAX_ID_LEN 64
@@ -26,15 +29,13 @@
2629
#define MAX_BB_DOM_SUCC 64
2730
#define MAX_BB_RDOM_SUCC 256
2831
#define MAX_GLOBAL_IR 256
29-
#define MAX_SOURCE 1048576
3032
#define MAX_CODE 262144
3133
#define MAX_DATA 262144
3234
#define MAX_SYMTAB 65536
3335
#define MAX_STRTAB 65536
3436
#define MAX_HEADER 1024
3537
#define MAX_PROGRAM_HEADER 1024
3638
#define MAX_SECTION 1024
37-
#define MAX_ALIASES 128
3839
#define MAX_SECTION_HEADER 1024
3940
#define MAX_SHSTR 1024
4041
#define MAX_INTERP 1024
@@ -56,7 +57,7 @@
5657
#define SMALL_ARENA_SIZE 65536 /* 64 KiB - for small allocations */
5758
#define LARGE_ARENA_SIZE 524288 /* 512 KiB - for instruction arena */
5859
#define DEFAULT_FUNCS_SIZE 64
59-
#define DEFAULT_INCLUSIONS_SIZE 16
60+
#define DEFAULT_SRC_FILE_COUNT 8
6061

6162
/* Arena compaction bitmask flags for selective memory reclamation */
6263
#define COMPACT_ARENA_BLOCK 0x01 /* BLOCK_ARENA - variables/blocks */
@@ -131,6 +132,7 @@ typedef struct {
131132
/* lexer tokens */
132133
typedef enum {
133134
T_start, /* FIXME: Unused, intended for lexer state machine init */
135+
T_eof, /* end-of-file (EOF) */
134136
T_numeric,
135137
T_identifier,
136138
T_comma, /* , */
@@ -179,7 +181,6 @@ typedef enum {
179181
T_question, /* ? */
180182
T_colon, /* : */
181183
T_semicolon, /* ; */
182-
T_eof, /* end-of-file (EOF) */
183184
T_ampersand, /* & */
184185
T_return,
185186
T_if,
@@ -211,38 +212,36 @@ typedef enum {
211212
T_cppd_endif,
212213
T_cppd_ifdef,
213214
T_cppd_ifndef,
214-
T_cppd_pragma
215-
} token_t;
215+
T_cppd_pragma,
216+
/* C pre-processor specific, these kinds
217+
* will be removed after pre-processing is done.
218+
*/
219+
T_newline,
220+
T_backslash,
221+
T_whitespace,
222+
T_tab
223+
} token_kind_t;
216224

217225
/* Source location tracking for better error reporting */
218226
typedef struct {
227+
int pos; /* raw source file position */
228+
int len; /* length of token */
219229
int line;
220230
int column;
221231
char *filename;
222232
} source_location_t;
223233

224-
/* Token structure with metadata for enhanced lexing */
225-
typedef struct token_info {
226-
token_t type;
227-
char value[MAX_TOKEN_LEN];
234+
typedef struct token {
235+
token_kind_t kind;
236+
char *literal;
228237
source_location_t location;
229-
struct token_info *next; /* For freelist management */
230-
} token_info_t;
231-
232-
/* Token freelist for memory reuse */
233-
typedef struct {
234-
token_info_t *freelist;
235-
int allocated_count;
236-
} token_pool_t;
238+
struct token *next;
239+
} token_t;
237240

238-
/* Token buffer for improved lookahead */
239-
#define TOKEN_BUFFER_SIZE 8
240-
typedef struct {
241-
token_info_t *tokens[TOKEN_BUFFER_SIZE];
242-
int head;
243-
int tail;
244-
int count;
245-
} token_buffer_t;
241+
typedef struct token_stream {
242+
token_t *head;
243+
token_t *tail;
244+
} token_stream_t;
246245

247246
/* String pool for identifier deduplication */
248247
typedef struct {
@@ -387,7 +386,7 @@ struct var {
387386
int in_loop;
388387
struct var *base;
389388
int subscript;
390-
struct var *subscripts[64];
389+
struct var *subscripts[128];
391390
int subscripts_idx;
392391
rename_t rename;
393392
ref_block_list_t ref_block_list; /* blocks which kill variable */
@@ -412,25 +411,13 @@ struct var {
412411
bool ofs_based_on_stack_top;
413412
};
414413

415-
typedef struct {
416-
char name[MAX_VAR_LEN];
417-
bool is_variadic;
418-
int start_source_idx;
419-
var_t param_defs[MAX_PARAMS];
420-
int num_param_defs;
421-
int params[MAX_PARAMS];
422-
int num_params;
423-
bool disabled;
424-
} macro_t;
425-
426414
typedef struct func func_t;
427415

428416
/* block definition */
429417
struct block {
430418
var_list_t locals;
431419
struct block *parent;
432420
func_t *func;
433-
macro_t *macro;
434421
struct block *next;
435422
};
436423

@@ -494,13 +481,6 @@ typedef struct {
494481
type_t *type;
495482
} lvalue_t;
496483

497-
/* alias for #defines */
498-
typedef struct {
499-
char alias[MAX_VAR_LEN];
500-
char value[MAX_VAR_LEN];
501-
bool disabled;
502-
} alias_t;
503-
504484
/* constants for enums */
505485
typedef struct {
506486
char alias[MAX_VAR_LEN];

0 commit comments

Comments
 (0)