diff --git a/BUILD.md b/BUILD.md index 37f84ea2f..c35fa0cf2 100644 --- a/BUILD.md +++ b/BUILD.md @@ -63,6 +63,9 @@ The configure script has many options (to see them all, run * `--enable-ocaml` Build re2ocaml (identical to `re2c --lang ocaml`). This is on by default. + * `--enable-php` + Build re2php (identical to `re2c --lang php`). This is on by default. + * `--enable-python` Build re2py (identical to `re2c --lang python`). This is on by default. diff --git a/CMakeLists.txt b/CMakeLists.txt index ced463875..37292362f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,6 +40,7 @@ option(RE2C_BUILD_RE2HS "Build re2hs executable (an alias for `re2c --lang haske option(RE2C_BUILD_RE2JAVA "Build re2java executable (an alias for `re2c --lang java`)" ON) option(RE2C_BUILD_RE2JS "Build re2js executable (an alias for `re2c --lang js`)" ON) option(RE2C_BUILD_RE2OCAML "Build re2ocaml executable (an alias for `re2c --lang ocaml`)" ON) +option(RE2C_BUILD_RE2PHP "Build re2php executable (an alias for `re2c --lang php`)" ON) option(RE2C_BUILD_RE2PY "Build re2py executable (an alias for `re2c --lang python`)" ON) option(RE2C_BUILD_RE2RUST "Build re2rust executable (an alias for `re2c --lang rust`)" ON) option(RE2C_BUILD_RE2V "Build re2v executable (an alias for `re2c --lang v`)" ON) @@ -92,6 +93,7 @@ set(re2c_manpage_haskell "${CMAKE_CURRENT_BINARY_DIR}/doc/re2hs.1") set(re2c_manpage_java "${CMAKE_CURRENT_BINARY_DIR}/doc/re2java.1") set(re2c_manpage_js "${CMAKE_CURRENT_BINARY_DIR}/doc/re2js.1") set(re2c_manpage_ocaml "${CMAKE_CURRENT_BINARY_DIR}/doc/re2ocaml.1") +set(re2c_manpage_php "${CMAKE_CURRENT_BINARY_DIR}/doc/re2php.1") set(re2c_manpage_python "${CMAKE_CURRENT_BINARY_DIR}/doc/re2py.1") set(re2c_manpage_rust "${CMAKE_CURRENT_BINARY_DIR}/doc/re2rust.1") set(re2c_manpage_v "${CMAKE_CURRENT_BINARY_DIR}/doc/re2v.1") @@ -103,6 +105,7 @@ set(re2c_help_haskell "${CMAKE_CURRENT_BINARY_DIR}/src/msg/help_re2hs.cc") set(re2c_help_java 
"${CMAKE_CURRENT_BINARY_DIR}/src/msg/help_re2java.cc") set(re2c_help_js "${CMAKE_CURRENT_BINARY_DIR}/src/msg/help_re2js.cc") set(re2c_help_ocaml "${CMAKE_CURRENT_BINARY_DIR}/src/msg/help_re2ocaml.cc") +set(re2c_help_php "${CMAKE_CURRENT_BINARY_DIR}/src/msg/help_re2php.cc") set(re2c_help_python "${CMAKE_CURRENT_BINARY_DIR}/src/msg/help_re2py.cc") set(re2c_help_rust "${CMAKE_CURRENT_BINARY_DIR}/src/msg/help_re2rust.cc") set(re2c_help_v "${CMAKE_CURRENT_BINARY_DIR}/src/msg/help_re2v.cc") @@ -172,6 +175,7 @@ add_library(re2c_objects_autogen OBJECT "${CMAKE_CURRENT_BINARY_DIR}/src/default_syntax_java.h" "${CMAKE_CURRENT_BINARY_DIR}/src/default_syntax_js.h" "${CMAKE_CURRENT_BINARY_DIR}/src/default_syntax_ocaml.h" + "${CMAKE_CURRENT_BINARY_DIR}/src/default_syntax_php.h" "${CMAKE_CURRENT_BINARY_DIR}/src/default_syntax_python.h" "${CMAKE_CURRENT_BINARY_DIR}/src/default_syntax_rust.h" "${CMAKE_CURRENT_BINARY_DIR}/src/default_syntax_v.h" @@ -260,6 +264,7 @@ re2c_bootstrap_syntax("include/syntax/haskell" "src/default_syntax_haskell.h") re2c_bootstrap_syntax("include/syntax/java" "src/default_syntax_java.h") re2c_bootstrap_syntax("include/syntax/js" "src/default_syntax_js.h") re2c_bootstrap_syntax("include/syntax/ocaml" "src/default_syntax_ocaml.h") +re2c_bootstrap_syntax("include/syntax/php" "src/default_syntax_php.h") re2c_bootstrap_syntax("include/syntax/python" "src/default_syntax_python.h") re2c_bootstrap_syntax("include/syntax/rust" "src/default_syntax_rust.h") re2c_bootstrap_syntax("include/syntax/v" "src/default_syntax_v.h") @@ -344,6 +349,17 @@ if (RE2C_BUILD_RE2OCAML) ) endif() +# re2php +if (RE2C_BUILD_RE2PHP) + re2c_gen_help("${re2c_help_source}" "${re2c_help_php}") + re2c_gen_manpage("${re2c_manpage_source}" "${re2c_manpage_php}") + add_executable(re2php ${re2c_sources} "${re2c_help_php}") + target_compile_definitions(re2php PUBLIC + "RE2C_LANG=Lang::PHP" + "RE2C_PROG=\"re2php\"" + ) +endif() + # re2py if (RE2C_BUILD_RE2PY) re2c_gen_help("${re2c_help_source}" 
"${re2c_help_python}") diff --git a/Makefile.am b/Makefile.am index dac11e256..4b85255e6 100644 --- a/Makefile.am +++ b/Makefile.am @@ -149,6 +149,7 @@ re2c_GEN_STX = \ src/default_syntax_java.h \ src/default_syntax_js.h \ src/default_syntax_ocaml.h \ + src/default_syntax_php.h \ src/default_syntax_python.h \ src/default_syntax_rust.h \ src/default_syntax_v.h \ @@ -173,6 +174,7 @@ re2c_BOOT = \ bootstrap/doc/re2java.1 \ bootstrap/doc/re2js.1 \ bootstrap/doc/re2ocaml.1 \ + bootstrap/doc/re2php.1 \ bootstrap/doc/re2py.1 \ bootstrap/doc/re2rust.1 \ bootstrap/doc/re2v.1 \ @@ -184,6 +186,7 @@ re2c_BOOT = \ bootstrap/src/default_syntax_java.h \ bootstrap/src/default_syntax_js.h \ bootstrap/src/default_syntax_ocaml.h \ + bootstrap/src/default_syntax_php.h \ bootstrap/src/default_syntax_python.h \ bootstrap/src/default_syntax_rust.h \ bootstrap/src/default_syntax_v.h \ @@ -196,6 +199,7 @@ re2c_BOOT = \ bootstrap/src/msg/help_re2js.cc \ bootstrap/src/msg/help_re2ocaml.cc \ bootstrap/src/msg/help_re2py.cc \ + bootstrap/src/msg/help_re2php.cc \ bootstrap/src/msg/help_re2rust.cc \ bootstrap/src/msg/help_re2v.cc \ bootstrap/src/msg/help_re2zig.cc \ @@ -234,6 +238,7 @@ re2c_SRC_DOC_EXT = \ doc/manual/basics/api/api2_rust.rst_ \ doc/manual/basics/api/api2_v.rst_ \ doc/manual/basics/api/api2_zig.rst_ \ + doc/manual/basics/api/api2_php.rst_ \ doc/manual/basics/api/api3.rst_ \ doc/manual/basics/blocks.rst_ \ doc/manual/basics/directives.rst_ \ @@ -440,6 +445,29 @@ re2c_SRC_DOC_EXT = \ examples/ocaml/submatch/02_mtags.re \ examples/ocaml/submatch/03_captures.re \ examples/ocaml/submatch/04_posix_captures.re \ + examples/php/01_basic.php \ + examples/php/01_basic.re \ + examples/php/conditions/parse_u32_blocks.re \ + examples/php/conditions/parse_u32_conditions.re \ + examples/php/encodings/unicode_identifier.re \ + examples/php/eof/01_sentinel.re \ + examples/php/eof/02_bounds_checking.re \ + examples/php/eof/03_eof_rule.re \ + examples/php/eof/04_fake_sentinel.re \ + 
examples/php/fill/01_fill.re \ + examples/php/fill/02_fill.re \ + examples/php/headers/header.re \ + examples/php/headers/lexer/state.php \ + examples/php/includes/definitions.php \ + examples/php/includes/include.re \ + examples/php/reuse/reuse.re \ + examples/php/reuse/usedir.re \ + examples/php/state/push.re \ + examples/php/submatch/01_stags_fill.re \ + examples/php/submatch/01_stags.re \ + examples/php/submatch/02_mtags.re \ + examples/php/submatch/03_captures.re \ + examples/php/submatch/04_posix_captures.re \ examples/python/01_basic.py \ examples/python/01_basic.re \ examples/python/conditions/parse_u32_blocks.re \ @@ -560,6 +588,10 @@ if WITH_OCAML DOCS += doc/re2ocaml.1 HELP += src/msg/help_re2ocaml.cc endif +if WITH_PHP +DOCS += doc/re2php.1 +HELP += src/msg/help_re2php.cc +endif if WITH_PYTHON DOCS += doc/re2py.1 HELP += src/msg/help_re2py.cc @@ -589,6 +621,7 @@ dist_stdlib_DATA = \ include/syntax/java \ include/syntax/js \ include/syntax/ocaml \ + include/syntax/php \ include/syntax/python \ include/syntax/rust \ include/syntax/v \ @@ -822,6 +855,14 @@ re2ocaml_SOURCES = $(re2c_SOURCES) nodist_re2ocaml_SOURCES = $(re2c_GEN) src/msg/help_re2ocaml.cc endif +# re2php +if WITH_PHP +bin_PROGRAMS += re2php +re2php_CXXFLAGS = $(AM_CXXFLAGS) -DRE2C_LANG=Lang::PHP -DRE2C_PROG=\"re2php\" +re2php_SOURCES = $(re2c_SOURCES) +nodist_re2php_SOURCES = $(re2c_GEN) src/msg/help_re2php.cc +endif + # re2py if WITH_PYTHON bin_PROGRAMS += re2py diff --git a/bootstrap/doc/re2php.1 b/bootstrap/doc/re2php.1 new file mode 100644 index 000000000..c5db6041f --- /dev/null +++ b/bootstrap/doc/re2php.1 @@ -0,0 +1,4038 @@ +.\" Man page generated from reStructuredText. +. +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. 
nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.TH "RE2JS" 1 "" "" +.SH NAME +re2js \- generate fast lexical analyzers for JavaScript +.SH SYNOPSIS +.sp +re2js \fB[ OPTIONS ]\fP \fB[ WARNINGS ]\fP \fBINPUT\fP +.sp +Input can be either a file or \fB\-\fP for stdin. +.SH INTRODUCTION +.sp +re2js works as a preprocessor. It reads the input file (which is usually a +program in JavaScript, but can be anything) and looks for blocks of code +enclosed in special\-form start/end markers. The text outside of these blocks is +copied verbatim into the output file. The contents of the blocks are processed +by re2js\&. It translates them to code in JavaScript and outputs the generated +code in place of the block. 
+.sp +Here is an example of a small program that checks if a given string contains a +decimal number: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +function lex(yyinput) { + let yycursor = 0; + /*!re2c + re2c:yyfill:enable = 0; + + [1\-9][0\-9]* { return true; } + * { return false; } + */ +} + +if (!lex(\(dq1234\e0\(dq)) { + throw \(dqerror!\(dq +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +In the output re2js replaced the block in the middle with the generated code: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// Generated by re2js +// re2js $INPUT \-o $OUTPUT + +function lex(yyinput) { + let yycursor = 0; + +{ + let yych = 0 + let yystate = 0 + yyl: while (true) { + switch (yystate) { + case 0: + yych = yyinput.charCodeAt(yycursor) + yycursor += 1; + switch (yych) { + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yystate = 2 + continue yyl + default: + yystate = 1 + continue yyl + } + case 1: + { return false; } + case 2: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 2 + continue yyl + default: + yystate = 3 + continue yyl + } + case 3: + { return true; } + default: + throw \(dqinternal lexer error\(dq + } + } +} + +} + +if (!lex(\(dq1234\e0\(dq)) { + throw \(dqerror!\(dq +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH BASICS +.sp +A re2js program consists of a sequence of \fIblocks\fP intermixed with code in the +target language. A block may contain \fIdefinitions\fP, \fIconfigurations\fP, \fIrules\fP +and \fIdirectives\fP in any order: +.INDENT 0.0 +.TP +.B \fB = ;\fP +A \fIdefinition\fP binds a name to a regular expression. Names may contain +alphanumeric characters and underscore. The \fI\%regular expressions\fP section +gives an overview of re2js syntax for regular expressions. 
Once defined, +the name can be used in other regular expressions and in rules. Recursion in +named definitions is not allowed, and each name should be defined before it +is used. A block inherits named definitions from the global scope. +Redefining a name that exists in the current scope is an error. +.TP +.B \fB<configuration> = <value>;\fP +A \fIconfiguration\fP allows one to change re2js behavior and customize the +generated code. For a full list of configurations supported by re2js see +the \fI\%configurations\fP section. Depending on a particular configuration, the +value can be a keyword, a nonnegative integer number or a one\-line string +which should be enclosed in double or single quotes unless it consists of +alphanumeric characters. A block inherits configurations from the global +scope and may redefine them or add new ones. Configurations defined inside +of a block affect the whole block, even if they appear at the end of it. +.TP +.B \fB<regular expression> { <code> }\fP +A \fIrule\fP binds a regular expression to a semantic action (a block of code in +the target language). If the regular expression matches, the associated +semantic action is executed. If multiple rules match, the longest match +takes precedence. If multiple rules match the same string, the earliest one +takes precedence. There are two special rules: the default rule \fB*\fP and +the end of input rule \fB$\fP\&. The default rule should always be defined, it +has the lowest priority regardless of its place in the block, and it matches +any code unit (not necessarily a valid character, see the +\fI\%encoding support\fP section). The end of input rule should be defined if the +corresponding method for \fI\%handling the end of input\fP is used. If +\fI\%start conditions\fP are used, rules have more complex syntax. +.TP +.B \fB!<directive>;\fP +A \fIdirective\fP is one of the special predefined statements. Each directive +has a unique purpose.
For example, the \fB!use\fP directive merges a rules +block into the current one (see the \fI\%reusable blocks\fP section), and the +\fB!include\fP directive allows one to include an outer file (see the +\fI\%include files\fP section). +.UNINDENT +.SS Blocks +.sp +Block start and end markers are either \fB/*!re2c\fP and \fB*/\fP, or \fB%{\fP and +\fB%}\fP (both styles are supported). Starting from version 2.2 blocks may have +optional names that allow them to be referenced in other blocks. +There are different kinds of blocks: +.INDENT 0.0 +.TP +.B \fB/*!re2c[:<name>] ... */\fP or \fB%{[:<name>] ... %}\fP +A \fIglobal block\fP contains definitions, configurations, rules and directives. +re2js compiles regular expressions associated with each rule into a +deterministic finite automaton, encodes it in the form of conditional jumps +in the target language and replaces the block with the generated code. Names +and configurations defined in a global block are added to the global scope +and become visible to subsequent blocks. At the start of the program the +global scope is initialized with command\-line \fI\%options\fP\&. +.TP +.B \fB/*!local:re2c[:<name>] ... */\fP or \fB%{local[:<name>] ... %}\fP +A \fIlocal block\fP is like a global block, but the names and configurations in +it have local scope (they do not affect other blocks). +.TP +.B \fB/*!rules:re2c[:<name>] ... */\fP or \fB%{rules[:<name>] ... %}\fP +A \fIrules block\fP is like a local block, but it does not generate any code by +itself, nor does it add any definitions to the global scope \-\- it is meant +to be reused in other blocks. This is a way of sharing code (more details in +the \fI\%reusable blocks\fP section). Prior to re2js version 2.2 rules blocks +required \fB\-r \-\-reusable\fP option. +.TP +.B \fB/*!use:re2c[:<name>] ... */\fP or \fB%{use[:<name>] ... %}\fP +A use block that references a previously defined rules block. If the name is +specified, re2js looks for a rules block with this name.
Otherwise the most +recent rules block is used (either a named or an unnamed one). A use block +can add definitions, configurations and rules of its own, which are added to +those of the referenced rules block. Prior to re2js version 2.2 use blocks +required \fB\-r \-\-reusable\fP option. +.TP +.B \fB/*!max:re2c[:<name1>[:<name2>...]] ... */\fP or \fB%{max[:<name1>[:<name2>...]] ... %}\fP +A block that generates \fBYYMAXFILL\fP definition. An optional list of block +names specifies which blocks should be included when computing \fBYYMAXFILL\fP +value (if the list is empty, all blocks are included). +By default the generated code is a macro\-definition for C +(\fB#define YYMAXFILL <n>\fP), or a global variable for Go +(\fBvar YYMAXFILL int = <n>\fP). It can be customized with an optional +configuration \fBformat\fP that specifies a template string where \fB@@{max}\fP +(or \fB@@\fP for short) is replaced with the numeric value of \fBYYMAXFILL\fP\&. +.TP +.B \fB/*!maxnmatch:re2c[:<name1>[:<name2>...]] ... */\fP or \fB%{maxnmatch[:<name1>[:<name2>...]] ... %}\fP +A block that generates \fBYYMAXNMATCH\fP definition (it requires +\fB\-P \-\-posix\-captures\fP option). An optional list of block names specifies +which blocks should be included when computing \fBYYMAXNMATCH\fP value (if the +list is empty, all blocks are included). +By default the generated code is a macro\-definition for C +(\fB#define YYMAXNMATCH <n>\fP), or a global variable for Go +(\fBvar YYMAXNMATCH int = <n>\fP). It can be customized with an optional +configuration \fBformat\fP that specifies a template string where \fB@@{max}\fP +(or \fB@@\fP for short) is replaced with the numeric value of \fBYYMAXNMATCH\fP\&. +.TP +.B \fB/*!stags:re2c[:<name1>[:<name2>...]] ... */\fP, \fB/*!mtags:re2c[:<name1>[:<name2>...]] ... */\fP or \fB%{stags[:<name1>[:<name2>...]] ... %}\fP, \fB%{mtags[:<name1>[:<name2>...]] ... %}\fP +Blocks that specify a template piece of code that is expanded for each +s\-tag/m\-tag variable generated by re2js\&.
An optional list of block names +specifies which blocks should be included when computing the set of tag +variables (if the list is empty, all blocks are included). +There are two optional configurations: \fBformat\fP and \fBseparator\fP\&. +Configuration \fBformat\fP specifies a template string where \fB@@{tag}\fP (or +\fB@@\fP for short) is replaced with the name of each tag variable. +Configuration \fBseparator\fP specifies a piece of code used to join the +generated \fBformat\fP pieces for different tag variables. +.TP +.B \fB/*!svars:re2c[:<name1>[:<name2>...]] ... */\fP, \fB/*!mvars:re2c[:<name1>[:<name2>...]] ... */\fP or \fB%{svars[:<name1>[:<name2>...]] ... %}\fP, \fB%{mvars[:<name1>[:<name2>...]] ... %}\fP +Blocks that specify a template piece of code that is expanded for each +s\-tag/m\-tag that is either explicitly mentioned by the rules (with +\fB\-\-tags\fP option) or implicitly generated by re2js (with \fB\-\-captvars\fP or +\fB\-\-posix\-captvars\fP options). An optional list of block names specifies +which blocks should be included when computing the set of tags (if the list +is empty, all blocks are included). +There are two optional configurations: \fBformat\fP and \fBseparator\fP\&. +Configuration \fBformat\fP specifies a template string where \fB@@{tag}\fP (or +\fB@@\fP for short) is replaced with the name of each tag. +Configuration \fBseparator\fP specifies a piece of code used to join the +generated \fBformat\fP pieces for different tags. +.TP +.B \fB/*!getstate:re2c[:<name1>[:<name2>...]] ... */\fP or \fB%{getstate[:<name1>[:<name2>...]] ... %}\fP +A block that generates conditional dispatch on the lexer state (it requires +\fB\-\-storable\-state\fP option). An optional list of block names specifies +which blocks should be included in the state dispatch. The default +transition goes to the start label of the first block on the list. If the +list is empty, all blocks are included, and the default transition goes to +the first block in the file that has a start label.
+This block type is incompatible with the \fB\-\-loop\-switch\fP option, as it +requires cross\-block transitions that are unsupported without \fBgoto\fP or +function calls. +.TP +.B \fB/*!conditions:re2c[:<name1>[:<name2>...]] ... */\fP, \fB/*!types:re2c... */\fP or \fB%{conditions[:<name1>[:<name2>...]] ... %}\fP, \fB%{types... %}\fP +A block that generates condition enumeration (it requires \fB\-\-conditions\fP +option). An optional list of block names specifies which blocks should be +included when computing the set of conditions (if the list is empty, all +blocks are included). +By default the generated code is an enumeration \fBYYCONDTYPE\fP\&. It can be +customized with optional configurations \fBformat\fP and \fBseparator\fP\&. +Configuration \fBformat\fP specifies a template string where \fB@@{cond}\fP (or +\fB@@\fP for short) is replaced with the name of each condition, and +\fB@@{num}\fP is replaced with a numeric index of that condition. +Configuration \fBseparator\fP specifies a piece of code used to join the +generated \fBformat\fP pieces for different conditions. +.TP +.B \fB/*!include:re2c <file> */\fP or \fB%{include <file> %}\fP +This block allows one to include \fB<file>\fP, which must be a double\-quoted +file path. The contents of the file are literally substituted in place of +the block, in the same way as \fB#include\fP works in C/C++. This block can be +used together with the \fB\-\-depfile\fP option to generate build system +dependencies on the included files. +.TP +.B \fB/*!header:re2c:on*/\fP or \fB%{header:on %}\fP +This block marks the start of header file. Everything after it and up to the +following \fBheader:off\fP block is processed by re2js and written to the +header file specified with \fB\-t \-\-type\-header\fP option. +.TP +.B \fB/*!header:re2c:off*/\fP or \fB%{header:off %}\fP +This block marks the end of header file started with \fBheader:on\fP block. +.TP +.B \fB/*!ignore:re2c ... */\fP or \fB%{ignore ... 
%}\fP +A block whose contents are ignored and removed from the output file. +.UNINDENT +.SS Directives +.sp +Here is a full list of directives supported by re2js: +.INDENT 0.0 +.TP +.B \fB!use:<name>;\fP +An in\-block use directive that merges a previously defined rules block with +the specified name into the current block. Named definitions, configurations +and rules of the referenced block are added to the current ones. Conflicts +between overlapping rules and configurations are resolved in the usual way: +the first rule takes priority, and the latest configuration overrides the +preceding ones. One exception is the special rules \fB*\fP, \fB$\fP and \fB<!>\fP +for which a block\-local definition always takes priority. A use directive +can be placed anywhere inside of a block, and multiple use directives are +allowed. +.TP +.B \fB!include <file>;\fP +This directive is the same as \fBinclude\fP block: it inserts \fB<file>\fP +contents verbatim in place of the directive. +.UNINDENT +.SS Regular expressions +.sp +re2js uses the following syntax for regular expressions: +.INDENT 0.0 +.TP +.B \fB\(dqfoo\(dq\fP +Case\-sensitive string literal. +.TP +.B \fB\(aqfoo\(aq\fP +Case\-insensitive string literal. +.TP +.B \fB[a\-xyz]\fP, \fB[^a\-xyz]\fP +Character class (possibly negated). +.TP +.B \fB\&.\fP +Any character except newline. +.TP +.B \fBR \e S\fP +Difference of character classes \fBR\fP and \fBS\fP\&. +.TP +.B \fBR*\fP +Zero or more occurrences of \fBR\fP\&. +.TP +.B \fBR+\fP +One or more occurrences of \fBR\fP\&. +.TP +.B \fBR?\fP +Optional \fBR\fP\&. +.TP +.B \fBR{n}\fP +Repetition of \fBR\fP exactly \fBn\fP times. +.TP +.B \fBR{n,}\fP +Repetition of \fBR\fP at least \fBn\fP times. +.TP +.B \fBR{n,m}\fP +Repetition of \fBR\fP from \fBn\fP to \fBm\fP times. +.TP +.B \fB(R)\fP +Just \fBR\fP; parentheses are used to override precedence. If submatch +extraction is enabled, \fB(R)\fP is a capturing or a non\-capturing group +depending on \fB\-\-invert\-captures\fP option.
+.TP +.B \fB(!R)\fP +If submatch extraction is enabled, \fB(!R)\fP is a non\-capturing or a +capturing group depending on \fB\-\-invert\-captures\fP option. +.TP +.B \fBR S\fP +Concatenation: \fBR\fP followed by \fBS\fP\&. +.TP +.B \fBR | S\fP +Alternative: \fBR or S\fP\&. +.TP +.B \fBR / S\fP +Lookahead: \fBR\fP followed by \fBS\fP, but \fBS\fP is not consumed. +.TP +.B \fBname\fP +Regular expression defined as \fBname\fP (or literal string \fB\(dqname\(dq\fP in +Flex compatibility mode). +.TP +.B \fB{name}\fP +Regular expression defined as \fBname\fP in Flex compatibility mode. +.TP +.B \fB@stag\fP +An \fIs\-tag\fP: saves the last input position at which \fB@stag\fP matches in a +variable named \fBstag\fP\&. +.TP +.B \fB#mtag\fP +An \fIm\-tag\fP: saves all input positions at which \fB#mtag\fP matches in a +variable named \fBmtag\fP\&. +.UNINDENT +.sp +Character classes and string literals may contain the following escape +sequences: \fB\ea\fP, \fB\eb\fP, \fB\ef\fP, \fB\en\fP, \fB\er\fP, \fB\et\fP, \fB\ev\fP, \fB\e\e\fP, +octal escapes \fB\eooo\fP and hexadecimal escapes \fB\exhh\fP, \fB\euhhhh\fP and +\fB\eUhhhhhhhh\fP\&. +.SS Configurations +.sp +Here is a full list of configurations supported by re2js: +.INDENT 0.0 +.TP +.B \fBre2c:api\fP, \fBre2c:input\fP +Same as the \fB\-\-api\fP option. +.TP +.B \fBre2c:api:sigil\fP +Specify the marker (\(dqsigil\(dq) that is used for argument placeholders in the +API primitives. The default is \fB@@\fP\&. A placeholder starts with sigil +followed by the argument name in curly braces. For example, if sigil is set +to \fB$\fP, then placeholders will have the form \fB${name}\fP\&. Single\-argument +APIs may use shorthand notation without the name in braces. This option can +be overridden by options for individual API primitives, e.g. +\fBre2c:YYFILL@len\fP for \fBYYFILL\fP\&. +.TP +.B \fBre2c:api:style\fP +Specify API style. 
Possible values are \fBfunctions\fP (the default for C) and +\fBfree\-form\fP (the default for Go and Rust). +In \fBfunctions\fP style API primitives are generated with an argument list in +parentheses following the name of the primitive. The arguments are provided +only for autogenerated parameters (such as the number of characters passed +to \fBYYFILL\fP), but not for the general lexer context, so the primitives +behave more like macros in C/C++ or closures in Go and Rust. +In free\-form style API primitives do not have a fixed form: they should be +defined as strings containing free\-form pieces of code with interpolated +variables of the form \fB@@{var}\fP or \fB@@\fP (they correspond to arguments in +function\-like style). +This configuration may be overridden for individual API primitives, see for +example \fBre2c:YYFILL:naked\fP configuration for \fBYYFILL\fP\&. +.TP +.B \fBre2c:bit\-vectors\fP, \fBre2c:flags:bit\-vectors\fP, \fBre2c:flags:b\fP +Same as the \fB\-\-bit\-vectors\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:captures\fP, \fBre2c:leftmost\-captures\fP +Same as the \fB\-\-leftmost\-captures\fP option, but can be configured on +per\-block basis. +.TP +.B \fBre2c:captvars\fP, \fBre2c:leftmost\-captvars\fP +Same as the \fB\-\-leftmost\-captvars\fP option, but can be configured on +per\-block basis. +.TP +.B \fBre2c:case\-insensitive\fP, \fBre2c:flags:case\-insensitive\fP +Same as the \fB\-\-case\-insensitive\fP option, but can be configured on +per\-block basis. +.TP +.B \fBre2c:case\-inverted\fP, \fBre2c:flags:case\-inverted\fP +Same as the \fB\-\-case\-inverted\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:case\-ranges\fP, \fBre2c:flags:case\-ranges\fP +Same as the \fB\-\-case\-ranges\fP option, but can be configured on per\-block +basis. 
+.TP +.B \fBre2c:computed\-gotos\fP, \fBre2c:flags:computed\-gotos\fP, \fBre2c:flags:g\fP +Same as the \fB\-\-computed\-gotos\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:computed\-gotos:threshold\fP, \fBre2c:cgoto:threshold\fP +If computed \fBgoto\fP is used, this configuration specifies the complexity +threshold that triggers the generation of jump tables instead of nested +\fBif\fP statements and bitmaps. The default value is \fB9\fP\&. +.TP +.B \fBre2c:cond:abort\fP +If set to a positive integer value, the default case in the generated +condition dispatch aborts program execution. +.TP +.B \fBre2c:cond:goto\fP +Specifies a piece of code used for the autogenerated shortcut rules \fB:=>\fP +in conditions. The default is \fBgoto @@;\fP\&. +The \fB@@\fP placeholder is substituted with condition name (see +configurations \fBre2c:api:sigil\fP and \fBre2c:cond:goto@cond\fP). +.TP +.B \fBre2c:cond:goto@cond\fP +Specifies the sigil used for argument substitution in \fBre2c:cond:goto\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:cond:divider\fP +Defines the divider for condition blocks. +The default value is \fB/* *********************************** */\fP\&. +Placeholders are substituted with condition name (see \fBre2c:api:sigil\fP and +\fBre2c:cond:divider@cond\fP). +.TP +.B \fBre2c:cond:divider@cond\fP +Specifies the sigil used for argument substitution in \fBre2c:cond:divider\fP +definition. The default is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:cond:prefix\fP, \fBre2c:condprefix\fP +Specifies the prefix used for condition labels. +The default is \fByyc_\fP\&. +.TP +.B \fBre2c:cond:enumprefix\fP, \fBre2c:condenumprefix\fP +Specifies the prefix used for condition identifiers. +The default is \fByyc\fP\&. 
+.TP +.B \fBre2c:debug\-output\fP, \fBre2c:flags:debug\-output\fP, \fBre2c:flags:d\fP +Same as the \fB\-\-debug\-output\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:empty\-class\fP, \fBre2c:flags:empty\-class\fP +Same as the \fB\-\-empty\-class\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:encoding:ebcdic\fP, \fBre2c:flags:ecb\fP, \fBre2c:flags:e\fP +Same as the \fB\-\-ebcdic\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:ucs2\fP, \fBre2c:flags:wide\-chars\fP, \fBre2c:flags:w\fP +Same as the \fB\-\-ucs2\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf8\fP, \fBre2c:flags:utf\-8\fP, \fBre2c:flags:8\fP +Same as the \fB\-\-utf8\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf16\fP, \fBre2c:flags:utf\-16\fP, \fBre2c:flags:x\fP +Same as the \fB\-\-utf16\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding:utf32\fP, \fBre2c:flags:unicode\fP, \fBre2c:flags:u\fP +Same as the \fB\-\-utf32\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:encoding\-policy\fP, \fBre2c:flags:encoding\-policy\fP +Same as the \fB\-\-encoding\-policy\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:eof\fP +Specifies the sentinel symbol used with the end\-of\-input rule \fB$\fP\&. The +default value is \fB\-1\fP (\fB$\fP rule is not used). Other possible values +include all valid code units. Only decimal numbers are recognized. +.TP +.B \fBre2c:fn:sep\fP +Specifies separator used in \fBYYFN\fP elements (defaults to semicolon). +.TP +.B \fBre2c:header\fP, \fBre2c:flags:type\-header\fP, \fBre2c:flags:t\fP +Specifies the name of the generated header file relative to the directory of +the output file. Same as the \fB\-\-header\fP option except that the file path +is relative. +.TP +.B \fBre2c:indent:string\fP +Specifies the string used for indentation. 
The default is a single tab +character \fB\(dq\et\(dq\fP\&. Indent string should contain whitespace characters only. +To disable indentation entirely, set this configuration to an empty string. +.TP +.B \fBre2c:indent:top\fP +Specifies the minimum amount of indentation to use. The default value is +zero. The value should be a non\-negative integer number. +.TP +.B \fBre2c:invert\-captures\fP +Same as the \fB\-\-invert\-captures\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:label:prefix\fP, \fBre2c:labelprefix\fP +Specifies the prefix used for DFA state labels. The default is \fByy\fP\&. +.TP +.B \fBre2c:label:start\fP, \fBre2c:startlabel\fP +Controls the generation of a block start label. The default value is zero, +which means that the start label is generated only if it is used. An integer +value greater than zero forces the generation of start label even if it is +unused by the lexer. A string value also forces start label generation and +sets the label name to the specified string. This configuration applies only +to the current block (it is reset to default for the next block). +.TP +.B \fBre2c:label:yyFillLabel\fP +Specifies the prefix of \fBYYFILL\fP labels used with \fBre2c:eof\fP and in +storable state mode. +.TP +.B \fBre2c:label:yyloop\fP +Specifies the name of the label marking the start of the lexer loop with +\fB\-\-loop\-switch\fP option. The default is \fByyloop\fP\&. +.TP +.B \fBre2c:label:yyNext\fP +Specifies the name of the optional label that follows \fBYYGETSTATE\fP switch +in storable state mode (enabled with \fBre2c:state:nextlabel\fP). The default +is \fByyNext\fP\&. +.TP +.B \fBre2c:lookahead\fP, \fBre2c:flags:lookahead\fP +Deprecated (see the deprecated \fB\-\-no\-lookahead\fP option). +.TP +.B \fBre2c:monadic\fP +If set to non\-zero, the generated lexer will use monadic notation (this +configuration is specific to Haskell). 
+.TP +.B \fBre2c:nested\-ifs\fP, \fBre2c:flags:nested\-ifs\fP, \fBre2c:flags:s\fP +Same as the \fB\-\-nested\-ifs\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:posix\-captures\fP, \fBre2c:flags:posix\-captures\fP, \fBre2c:flags:P\fP +Same as the \fB\-\-posix\-captures\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:posix\-captvars\fP +Same as the \fB\-\-posix\-captvars\fP option, but can be configured on per\-block +basis. +.TP +.B \fBre2c:tags\fP, \fBre2c:flags:tags\fP, \fBre2c:flags:T\fP +Same as the \fB\-\-tags\fP option, but can be configured on per\-block basis. +.TP +.B \fBre2c:tags:expression\fP +Specifies the expression used for tag variables. +By default re2js generates expressions of the form \fByyt\fP\&. This might +be inconvenient, for example if tag variables are defined as fields in a +struct. All occurrences of \fB@@{tag}\fP or \fB@@\fP are replaced with the +actual tag name. For example, \fBre2c:tags:expression = \(dqs.@@\(dq;\fP results +in expressions of the form \fBs.yyt\fP in the generated code. +See also \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:tags:negative\fP +Specifies the constant expression that is used for negative tag value +(typically this would be \fB\-1\fP if tags are integer offsets in the input +string, or null pointer if they are pointers). +.TP +.B \fBre2c:tags:prefix\fP +Specifies the prefix for tag variable names. The default is \fByyt\fP\&. +.TP +.B \fBre2c:sentinel\fP +Specifies the sentinel symbol used for the end\-of\-input checks (when bounds +checks are disabled with \fBre2c:yyfill:enable = 0;\fP and \fBre2c:eof\fP is not +set). This configuration does not affect code generation: its purpose is to +verify that the sentinel is not allowed in the middle of a rule, and ensure +that the lexer won\(aqt read past the end of buffer. The default value is +\fB\-1\fP (in that case re2js assumes that the sentinel is zero, which is the +most common case).
Only decimal numbers are recognized. +.TP +.B \fBre2c:state:abort\fP +If set to a positive integer value, the default case in the generated +state dispatch aborts program execution, and an explicit \fB\-1\fP case +contains transition to the start of the block. +.TP +.B \fBre2c:state:nextlabel\fP +Controls if the \fBYYGETSTATE\fP switch is followed by an \fByyNext\fP label +(the default value is zero, which corresponds to no label). +Alternatively one can use \fBre2c:label:start\fP to generate a specific start +label, or an explicit \fBgetstate\fP block to generate the \fBYYGETSTATE\fP +switch separately from the lexer block. +.TP +.B \fBre2c:unsafe\fP, \fBre2c:flags:unsafe\fP +Same as the \fB\-\-no\-unsafe\fP option, but can be configured on per\-block +basis. +If set to zero, it suppresses the generation of \fBunsafe\fP wrappers around +\fBYYPEEK\fP\&. The default is non\-zero (wrappers are generated). +This configuration is specific to Rust. +.TP +.B \fBre2c:YYBACKUP\fP, \fBre2c:define:YYBACKUP\fP +Defines generic API primitive \fBYYBACKUP\fP\&. +.TP +.B \fBre2c:YYBACKUPCTX\fP, \fBre2c:define:YYBACKUPCTX\fP +Defines generic API primitive \fBYYBACKUPCTX\fP\&. +.TP +.B \fBre2c:YYCONDTYPE\fP, \fBre2c:define:YYCONDTYPE\fP +Defines API primitive \fBYYCONDTYPE\fP\&. +.TP +.B \fBre2c:YYCTYPE\fP, \fBre2c:define:YYCTYPE\fP +Defines API primitive \fBYYCTYPE\fP\&. +.TP +.B \fBre2c:YYCTXMARKER\fP, \fBre2c:define:YYCTXMARKER\fP +Defines API primitive \fBYYCTXMARKER\fP\&. +.TP +.B \fBre2c:YYCURSOR\fP, \fBre2c:define:YYCURSOR\fP +Defines API primitive \fBYYCURSOR\fP\&. +.TP +.B \fBre2c:YYDEBUG\fP, \fBre2c:define:YYDEBUG\fP +Defines API primitive \fBYYDEBUG\fP\&. +.TP +.B \fBre2c:YYFILL\fP, \fBre2c:define:YYFILL\fP +Defines API primitive \fBYYFILL\fP\&. +.TP +.B \fBre2c:YYFILL@len\fP, \fBre2c:define:YYFILL@len\fP +Specifies the sigil used for argument substitution in \fBYYFILL\fP +definition. Defaults to \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. 
+.TP +.B \fBre2c:YYFILL:naked\fP, \fBre2c:define:YYFILL:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for \fBYYFILL\fP\&. +Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:YYFN\fP +Defines API primitive \fBYYFN\fP\&. +.TP +.B \fBre2c:YYINPUT\fP +Defines API primitive \fBYYINPUT\fP\&. +.TP +.B \fBre2c:YYGETCOND\fP, \fBre2c:define:YYGETCONDITION\fP +Defines API primitive \fBYYGETCOND\fP\&. +.TP +.B \fBre2c:YYGETCOND:naked\fP, \fBre2c:define:YYGETCONDITION:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYGETCOND\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:YYGETSTATE\fP, \fBre2c:define:YYGETSTATE\fP +Defines API primitive \fBYYGETSTATE\fP\&. +.TP +.B \fBre2c:YYGETSTATE:naked\fP, \fBre2c:define:YYGETSTATE:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYGETSTATE\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:YYGETACCEPT\fP, \fBre2c:define:YYGETACCEPT\fP +Defines API primitive \fBYYGETACCEPT\fP\&. +.TP +.B \fBre2c:YYLESSTHAN\fP, \fBre2c:define:YYLESSTHAN\fP +Defines generic API primitive \fBYYLESSTHAN\fP\&. +.TP +.B \fBre2c:YYLIMIT\fP, \fBre2c:define:YYLIMIT\fP +Defines API primitive \fBYYLIMIT\fP\&. +.TP +.B \fBre2c:YYMARKER\fP, \fBre2c:define:YYMARKER\fP +Defines API primitive \fBYYMARKER\fP\&. +.TP +.B \fBre2c:YYMTAGN\fP, \fBre2c:define:YYMTAGN\fP +Defines generic API primitive \fBYYMTAGN\fP\&. +.TP +.B \fBre2c:YYMTAGP\fP, \fBre2c:define:YYMTAGP\fP +Defines generic API primitive \fBYYMTAGP\fP\&. +.TP +.B \fBre2c:YYPEEK\fP, \fBre2c:define:YYPEEK\fP +Defines generic API primitive \fBYYPEEK\fP\&. +.TP +.B \fBre2c:YYRESTORE\fP, \fBre2c:define:YYRESTORE\fP +Defines generic API primitive \fBYYRESTORE\fP\&. +.TP +.B \fBre2c:YYRESTORECTX\fP, \fBre2c:define:YYRESTORECTX\fP +Defines generic API primitive \fBYYRESTORECTX\fP\&. 
+.TP +.B \fBre2c:YYRESTORETAG\fP, \fBre2c:define:YYRESTORETAG\fP +Defines generic API primitive \fBYYRESTORETAG\fP\&. +.TP +.B \fBre2c:YYSETCOND\fP, \fBre2c:define:YYSETCONDITION\fP +Defines API primitive \fBYYSETCOND\fP\&. +.TP +.B \fBre2c:YYSETCOND@cond\fP, \fBre2c:define:YYSETCONDITION@cond\fP +Specifies the sigil used for argument substitution in \fBYYSETCOND\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:YYSETCOND:naked\fP, \fBre2c:define:YYSETCONDITION:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYSETCOND\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:YYSETSTATE\fP, \fBre2c:define:YYSETSTATE\fP +Defines API primitive \fBYYSETSTATE\fP\&. +.TP +.B \fBre2c:YYSETSTATE@state\fP, \fBre2c:define:YYSETSTATE@state\fP +Specifies the sigil used for argument substitution in \fBYYSETSTATE\fP +definition. The default value is \fB@@\fP\&. +Overrides the more generic \fBre2c:api:sigil\fP configuration. +.TP +.B \fBre2c:YYSETSTATE:naked\fP, \fBre2c:define:YYSETSTATE:naked\fP +Overrides the more generic \fBre2c:api:style\fP configuration for +\fBYYSETSTATE\fP\&. Zero value corresponds to free\-form API style. +.TP +.B \fBre2c:YYSETACCEPT\fP, \fBre2c:define:YYSETACCEPT\fP +Defines API primitive \fBYYSETACCEPT\fP\&. +.TP +.B \fBre2c:YYSKIP\fP, \fBre2c:define:YYSKIP\fP +Defines generic API primitive \fBYYSKIP\fP\&. +.TP +.B \fBre2c:YYSHIFT\fP, \fBre2c:define:YYSHIFT\fP +Defines generic API primitive \fBYYSHIFT\fP\&. +.TP +.B \fBre2c:YYCOPYMTAG\fP, \fBre2c:define:YYCOPYMTAG\fP +Defines generic API primitive \fBYYCOPYMTAG\fP\&. +.TP +.B \fBre2c:YYCOPYSTAG\fP, \fBre2c:define:YYCOPYSTAG\fP +Defines generic API primitive \fBYYCOPYSTAG\fP\&. +.TP +.B \fBre2c:YYSHIFTMTAG\fP, \fBre2c:define:YYSHIFTMTAG\fP +Defines generic API primitive \fBYYSHIFTMTAG\fP\&. 
+.TP +.B \fBre2c:YYSHIFTSTAG\fP, \fBre2c:define:YYSHIFTSTAG\fP +Defines generic API primitive \fBYYSHIFTSTAG\fP\&. +.TP +.B \fBre2c:YYSTAGN\fP, \fBre2c:define:YYSTAGN\fP +Defines generic API primitive \fBYYSTAGN\fP\&. +.TP +.B \fBre2c:YYSTAGP\fP, \fBre2c:define:YYSTAGP\fP +Defines generic API primitive \fBYYSTAGP\fP\&. +.TP +.B \fBre2c:yyaccept\fP, \fBre2c:variable:yyaccept\fP +Defines API primitive \fByyaccept\fP\&. +.TP +.B \fBre2c:yybm\fP, \fBre2c:variable:yybm\fP +Defines API primitive \fByybm\fP\&. +.TP +.B \fBre2c:yybm:hex\fP, \fBre2c:variable:yybm:hex\fP +If set to nonzero, bitmaps for the \fB\-\-bit\-vectors\fP option are generated +in hexadecimal format. The default is zero (bitmaps are in decimal format). +.TP +.B \fBre2c:yych\fP, \fBre2c:variable:yych\fP +Defines API primitive \fByych\fP\&. +.TP +.B \fBre2c:yych:emit\fP, \fBre2c:variable:yych:emit\fP +If set to zero, \fByych\fP definition is not generated. +The default is non\-zero. +.TP +.B \fBre2c:yych:conversion\fP, \fBre2c:variable:yych:conversion\fP +If set to non\-zero, re2js automatically generates a conversion to \fBYYCTYPE\fP +every time \fByych\fP is read. The default is zero (no conversion). +.TP +.B \fBre2c:yych:literals\fP, \fBre2c:variable:yych:literals\fP +Specifies the form of literals that \fByych\fP is matched against. Possible +values are: \fBchar\fP (character literals in single quotes, non\-printable +ones use escape sequences that start with backslash), \fBhex\fP (hexadecimal +integers) and \fBchar_or_hex\fP (a mixture of both, character literals for +printable characters and hexadecimal integers for others). +.TP +.B \fBre2c:yyctable\fP, \fBre2c:variable:yyctable\fP +Defines API primitive \fByyctable\fP\&. +.TP +.B \fBre2c:yynmatch\fP, \fBre2c:variable:yynmatch\fP +Defines API primitive \fByynmatch\fP\&. +.TP +.B \fBre2c:yypmatch\fP, \fBre2c:variable:yypmatch\fP +Defines API primitive \fByypmatch\fP\&.
+.TP +.B \fBre2c:yytarget\fP, \fBre2c:variable:yytarget\fP +Defines API primitive \fByytarget\fP\&. +.TP +.B \fBre2c:yystable\fP, \fBre2c:variable:yystable\fP +Deprecated. +.TP +.B \fBre2c:yystate\fP, \fBre2c:variable:yystate\fP +Defines API primitive \fByystate\fP\&. +.TP +.B \fBre2c:yyfill\fP, \fBre2c:variable:yyfill\fP +Defines API primitive \fByyfill\fP\&. +.TP +.B \fBre2c:yyfill:check\fP +If set to zero, suppresses the generation of pre\-\fBYYFILL\fP check for the +number of input characters (the \fBYYLESSTHAN\fP definition in generic API and +the \fBYYLIMIT\fP\-based comparison in C pointer API). The default is non\-zero +(generate the check). +.TP +.B \fBre2c:yyfill:enable\fP +If set to zero, suppresses the generation of \fBYYFILL\fP (together +with the check). This should be used when the whole input fits into one piece +of memory (there is no need for buffering) and the end\-of\-input checks do not +rely on the \fBYYFILL\fP checks (e.g. if a sentinel character is used). +Use warnings (\fB\-W\fP option) and \fBre2c:sentinel\fP configuration to verify +that the generated lexer cannot read past the end of input. +The default is non\-zero (\fBYYFILL\fP is enabled). +.TP +.B \fBre2c:yyfill:parameter\fP +If set to zero, suppresses the generation of parameter passed to \fBYYFILL\fP\&. +The parameter is the minimum number of characters that must be supplied. +Defaults to non\-zero (the parameter is generated). +This configuration can be overridden with \fBre2c:YYFILL:naked\fP or +\fBre2c:api:style\fP\&. +.UNINDENT +.SS Program interface +.sp +The generated code interfaces with the outer program with the help of +\fIprimitives\fP, collectively referred to as the \fIAPI\fP\&. +Which primitives should be defined for a particular program depends on multiple +factors, including the complexity of regular expressions, input representation, +buffering and the use of various features. 
All the necessary primitives should +be defined by the user in the form of macros, functions, variables or any other +suitable form that makes the generated code syntactically and semantically +correct. re2js does not (and cannot) check the definitions, so if anything is +missing or defined incorrectly, the generated program may have compile\-time or +run\-time errors. +This manual provides examples of API definitions in the most common cases. +.sp +re2js has three API flavors that define the core set of primitives used by a +program: +.INDENT 0.0 +.TP +.B \fBSimple API\fP +This is the default API for the JavaScript backend. It consists of the +following primitives: \fBYYINPUT\fP (which should be defined as a sequence of +code units, e.g. a string) and \fBYYCURSOR\fP, \fBYYMARKER\fP, \fBYYCTXMARKER\fP, +\fBYYLIMIT\fP (which should be defined as indices in \fBYYINPUT\fP). +.nf + +.fi +.sp +.TP +.B \fBRecord API\fP +Record API is useful in cases when lexer state must be stored in an object. +It is enabled with \fB\-\-api record\fP option or \fBre2c:api = record\fP +configuration. This API consists of a variable \fByyrecord\fP (the +name can be overridden with \fBre2c:yyrecord\fP) that should be defined as an +object with properties \fByyinput\fP, \fByycursor\fP, \fByymarker\fP, +\fByyctxmarker\fP, \fByylimit\fP (only the fields used by the generated code +need to be defined, and their names can be configured). +.nf + +.fi +.sp +.TP +.B \fBGeneric API\fP +This is the most flexible API. It is enabled with \fB\-\-api generic\fP option +or \fBre2c:api = generic\fP configuration. +It contains primitives for generic operations: +\fBYYPEEK\fP, +\fBYYSKIP\fP, +\fBYYBACKUP\fP, +\fBYYBACKUPCTX\fP, +\fBYYSTAGP\fP, +\fBYYSTAGN\fP, +\fBYYMTAGP\fP, +\fBYYMTAGN\fP, +\fBYYRESTORE\fP, +\fBYYRESTORECTX\fP, +\fBYYRESTORETAG\fP, +\fBYYSHIFT\fP, +\fBYYSHIFTSTAG\fP, +\fBYYSHIFTMTAG\fP, +\fBYYLESSTHAN\fP\&. 
+.UNINDENT +.sp +Here is a full list of API primitives that may be used by the generated code in +order to interface with the outer program. +.INDENT 0.0 +.TP +.B \fBYYCTYPE\fP +The type of the input characters (code units). +For ASCII, EBCDIC and UTF\-8 encodings it should be 1\-byte unsigned integer. +For UTF\-16 or UCS\-2 it should be 2\-byte unsigned integer. For UTF\-32 it +should be 4\-byte unsigned integer. +.TP +.B \fBYYCURSOR\fP +An l\-value that stores the current input position (a pointer or an integer +offset in \fBYYINPUT\fP). Initially \fBYYCURSOR\fP should point to the first +input character, and later it is advanced by the generated code. When a rule +matches, \fBYYCURSOR\fP position is the one after the last matched character. +.TP +.B \fBYYLIMIT\fP +An r\-value that stores the end of input position (a pointer or an integer +offset in \fBYYINPUT\fP). Initially \fBYYLIMIT\fP should point to the position +after the last available input character. It is not changed by the +generated code. The lexer compares \fBYYCURSOR\fP to \fBYYLIMIT\fP +in order to determine if there are enough input characters left. +.TP +.B \fBYYMARKER\fP +An l\-value that stores the position of the latest matched rule (a pointer or +an integer offset in \fBYYINPUT\fP). It is used to restore the \fBYYCURSOR\fP +position if the longer match fails and the lexer needs to rollback. +Initialization is not needed. +.TP +.B \fBYYCTXMARKER\fP +An l\-value that stores the position of the trailing context (a pointer or an +integer offset in \fBYYINPUT\fP). No initialization is needed. \fBYYCTXMARKER\fP +is needed only if the lookahead operator \fB/\fP is used. +.TP +.B \fBYYFILL\fP +A generic API primitive with one variable \fBlen\fP\&. +\fBYYFILL\fP should provide at least \fBlen\fP more input characters or fail. +If \fBre2c:eof\fP is used, then \fBlen\fP is always \fB1\fP and \fBYYFILL\fP should +always return to the calling function; zero return value indicates success. 
+If \fBre2c:eof\fP is not used, then \fBYYFILL\fP return value is ignored and it +should not return on failure. The maximum value of \fBlen\fP is \fBYYMAXFILL\fP\&. +.TP +.B \fBYYFN\fP +A primitive that defines function prototype in \fB\-\-recursive\-functions\fP +code model. Its value should be an array of one or more strings, where each +string contains two or three components separated by the string specified in +\fBre2c:fn:sep\fP configuration (typically a semicolon). The first array +element defines function name and return type (empty for a void function). +Subsequent elements define function arguments: first, the expression for the +argument used in function body (usually just a name); second, argument type; +third, an optional formal parameter (it defaults to the first component \- +usually both the argument and the parameter are the same identifier). +.TP +.B \fBYYINPUT\fP +An r\-value that stores the current input character sequence (string, buffer, +etc.). +.TP +.B \fBYYMAXFILL\fP +An integral constant equal to the maximum value of the argument to +\fBYYFILL\fP\&. It can be generated with a \fBmax\fP block. +.TP +.B \fBYYLESSTHAN\fP +A generic API primitive with one variable \fBlen\fP\&. +It should be defined as an r\-value of boolean type that equals \fBtrue\fP if +and only if there are less than \fBlen\fP input characters left. +.TP +.B \fBYYPEEK\fP +A generic API primitive with no variables. +It should be defined as an r\-value of type \fBYYCTYPE\fP that is equal to the +character at the current input position. +.TP +.B \fBYYSKIP\fP +A generic API primitive that should advance the current input position by +one code unit. +.TP +.B \fBYYBACKUP\fP +A generic API primitive that should save the current input position (to be +restored with \fBYYRESTORE\fP later). +.TP +.B \fBYYRESTORE\fP +A generic API primitive that should restore the current input position to +the value saved by \fBYYBACKUP\fP\&. 
+.TP +.B \fBYYBACKUPCTX\fP +A generic API primitive that should save the current input position as the +position of the trailing context (to be restored with \fBYYRESTORECTX\fP +later). +.TP +.B \fBYYRESTORECTX\fP +A generic API primitive that should restore the trailing context position +saved with \fBYYBACKUPCTX\fP\&. +.TP +.B \fBYYRESTORETAG\fP +A generic API primitive with one variable \fBtag\fP that should restore the +trailing context position to the value of \fBtag\fP\&. +.TP +.B \fBYYSTAGP\fP +A generic API primitive with one variable \fBtag\fP, where \fBtag\fP can be a +pointer or an offset in \fBYYINPUT\fP (see submatch extraction section for +details). \fBYYSTAGP\fP should set \fBtag\fP to the current input position. +.TP +.B \fBYYSTAGN\fP +A generic API primitive with one variable \fBtag\fP, where \fBtag\fP can be a +pointer or an offset in \fBYYINPUT\fP (see submatch extraction section for +details). \fBYYSTAGN\fP should set \fBtag\fP to a value that represents +non\-existent input position. +.TP +.B \fBYYMTAGP\fP +A generic API primitive with one variable \fBtag\fP\&. +\fBYYMTAGP\fP should append the current position to the submatch history of +\fBtag\fP (see the submatch extraction section for details). +.TP +.B \fBYYMTAGN\fP +A generic API primitive with one variable \fBtag\fP\&. +\fBYYMTAGN\fP should append a value that represents non\-existent input +position to the submatch history of \fBtag\fP (see the submatch +extraction section for details). +.TP +.B \fBYYSHIFT\fP +A generic API primitive with one variable \fBshift\fP that should shift the +current input position by \fBshift\fP characters (the shift value may be +negative). +.TP +.B \fBYYCOPYSTAG\fP +A generic API primitive with two variables, \fBlhs\fP and \fBrhs\fP that should +copy right\-hand\-side s\-tag variable \fBrhs\fP to the left\-hand\-side s\-tag +variable \fBlhs\fP\&. For most languages this primitive has a default definition +that assigns \fBrhs\fP to \fBlhs\fP\&.
+.TP +.B \fBYYCOPYMTAG\fP +A generic API primitive with two variables, \fBlhs\fP and \fBrhs\fP that should +copy right\-hand\-side m\-tag variable \fBrhs\fP to the left\-hand\-side m\-tag +variable \fBlhs\fP\&. For most languages this primitive has a default definition +that assigns \fBrhs\fP to \fBlhs\fP\&. +.TP +.B \fBYYSHIFTSTAG\fP +A generic API primitive with two variables, \fBtag\fP and \fBshift\fP that +should shift \fBtag\fP by \fBshift\fP code units (the shift value may be +negative). +.TP +.B \fBYYSHIFTMTAG\fP +A generic API primitive with two variables, \fBtag\fP and \fBshift\fP that +should shift the latest value in the history of \fBtag\fP by \fBshift\fP code +units (the shift value may be negative). +.TP +.B \fBYYMAXNMATCH\fP +An integral constant equal to the maximal number of POSIX capturing groups +in a rule. It is generated with a \fBmaxnmatch\fP block. +.TP +.B \fBYYCONDTYPE\fP +The type of the condition enum. +It can be generated either with \fBconditions\fP block or \fB\-\-header\fP option. +.TP +.B \fBYYGETACCEPT\fP +A primitive with one variable \fBvar\fP that stores numeric selector of the +accepted rule. For most languages this primitive has a default definition +that reads from \fBvar\fP\&. +.TP +.B \fBYYSETACCEPT\fP +A primitive with two variables: \fBvar\fP (an l\-value that stores numeric +selector of the accepted rule), and \fBval\fP (the value of selector). For +most languages this primitive has a default definition that assigns \fBval\fP +to \fBvar\fP\&. +.TP +.B \fBYYGETCOND\fP +An r\-value of type \fBYYCONDTYPE\fP that is equal to the current condition +identifier. +.TP +.B \fBYYSETCOND\fP +A primitive with one variable \fBcond\fP that should set the current +condition identifier to \fBcond\fP\&. +.TP +.B \fBYYGETSTATE\fP +An r\-value of integer type that is equal to the current lexer state. It +should be initialized to \fB\-1\fP\&.
+.TP +.B \fBYYSETSTATE\fP +A primitive with one variable \fBstate\fP that should set the current lexer +state to \fBstate\fP\&. +.TP +.B \fBYYDEBUG\fP +This primitive is generated only with \fB\-d\fP, \fB\-\-debug\-output\fP option. +Its purpose is to add logging to the generated code (typical \fBYYDEBUG\fP +definition is a print statement). \fBYYDEBUG\fP statements are generated in +every state and have two variables: \fBstate\fP (either a DFA state index or +\fB\-1\fP) and \fBsymbol\fP (the current input symbol). +.TP +.B \fByyaccept\fP +An l\-value of unsigned integral type that stores the number of the latest +matched rule. User definition is necessary only with \fB\-\-storable\-state\fP +option. +.TP +.B \fByybm\fP +A table containing compressed bitmaps for up to 8 transitions (used with +the \fB\-\-bit\-vectors\fP option). The table contains 256 elements and is indexed by +1\-byte code units. Each 8\-bit element combines boolean values for up to 8 +transitions. The k\-th bit of the n\-th element is true iff the n\-th code unit is in the +range of the k\-th transition. The idea of this bitmap is to replace many \fIif\fP +branches or \fIswitch\fP cases with one check of a single bit in the table. +.TP +.B \fByych\fP +An l\-value of type \fBYYCTYPE\fP that stores the current input character. +User definition is necessary only with \fB\-f\fP \fB\-\-storable\-state\fP option. +.TP +.B \fByyctable\fP +Jump table generated for the initial condition dispatch (enabled with the +combination of \fB\-\-conditions\fP and \fB\-\-computed\-gotos\fP options). +.TP +.B \fByyfill\fP +An l\-value that stores the result of \fBYYFILL\fP call (this may be necessary +for pure functional languages, where \fBYYFILL\fP is a monadic function with +complex return value). +.TP +.B \fByynmatch\fP +An l\-value of unsigned integral type that stores the number of POSIX +capturing groups in the matched rule. +Used only with \fB\-P\fP \fB\-\-posix\-captures\fP option.
+.TP +.B \fByypmatch\fP +An array of l\-values that are used to hold the tag values corresponding +to the capturing parentheses in the matching rule. Array length must be +at least \fByynmatch * 2\fP (usually \fBYYMAXNMATCH * 2\fP is a good choice). +Used only with \fB\-P\fP \fB\-\-posix\-captures\fP option. +.TP +.B \fByystable\fP +Deprecated. +.TP +.B \fByystate\fP +An l\-value used with the \fB\-\-loop\-switch\fP option to store the current DFA +state. +.TP +.B \fByytarget\fP +Jump table that contains jump targets (label addresses) for all transitions +from a state. This table is local to each state. Generation of \fByytarget\fP +tables is enabled with \fB\-\-computed\-gotos\fP option. +.UNINDENT +.SS Options +.sp +Some of the options have corresponding \fI\%configurations\fP, +others are global and cannot be changed after re2c starts reading the input file. +Debug options generally require building re2c in debug configuration. +Internal options are useful for experimenting with the algorithms used in re2c. +.INDENT 0.0 +.TP +.B \fB\-? \-\-help \-h\fP +Show help message. +.TP +.B \fB\-\-api <simple | record | generic>\fP +Specify the API used by the generated code to interface with user\-defined +code. Option \fBsimple\fP should be used in simple cases when there\(aqs no need +for buffer refilling and storing lexer state. Option \fBrecord\fP should be +used when lexer state needs to be stored in a record (struct, class, etc.). +Option \fBgeneric\fP should be used in complex cases when the other two APIs +are not flexible enough. +.TP +.B \fB\-\-bit\-vectors \-b\fP +Optimize conditional jumps using bit masks. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-captures\fP, \fB\-\-leftmost\-captures\fP +Enable submatch extraction with leftmost greedy capturing groups. The result +is collected into an array \fByypmatch\fP of capacity \fB2 * YYMAXNMATCH\fP, and +\fByynmatch\fP is set to the number of groups for the matching rule.
+.TP +.B \fB\-\-captvars\fP, \fB\-\-leftmost\-captvars\fP +Enable submatch extraction with leftmost greedy capturing groups. The result +is collected into variables \fByytl<k>\fP, \fByytr<k>\fP for \fBk\fP\-th capturing +group. +.TP +.B \fB\-\-case\-insensitive\fP +Treat single\-quoted and double\-quoted strings as case\-insensitive. +.TP +.B \fB\-\-case\-inverted\fP +Invert the meaning of single\-quoted and double\-quoted strings: +treat single\-quoted strings as case\-sensitive and double\-quoted strings +as case\-insensitive. +.TP +.B \fB\-\-case\-ranges\fP +Collapse consecutive cases in a switch statement into a range of the form +\fBlow ... high\fP\&. This syntax is a C/C++ language extension that is +supported by compilers like GCC, Clang and Tcc. The main advantage over +using single cases is smaller generated code and faster generation time, +although for some compilers like Tcc it also results in smaller binary size. +This option is supported only for C. +.TP +.B \fB\-\-computed\-gotos \-g\fP +Optimize conditional jumps using non\-standard \(dqcomputed goto\(dq extension +(which must be supported by the compiler). re2js generates jump tables +only in complex cases with a lot of conditional branches. Complexity +threshold can be configured with \fBcgoto:threshold\fP configuration. This +option implies \fB\-\-bit\-vectors\fP\&. It is supported only for C. +.TP +.B \fB\-\-conditions \-\-start\-conditions \-c\fP +Enable support of Flex\-like \(dqconditions\(dq: multiple interrelated lexers +within one block. This is an alternative to manually specifying different +re2js blocks connected with \fBgoto\fP or function calls. +.TP +.B \fB\-\-depfile FILE\fP +Write dependency information to \fBFILE\fP in the form of a Makefile rule +\fB<output\-file> : <input\-file> [include\-file ...]\fP\&. This allows one to +track build dependencies in the presence of \fBinclude\fP blocks/directives, +so that updating include files triggers regeneration of the output file.
+This option depends on the \fB\-\-output\fP option. +.TP +.B \fB\-\-ebcdic \-\-ecb \-e\fP +Generate a lexer that reads input in EBCDIC encoding. re2js assumes that +the character range is 0 \-\- 0xFF and character size is 1 byte. +.TP +.B \fB\-\-empty\-class <match\-empty | match\-none | error>\fP +Define the way re2js treats empty character classes. With \fBmatch\-empty\fP +(the default) empty class matches empty input (which is illogical, but +backwards\-compatible). With \fBmatch\-none\fP empty class always fails to match. +With \fBerror\fP empty class raises a compilation error. +.TP +.B \fB\-\-encoding\-policy <fail | substitute | ignore>\fP +Define the way re2js treats Unicode surrogates. +With \fBfail\fP re2js aborts with an error when a surrogate is encountered. +With \fBsubstitute\fP re2js silently replaces surrogates with the error code +point 0xFFFD. With \fBignore\fP (the default) re2js treats surrogates as +normal code points. The Unicode standard says that standalone surrogates +are invalid, but real\-world libraries and programs behave in different ways. +.TP +.B \fB\-\-flex\-syntax \-F\fP +Partial support for Flex syntax: in this mode named definitions don\(aqt need +the equal sign and the terminating semicolon, and when used they must be +surrounded with curly braces. Names without curly braces are treated as +double\-quoted strings. +.TP +.B \fB\-\-goto\-label\fP +Use \(dqgoto/label\(dq code model: encode DFA in form of labeled code blocks +connected with \fBgoto\fP transitions across blocks. This is only supported +for languages that have a \fBgoto\fP statement. +.TP +.B \fB\-\-header \-\-type\-header \-t HEADER\fP +Generate a \fBHEADER\fP file. The contents of the file can be specified using +special blocks \fBheader:on\fP and \fBheader:off\fP\&. If conditions are used, the +generated header will have a condition enum automatically appended to it +(unless there is an explicit \fBconditions\fP block).
+.TP +.B \fB\-I PATH\fP +Add \fBPATH\fP to the list of locations which are used when searching for +include files. This option is useful in combination with \fBinclude\fP block +or directive. re2js looks for \fBFILE\fP in the directory of the parent file +and in the include locations specified with \fB\-I\fP option. +.TP +.B \fB\-\-input <default | custom>\fP +Deprecated alias for \fB\-\-api\fP\&. Option \fBdefault\fP corresponds to \fBsimple\fP +(it is indeed the default for most backends, but not for all). Option +\fBcustom\fP corresponds to \fBgeneric\fP\&. +.TP +.B \fB\-\-input\-encoding <ascii | utf8>\fP +Specify the way re2js parses regular expressions. +With \fBascii\fP (the default) re2js handles input as ASCII\-encoded: any +sequence of code units is a sequence of standalone 1\-byte characters. +With \fButf8\fP re2js handles input as UTF8\-encoded and recognizes multibyte +characters. +.TP +.B \fB\-\-invert\-captures\fP +Invert the meaning of capturing and non\-capturing groups. By default +\fB(...)\fP is capturing and \fB(! ...)\fP is non\-capturing. With this option +\fB(! ...)\fP is capturing and \fB(...)\fP is non\-capturing. +.TP +.B \fB\-\-lang <none | c | d | go | haskell | java | js | ocaml | python | rust | v | zig>\fP +Specify the target language. Supported languages are C, D, Go, Haskell, +Java, JS, OCaml, Python, Rust, V, Zig (more languages can be added via +user\-defined syntax files, see the \fB\-\-syntax\fP option). Option \fBnone\fP +disables default syntax configs, so that the target language is undefined. +.TP +.B \fB\-\-location\-format <gnu | msvc>\fP +Specify location format in messages. +With \fBgnu\fP locations are printed as \(aqfilename:line:column: ...\(aq. +With \fBmsvc\fP locations are printed as \(aqfilename(line,column) ...\(aq. +The default is \fBgnu\fP\&. +.TP +.B \fB\-\-loop\-switch\fP +Use \(dqloop/switch\(dq code model: encode DFA in form of a loop over a switch +statement, where individual states are switch cases. State is stored in a +variable \fByystate\fP\&.
Transitions between states update \fByystate\fP to the +case label of the destination state and continue execution to the head of +the loop. +.TP +.B \fB\-\-nested\-ifs \-s\fP +Use nested \fBif\fP statements instead of \fBswitch\fP statements in conditional +jumps. This usually results in more efficient code with non\-optimizing +compilers. +.TP +.B \fB\-\-no\-debug\-info \-i\fP +Do not output line directives. This may be useful when the generated code is +stored in a version control system (to avoid huge autogenerated diffs on +small changes). +.TP +.B \fB\-\-no\-generation\-date\fP +Suppress date output in the generated file. +.TP +.B \fB\-\-no\-version\fP +Suppress version output in the generated file. +.TP +.B \fB\-\-no\-unsafe\fP +Do not generate \fBunsafe\fP wrapper over \fBYYPEEK\fP (this option is specific +to Rust). For performance reasons \fBYYPEEK\fP should avoid bounds\-checking, +as the lexer already performs end\-of\-input checks in a more efficient way. +The user may choose to provide a safe \fBYYPEEK\fP definition, or a definition +that is unsafe only in release builds, in which case the \fB\-\-no\-unsafe\fP +option helps to avoid warnings about redundant \fBunsafe\fP blocks. +.TP +.B \fB\-\-output \-o OUTPUT\fP +Specify the \fBOUTPUT\fP file. +.TP +.B \fB\-\-posix\-captures\fP, \fB\-P\fP +Enable submatch extraction with POSIX\-style capturing groups. The result +is collected into an array \fByypmatch\fP of capacity \fB2 * YYMAXNMATCH\fP, and +\fByynmatch\fP is set to the number of groups for the matching rule. +.TP +.B \fB\-\-posix\-captvars\fP +Enable submatch extraction with POSIX\-style capturing groups. The result +is collected into variables \fByytl<k>\fP, \fByytr<k>\fP for \fBk\fP\-th capturing +group. +.TP +.B \fB\-\-recursive\-functions\fP +Use code model based on co\-recursive functions, where each DFA state is a +separate function that may call other state\-functions or itself.
+.TP +.B \fB\-\-reusable \-r\fP +Deprecated since version 2.2 (reusable blocks are allowed by default now). +.TP +.B \fB\-\-skeleton \-S\fP +Ignore user\-defined interface code and generate a self\-contained \(dqskeleton\(dq +program. Additionally, generate input files with strings derived from the +regular grammar and compressed match results that are used to verify +\(dqskeleton\(dq behavior on all inputs. This option is useful for finding bugs +in optimizations and code generation. This option is supported only for C. +.TP +.B \fB\-\-storable\-state \-f\fP +Generate a lexer which can store its inner state. +This is useful in push\-model lexers which are stopped by an outer program +when there is not enough input, and then resumed when more input becomes +available. In this mode users should additionally define \fBYYGETSTATE\fP +and \fBYYSETSTATE\fP primitives, and variables \fByych\fP, \fByyaccept\fP and +\fBstate\fP should be part of the stored lexer state. +.TP +.B \fB\-\-syntax FILE\fP +Load configurations from the specified \fBFILE\fP and apply them on top of the +default syntax file. Note that \fBFILE\fP can define only a few configurations +(if it\(aqs used to amend the default syntax file), or it can define a whole +new language backend (in the latter case it is recommended to use +\fB\-\-lang none\fP option). +.TP +.B \fB\-\-tags \-T\fP +Enable submatch extraction with tags. +.TP +.B \fB\-\-ucs2 \-\-wide\-chars \-w\fP +Generate a lexer that reads UCS2\-encoded input. re2js assumes that the +character range is 0 \-\- 0xFFFF and character size is 2 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-utf8 \-\-utf\-8 \-8\fP +Generate a lexer that reads input in UTF\-8 encoding. re2js assumes that the +character range is 0 \-\- 0x10FFFF and character size is 1 byte. +.TP +.B \fB\-\-utf16 \-\-utf\-16 \-x\fP +Generate a lexer that reads UTF16\-encoded input. 
re2js assumes that the +character range is 0 \-\- 0x10FFFF and character size is 2 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-utf32 \-\-unicode \-u\fP +Generate a lexer that reads UTF32\-encoded input. re2js assumes that the +character range is 0 \-\- 0x10FFFF and character size is 4 bytes. +This option implies \fB\-\-nested\-ifs\fP\&. +.TP +.B \fB\-\-verbose\fP +Output a short message in case of success. +.TP +.B \fB\-\-vernum \-V\fP +Show version information in \fBMMmmpp\fP format (major, minor, patch). +.TP +.B \fB\-\-version \-v\fP +Show version information. +.TP +.B \fB\-\-single\-pass \-1\fP +Deprecated. Does nothing (single pass is the default now). +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-\-debug\-output \-d\fP +Emit \fBYYDEBUG\fP invocations in the generated code. This is useful to trace +lexer execution. +.TP +.B \fB\-\-dump\-adfa\fP +Debug option: output DFA after tunneling (in .dot format). +.TP +.B \fB\-\-dump\-cfg\fP +Debug option: output control flow graph of tag variables (in .dot format). +.TP +.B \fB\-\-dump\-closure\-stats\fP +Debug option: output statistics on the number of states in closure. +.TP +.B \fB\-\-dump\-dfa\-det\fP +Debug option: output DFA immediately after determinization (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-min\fP +Debug option: output DFA after minimization (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-tagopt\fP +Debug option: output DFA after tag optimizations (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-tree\fP +Debug option: output DFA under construction with states represented as tag +history trees (in .dot format). +.TP +.B \fB\-\-dump\-dfa\-raw\fP +Debug option: output DFA under construction with expanded state\-sets +(in .dot format). +.TP +.B \fB\-\-dump\-interf\fP +Debug option: output interference table produced by liveness analysis of tag +variables. +.TP +.B \fB\-\-dump\-nfa\fP +Debug option: output NFA (in .dot format). 
+.TP +.B \fB\-\-emit\-dot \-D\fP +Instead of normal output generate lexer graph in .dot format. +The output can be converted to an image with the help of Graphviz +(e.g. something like \fBdot \-Tpng \-odfa.png dfa.dot\fP). +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-\-dfa\-minimization \fP +Internal option: DFA minimization algorithm used by re2js\&. The \fBmoore\fP +option is the Moore algorithm (it is the default). The \fBtable\fP option is +the \(dqtable filling\(dq algorithm. Both algorithms should produce the same DFA +up to states relabeling; table filling is simpler and much slower and serves +as a reference implementation. +.TP +.B \fB\-\-eager\-skip\fP +Internal option: make the generated lexer advance the input position +eagerly \-\- immediately after reading the input symbol. This changes the +default behavior when the input position is advanced lazily \-\- after +transition to the next state. +.TP +.B \fB\-\-no\-lookahead\fP +Internal option, deprecated. +It used to enable TDFA(0) algorithm. Unlike TDFA(1), TDFA(0) algorithm does +not use one\-symbol lookahead. It applies register operations to the incoming +transitions rather than the outgoing ones. Benchmarks showed that TDFA(0) +algorithm is less efficient than TDFA(1). +.TP +.B \fB\-\-no\-optimize\-tags\fP +Internal option: suppress optimization of tag variables (useful for +debugging). +.TP +.B \fB\-\-posix\-closure \fP +Internal option: specify shortest\-path algorithm used for the construction of +epsilon\-closure with POSIX disambiguation semantics: \fBgor1\fP (the default) +stands for Goldberg\-Radzik algorithm, and \fBgtop\fP stands for \(dqglobal +topological order\(dq algorithm. +.TP +.B \fB\-\-posix\-prectable \fP +Internal option: specify the algorithm used to compute POSIX precedence +table. The \fBcomplex\fP algorithm computes precedence table in one traversal +of tag history tree and has quadratic complexity in the number of TNFA +states; it is the default. 
The \fBnaive\fP algorithm has worst\-case cubic +complexity in the number of TNFA states, but it is much simpler than +\fBcomplex\fP and may be slightly faster in non\-pathological cases. +.TP +.B \fB\-\-stadfa\fP +Internal option, deprecated. +It used to enable staDFA algorithm, which differs from TDFA in that register +operations are placed in states rather than on transitions. Benchmarks +showed that staDFA algorithm is less efficient than TDFA. +.TP +.B \fB\-\-fixed\-tags <none | toplevel | all>\fP +Internal option: +specify whether the fixed\-tag optimization should be applied to all tags +(\fBall\fP), none of them (\fBnone\fP), or only those in toplevel concatenation +(\fBtoplevel\fP). The default is \fBall\fP\&. +\(dqFixed\(dq tags are those that are located within a fixed distance to some +other tag (called \(dqbase\(dq). In such cases only the base tag needs to be +tracked, and the value of the fixed tag can be computed as the value of the +base tag plus a static offset. For tags that are under alternative or +repetition it is also necessary to check if the base tag has a no\-match +value (in that case fixed tag should also be set to no\-match, disregarding +the offset). For tags in top\-level concatenation the check is not needed, +because they always match. +.UNINDENT +.SS Warnings +.sp +Warnings can be individually enabled, disabled and turned into an error. +.INDENT 0.0 +.TP +.B \fB\-W\fP +Turn on all warnings. +.TP +.B \fB\-Werror\fP +Turn warnings into errors. Note that this option alone +doesn\(aqt turn on any warnings; it only affects those warnings that have +been turned on so far or will be turned on later. +.TP +.B \fB\-W<warning>\fP +Turn on \fBwarning\fP\&. +.TP +.B \fB\-Wno\-<warning>\fP +Turn off \fBwarning\fP\&. +.TP +.B \fB\-Werror\-<warning>\fP +Turn on \fBwarning\fP and treat it as an error (this implies \fB\-W<warning>\fP). +.TP +.B \fB\-Wno\-error\-<warning>\fP +Don\(aqt treat this particular \fBwarning\fP as an error. This doesn\(aqt turn off +the warning itself. 
+.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-Wcondition\-order\fP +Warn if the generated program makes implicit assumptions about condition +numbering. One should use either \fB\-\-header\fP option or \fBconditions\fP +block to generate a mapping of condition names to numbers and then use the +autogenerated condition names. +.TP +.B \fB\-Wempty\-character\-class\fP +Warn if a regular expression contains an empty character class. Trying to +match an empty character class makes no sense: it should always fail. +However, for backwards compatibility reasons re2js permits empty character +classes and treats them as empty strings. Use the \fB\-\-empty\-class\fP option +to change the default behavior. +.TP +.B \fB\-Wmatch\-empty\-string\fP +Warn if a rule is nullable (matches an empty string). +If the lexer runs in a loop and the empty match is unintentional, the lexer +may unexpectedly hang in an infinite loop. +.TP +.B \fB\-Wswapped\-range\fP +Warn if the lower bound of a range is greater than its upper bound. The +default behavior is to silently swap the range bounds. +.TP +.B \fB\-Wundefined\-control\-flow\fP +Warn if some input strings cause undefined control flow in the lexer (the +faulty patterns are reported). This is a dangerous and common mistake. It +can be easily fixed by adding the default rule \fB*\fP which has the lowest +priority, matches any code unit, and always consumes a single code unit. +.TP +.B \fB\-Wunreachable\-rules\fP +Warn about rules that are shadowed by other rules and will never match. +.TP +.B \fB\-Wuseless\-escape\fP +Warn if a symbol is escaped when it shouldn\(aqt be. +By default, re2js silently ignores such escapes, but this may as well +indicate a typo or an error in the escape sequence. +.TP +.B \fB\-Wnondeterministic\-tags\fP +Warn if a tag has \fBn\fP\-th degree of nondeterminism, where \fBn\fP is greater +than 1. 
+.TP +.B \fB\-Wsentinel\-in\-midrule\fP +Warn if the sentinel symbol occurs in the middle of a rule \-\-\- this may +cause reads past the end of buffer, crashes or memory corruption in the +generated lexer. This warning is only applicable if the sentinel method of +checking for the end of input is used. +It is set to an error if \fBre2c:sentinel\fP configuration is used. +.TP +.B \fB\-Wundefined\-syntax\-config\fP +Warn if the syntax file specified with \fB\-\-syntax\fP option is missing +definitions of some configurations. This helps to maintain user\-defined +syntax files: if a new release adds configurations, old syntax file will +raise a warning, and the user will be notified. If some configurations are +unused and do not need a definition, they should be explicitly set to +\fB<undefined>\fP\&. +.UNINDENT +.SS Syntax files +.sp +Support for different languages in re2c is based on the idea of \fIsyntax files\fP\&. +A syntax file is a configuration file that defines syntax of the target language +\-\- not the whole language, but a small part of it that is used by the generated +code. Syntax files make re2c very flexible, but they should not be used as a +replacement for \fBre2c:\fP configurations: their purpose is to define syntax of +the target language, not to customize one particular lexer. All supported +languages have default syntax files that are part of the distribution (see +\fBinclude/syntax\fP subdirectory); they are also embedded in the re2js binary. +Users may provide a custom syntax file that overrides a few configurations for +one of supported languages, or they may choose to redefine all configurations +(in that case \fB\-\-lang none\fP option should be used). +Syntax files contain configurations of four different kinds: feature lists, +language configurations, inplace configurations and code templates. 
+.sp +\fBFeature lists\fP +.INDENT 0.0 +.INDENT 3.5 +A few list configurations define various features supported by a given +backend, so that re2js may give a clear error if the user tries to enable an +unsupported feature: +.INDENT 0.0 +.TP +.B \fBsupported_apis\fP +A list of supported APIs with possible elements \fBsimple\fP, \fBrecord\fP, +\fBgeneric\fP\&. +.TP +.B \fBsupported_api_styles\fP +A list of supported API styles with possible elements \fBfunctions\fP, +\fBfree\-form\fP\&. +.TP +.B \fBsupported_code_models\fP +A list of supported code models with possible elements \fBgoto\-label\fP, +\fBloop\-switch\fP, \fBrecursive\-functions\fP\&. +.TP +.B \fBsupported_targets\fP +A list of supported codegen targets with possible elements \fBcode\fP, +\fBdot\fP, \fBskeleton\fP\&. +.TP +.B \fBsupported_features\fP +A list of supported features with possible elements \fBnested\-ifs\fP, +\fBbitmaps\fP, \fBcomputed\-gotos\fP, \fBcase\-ranges\fP, \fBmonadic\fP, \fBunsafe\fP, +\fBtags\fP, \fBcaptures\fP, \fBcaptvars\fP\&. +.UNINDENT +.UNINDENT +.UNINDENT +.sp +\fBLanguage configurations\fP +.INDENT 0.0 +.INDENT 3.5 +A few boolean configurations describe features of the target language that +affect re2js parser and code generator: +.INDENT 0.0 +.TP +.B \fBsemicolons\fP +Non\-zero if the language uses semicolons after statements. +.TP +.B \fBbacktick_quoted_strings\fP +Non\-zero if the language has backtick\-quoted strings. +.TP +.B \fBsingle_quoted_strings\fP +Non\-zero if the language has single\-quoted strings. +.TP +.B \fBindentation_sensitive\fP +Non\-zero if the language is indentation sensitive. +.TP +.B \fBwrap_blocks_in_braces\fP +Non\-zero if compound statements must be wrapped in curly braces. +.UNINDENT +.UNINDENT +.UNINDENT +.sp +\fBInplace configurations\fP +.INDENT 0.0 +.INDENT 3.5 +Syntax files define initial values of all \fBre2c:\fP configurations, as they +may differ for different languages. 
See configurations section for a full list +of all inplace configurations and their meaning. +.UNINDENT +.UNINDENT +.sp +\fBCode templates\fP +.INDENT 0.0 +.INDENT 3.5 +Code templates define syntax of the target language. They are written in a +simple domain\-specific language with the following formal grammar: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +code\-template :: + name \(aq=\(aq code\-exprs \(aq;\(aq + | CODE_TEMPLATE \(aq;\(aq + | \(aq<undefined>\(aq \(aq;\(aq + +code\-exprs :: + + | code\-exprs code\-expr + +code\-expr :: + STRING + | VARIABLE + | optional + | list + +optional :: + \(aq(\(aq CONDITIONAL \(aq?\(aq code\-exprs \(aq)\(aq + | \(aq(\(aq CONDITIONAL \(aq?\(aq code\-exprs \(aq:\(aq code\-exprs \(aq)\(aq + +list :: + \(aq[\(aq VARIABLE \(aq:\(aq code\-exprs \(aq]\(aq + | \(aq[\(aq VARIABLE \(aq{\(aq NUMBER \(aq}\(aq \(aq:\(aq code\-exprs \(aq]\(aq + | \(aq[\(aq VARIABLE \(aq{\(aq NUMBER \(aq,\(aq NUMBER \(aq}\(aq \(aq:\(aq code\-exprs \(aq]\(aq +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +A code template is a sequence of string literals, variables, optional elements +and lists, or a reference to another code template, or a special value +\fB<undefined>\fP\&. Variables are placeholders that are substituted during code +generation phase. List variables are special: when expanding list templates, +re2js repeats the expressions on the right hand side of the colon a few times, each +time replacing occurrences of the list variable with a value specific to this +repetition. Lists have optional bounds (negative values are counted from the +end, e.g. \fB\-1\fP means the last element). Conditional names start with a dot. +Both conditionals and variables may be either local (specific to the given +code template) or global (allowed in all code templates). When re2js reads +a syntax file, it checks that each code template uses only the variables and +conditionals that are allowed in it. 
+.sp +For example, the following code template defines if\-then\-else construct for a +C\-like language: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +code:if_then_else = + [branch{0}: topindent \(dqif \(dq cond \(dq {\(dq nl + indent [stmt: stmt] dedent] + [branch{1:\-1}: topindent \(dq} else\(dq (.cond ? \(dq if \(dq cond) \(dq {\(dq nl + indent [stmt: stmt] dedent] + topindent \(dq}\(dq nl; +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here \fBbranch\fP is a list variable: \fBbranch{0}\fP expands to the first branch +(which is special, as there is no \fBelse\fP part), \fBbranch{1:\-1}\fP expands to +all remaining branches (if any). \fBstmt\fP is also a list variable: +\fB[stmt: stmt]\fP is a nested list that expands to a list of statements in the +body of the current branch. \fBtopindent\fP, \fBindent\fP, \fBdedent\fP and \fBnl\fP +are global variables, and \fB\&.cond\fP is a local conditional (their meaning is +described below). This code template could produce the following code: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +if x { + // do something +} else if y { + // do something else +} else { + // don\(aqt do anything +} +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here\(aqs a list of all code templates supported by re2js with their local +variables and conditionals. Note that a particular definition may, but does +not have to use local variables and conditionals. +Any unused code templates should be set to \fB<undefined>\fP\&. +.INDENT 0.0 +.TP +.B \fBcode:var_local\fP +Declaration or definition of a local variable. Supported variables: +\fBtype\fP (the type of the variable), \fBname\fP (its name) and \fBinit\fP +(initial value, if any). Conditionals: \fB\&.init\fP (true if there is an +initializer). +.TP +.B \fBcode:var_global\fP +Same as \fBcode:var_local\fP, except that it\(aqs used in top\-level. +.TP +.B \fBcode:const_local\fP +Definition of a local constant. Supported variables: \fBtype\fP (the type +of the constant), \fBname\fP (its name) and \fBinit\fP (initial value). 
+.TP +.B \fBcode:const_global\fP +Same as \fBcode:const_local\fP, except that it\(aqs used in top\-level. +.TP +.B \fBcode:array_local\fP +Definition of a local array (table). Supported variables: \fBtype\fP (the +type of array elements), \fBname\fP (array name), \fBsize\fP (its size), +\fBrow\fP (a list variable that does not itself produce any code, but +expands list expression as many times as there are rows in the table) +and \fBelem\fP (a list variable that expands to all table elements in the +current row \-\- it\(aqs meant to be nested in the \fBrow\fP list). +.TP +.B \fBcode:array_global\fP +Same as \fBcode:array_local\fP, except that it\(aqs used in top\-level. +.TP +.B \fBcode:array_elem\fP +Reference to an element of an array (table). Supported variables: +\fBarray\fP (the name of the array) and \fBindex\fP (index of the element). +.TP +.B \fBcode:enum\fP +Definition of an enumeration (it may be defined using a special language +construct for enumerations, or simply as a few standalone constants). +Supported variables are \fBtype\fP (user\-defined enumeration type or type +of the constants), \fBelem\fP (list variable that expands to the name of +each member) and \fBinit\fP (initializer for each member). Conditionals: +\fB\&.init\fP (true if there is an initializer). +.TP +.B \fBcode:enum_elem\fP +Enumeration element (a member of a user\-defined enumeration type or a +name of a constant, depending on how \fBcode:enum\fP is defined). +Supported variables are \fBname\fP (the name of the element) and \fBtype\fP +(its type). +.TP +.B \fBcode:assign\fP +Assignment statement. Supported variables are \fBlhs\fP (left hand side) +and \fBrhs\fP (right hand side). +.TP +.B \fBcode:type_int\fP +Signed integer type. +.TP +.B \fBcode:type_uint\fP +Unsigned integer type. +.TP +.B \fBcode:type_yybm\fP +Type of elements in the \fByybm\fP table. +.TP +.B \fBcode:type_yytarget\fP +Type of elements in the \fByytarget\fP table. 
+.TP +.B \fBcode:cmp_eq\fP +Operator \(dqequals\(dq. +.TP +.B \fBcode:cmp_ne\fP +Operator \(dqnot equals\(dq. +.TP +.B \fBcode:cmp_lt\fP +Operator \(dqless than\(dq. +.TP +.B \fBcode:cmp_gt\fP +Operator \(dqgreater than\(dq. +.TP +.B \fBcode:cmp_le\fP +Operator \(dqless or equal\(dq. +.TP +.B \fBcode:cmp_ge\fP +Operator \(dqgreater or equal\(dq. +.TP +.B \fBcode:if_then_else\fP +If\-then\-else statement with one or more branches. Supported variables: +\fBbranch\fP (a list variable that does not itself produce any code, but +expands list expression as many times as there are branches), \fBcond\fP +(condition of the current branch) and \fBstmt\fP (a list variable that +expands to all statements in the current branch). Conditionals: +\fB\&.cond\fP (true if the current branch has a condition), \fB\&.many\fP (true +if there\(aqs more than one branch). +.TP +.B \fBcode:if_then_else_oneline\fP +A specialization of \fBcode:if_then_else\fP for the case when all branches +have one\-line statements. If this is \fB<undefined>\fP, +\fBcode:if_then_else\fP is used instead. +.TP +.B \fBcode:switch\fP +A switch statement with one or more cases. Supported variables: \fBexpr\fP +(the switched\-on expression) and \fBcase\fP (a list variable that expands +to all cases\-groups with their code blocks). +.TP +.B \fBcode:switch_cases\fP +A group of switch cases that maps to a single code block. Supported +variables are \fBcase\fP (a list variable that expands to all cases in +this group) and \fBstmt\fP (a list variable that expands to all statements +in the code block). +.TP +.B \fBcode:switch_cases_oneline\fP +A specialization of \fBcode:switch_cases\fP for the case when the code +block consists of a single one\-line statement. If this is +\fB<undefined>\fP, \fBcode:switch_cases\fP is used instead. +.TP +.B \fBcode:switch_case_range\fP +A single switch case that covers a range of values (possibly consisting +of a single value). 
Supported variable: \fBval\fP (a list variable that +expands to all values in the range). Supported conditionals: \fB\&.many\fP +(true if there\(aqs more than one value in the range) and +\fB\&.char_literals\fP (true if this is a switch on character literals \-\- +some languages provide special syntax for this case). +.TP +.B \fBcode:switch_case_default\fP +Default switch case. +.TP +.B \fBcode:loop\fP +A loop that runs forever (unless interrupted from the loop body). +Supported variables: \fBlabel\fP (loop label), \fBstmt\fP (a list variable +that expands to all statements in the loop body). +.TP +.B \fBcode:continue\fP +Continue statement. Supported variables: \fBlabel\fP (label from which to +continue execution). +.TP +.B \fBcode:goto\fP +Goto statement. Supported variables: \fBlabel\fP (label of the jump +target). +.TP +.B \fBcode:fndecl\fP +Function declaration. Supported variables: \fBname\fP (function name), +\fBtype\fP (return type), \fBarg\fP (a list variable that does not itself +produce code, but expands list expression as many times as there are +function arguments), \fBargname\fP (name of the current argument), +\fBargtype\fP (type of the current argument). Conditional: \fB\&.type\fP (true +if this is a non\-void function). +.TP +.B \fBcode:fndef\fP +Like \fBcode:fndecl\fP, but used for function definitions, so it has one +additional list variable \fBstmt\fP that expands to all statements in the +function body. +.TP +.B \fBcode:fncall\fP +Function call statement. Supported variables: \fBname\fP (function name), +\fBretval\fP (l\-value where the return value is stored, if any) and +\fBarg\fP (a list variable that expands to all function arguments). +Conditionals: \fB\&.args\fP (true if the function has arguments) and +\fB\&.retval\fP (true if return value needs to be saved). +.TP +.B \fBcode:tailcall\fP +Tail call statement. Supported variables: \fBname\fP (function name), +and \fBarg\fP (a list variable that expands to all function arguments). 
+Conditionals: \fB\&.args\fP (true if the function has arguments) and +\fB\&.retval\fP (true if this is a non\-void function). +.TP +.B \fBcode:recursive_functions\fP +Program body with \fB\-\-recursive\-functions\fP code model. Supported +variables: \fBfn\fP (a list variable that does not itself produce any +code, but expands list expression as many times as there are functions), +\fBfndecl\fP (declaration of the current function) and \fBfndef\fP +(definition of the current function). +.TP +.B \fBcode:fingerprint\fP +The fingerprint at the top of the generated output file. Supported +variables: \fBver\fP (re2js version that was used to generate this) and +\fBdate\fP (generation date). +.TP +.B \fBcode:line_info\fP +The format of line directives (if this is set to \fB<undefined>\fP, no +directives are generated). Supported variables: \fBline\fP (line number) +and \fBfile\fP (filename). +.TP +.B \fBcode:abort\fP +A statement that aborts program execution. +.TP +.B \fBcode:yydebug\fP +\fBYYDEBUG\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYDEBUG\fP, \fByyrecord\fP, \fByych\fP (map to the +corresponding \fBre2c:\fP configurations), \fBstate\fP (DFA state number). +.TP +.B \fBcode:yypeek\fP +\fBYYPEEK\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYPEEK\fP, \fBYYCTYPE\fP, \fBYYINPUT\fP, \fBYYCURSOR\fP, +\fByyrecord\fP, \fByych\fP (map to the corresponding \fBre2c:\fP +configurations). Conditionals: \fB\&.cast\fP (true if +\fBre2c:yych:conversion\fP is set to non\-zero). +.TP +.B \fBcode:yyskip\fP +\fBYYSKIP\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYSKIP\fP, \fBYYCURSOR\fP, \fByyrecord\fP (map to the +corresponding \fBre2c:\fP configurations). +.TP +.B \fBcode:yybackup\fP +\fBYYBACKUP\fP statement, possibly specialized for different APIs. 
+Supported variables: \fBYYBACKUP\fP, \fBYYCURSOR\fP, \fBYYMARKER\fP, +\fByyrecord\fP (map to the corresponding \fBre2c:\fP configurations). +.TP +.B \fBcode:yybackupctx\fP +\fBYYBACKUPCTX\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYBACKUPCTX\fP, \fBYYCURSOR\fP, \fBYYCTXMARKER\fP, +\fByyrecord\fP (map to the corresponding \fBre2c:\fP configurations). +.TP +.B \fBcode:yyskip_yypeek\fP +Combined \fBcode:yyskip\fP and \fBcode:yypeek\fP statement (defaults to +\fBcode:yyskip\fP followed by \fBcode:yypeek\fP). +.TP +.B \fBcode:yypeek_yyskip\fP +Combined \fBcode:yypeek\fP and \fBcode:yyskip\fP statement (defaults to +\fBcode:yypeek\fP followed by \fBcode:yyskip\fP). +.TP +.B \fBcode:yyskip_yybackup\fP +Combined \fBcode:yyskip\fP and \fBcode:yybackup\fP statement (defaults to +\fBcode:yyskip\fP followed by \fBcode:yybackup\fP). +.TP +.B \fBcode:yybackup_yyskip\fP +Combined \fBcode:yybackup\fP and \fBcode:yyskip\fP statement (defaults to +\fBcode:yybackup\fP followed by \fBcode:yyskip\fP). +.TP +.B \fBcode:yybackup_yypeek\fP +Combined \fBcode:yybackup\fP and \fBcode:yypeek\fP statement (defaults to +\fBcode:yybackup\fP followed by \fBcode:yypeek\fP). +.TP +.B \fBcode:yyskip_yybackup_yypeek\fP +Combined \fBcode:yyskip\fP, \fBcode:yybackup\fP and \fBcode:yypeek\fP +statement (defaults to \fBcode:yyskip\fP followed by \fBcode:yybackup\fP +followed by \fBcode:yypeek\fP). +.TP +.B \fBcode:yybackup_yypeek_yyskip\fP +Combined \fBcode:yybackup\fP, \fBcode:yypeek\fP and \fBcode:yyskip\fP +statement (defaults to \fBcode:yybackup\fP followed by \fBcode:yypeek\fP +followed by \fBcode:yyskip\fP). +.TP +.B \fBcode:yyrestore\fP +\fBYYRESTORE\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYRESTORE\fP, \fBYYCURSOR\fP, \fBYYMARKER\fP, +\fByyrecord\fP (map to the corresponding \fBre2c:\fP configurations). 
+.TP +.B \fBcode:yyrestorectx\fP +\fBYYRESTORECTX\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYRESTORECTX\fP, \fBYYCURSOR\fP, \fBYYCTXMARKER\fP, +\fByyrecord\fP (map to the corresponding \fBre2c:\fP configurations). +.TP +.B \fBcode:yyrestoretag\fP +\fBYYRESTORETAG\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYRESTORETAG\fP, \fBYYCURSOR\fP, \fByyrecord\fP (map +to the corresponding \fBre2c:\fP configurations), \fBtag\fP (the name of tag +variable used to restore position). +.TP +.B \fBcode:yyshift\fP +\fBYYSHIFT\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYSHIFT\fP, \fBYYCURSOR\fP, \fByyrecord\fP (map to the +corresponding \fBre2c:\fP configurations), \fBoffset\fP (the number of code +units to shift the current position). +.TP +.B \fBcode:yyshiftstag\fP +\fBYYSHIFTSTAG\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYSHIFTSTAG\fP, \fByyrecord\fP, \fBnegative\fP (map +to the corresponding \fBre2c:\fP configurations), \fBtag\fP (tag variable +which needs to be shifted), \fBoffset\fP (the number of code units to +shift). Conditionals: \fB\&.nested\fP (true if this is a nested tag \-\- in +this case its value may equal to \fBre2c:tags:negative\fP, which should +not be shifted). +.TP +.B \fBcode:yyshiftmtag\fP +\fBYYSHIFTMTAG\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYSHIFTMTAG\fP (maps to the corresponding +\fBre2c:\fP configuration), \fBtag\fP (tag variable which needs to be +shifted), \fBoffset\fP (the number of code units to shift). +.TP +.B \fBcode:yystagp\fP +\fBYYSTAGP\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYSTAGP\fP, \fBYYCURSOR\fP, \fByyrecord\fP (map to the +corresponding \fBre2c:\fP configurations), \fBtag\fP (tag variable that +should be updated). 
+.TP +.B \fBcode:yymtagp\fP +\fBYYMTAGP\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYMTAGP\fP (maps to the corresponding \fBre2c:\fP +configuration), \fBtag\fP (tag variable that should be updated). +.TP +.B \fBcode:yystagn\fP +\fBYYSTAGN\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYSTAGN\fP, \fBnegative\fP, \fByyrecord\fP (map to the +corresponding \fBre2c:\fP configurations), \fBtag\fP (tag variable that +should be updated). +.TP +.B \fBcode:yymtagn\fP +\fBYYMTAGN\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYMTAGN\fP (maps to the corresponding \fBre2c:\fP +configuration), \fBtag\fP (tag variable that should be updated). +.TP +.B \fBcode:yycopystag\fP +\fBYYCOPYSTAG\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYCOPYSTAG\fP, \fByyrecord\fP (map to the +corresponding \fBre2c:\fP configurations), \fBlhs\fP, \fBrhs\fP (left and +right hand side tag variables of the copy operation). +.TP +.B \fBcode:yycopymtag\fP +\fBYYCOPYMTAG\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYCOPYMTAG\fP, \fByyrecord\fP (map to the +corresponding \fBre2c:\fP configurations), \fBlhs\fP, \fBrhs\fP (left and +right hand side tag variables of the copy operation). +.TP +.B \fBcode:yygetaccept\fP +\fBYYGETACCEPT\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYGETACCEPT\fP, \fByyrecord\fP (map to the +corresponding \fBre2c:\fP configurations), \fBvar\fP (maps to +\fBre2c:yyaccept\fP configuration). +.TP +.B \fBcode:yysetaccept\fP +\fBYYSETACCEPT\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYSETACCEPT\fP, \fByyrecord\fP (map to the +corresponding \fBre2c:\fP configurations), \fBvar\fP (maps to +\fBre2c:yyaccept\fP configuration) and \fBval\fP (numeric value of the +accepted rule). 
+.TP +.B \fBcode:yygetcond\fP +\fBYYGETCOND\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYGETCOND\fP, \fByyrecord\fP (map to the +corresponding \fBre2c:\fP configurations), \fBvar\fP (maps to +\fBre2c:yycond\fP configuration). +.TP +.B \fBcode:yysetcond\fP +\fBYYSETCOND\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYSETCOND\fP, \fByyrecord\fP (map to the +corresponding \fBre2c:\fP configurations), \fBvar\fP (maps to +\fBre2c:yycond\fP configuration) and \fBval\fP (numeric condition +identifier). +.TP +.B \fBcode:yygetstate\fP +\fBYYGETSTATE\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYGETSTATE\fP, \fByyrecord\fP (map to the +corresponding \fBre2c:\fP configurations), \fBvar\fP (maps to +\fBre2c:yystate\fP configuration). +.TP +.B \fBcode:yysetstate\fP +\fBYYSETSTATE\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYSETSTATE\fP, \fByyrecord\fP (map to the +corresponding \fBre2c:\fP configurations), \fBvar\fP (maps to +\fBre2c:yystate\fP configuration) and \fBval\fP (state number). +.TP +.B \fBcode:yylessthan\fP +\fBYYLESSTHAN\fP statement, possibly specialized for different APIs. +Supported variables: \fBYYLESSTHAN\fP, \fBYYCURSOR\fP, \fBYYLIMIT\fP, +\fByyrecord\fP (map to the corresponding \fBre2c:\fP configurations), +\fBneed\fP (the number of code units to check against). Conditional: +\fB\&.many\fP (true if the \fBneed\fP is more than one). +.TP +.B \fBcode:yybm_filter\fP +Condition that is used to filter out \fByych\fP values that are not +covered by the \fByybm\fP table (used with \fB\-\-bitmaps\fP option). +Supported variable: \fByych\fP (maps to \fBre2c:yych\fP configuration). +.TP +.B \fBcode:yybm_match\fP +The format of \fByybm\fP table check (generated with \fB\-\-bitmaps\fP +option). 
Supported variables: \fByybm\fP, \fByych\fP (map to the +corresponding \fBre2c:\fP configurations), \fBoffset\fP (offset in the +\fByybm\fP table that needs to be added to \fByych\fP) and \fBmask\fP (bit +mask that should be applied to the table entry to retrieve the boolean +value that needs to be checked) +.UNINDENT +.sp +Here\(aqs a list of all global variables that are allowed in syntax files: +.INDENT 0.0 +.TP +.B \fBnl\fP +A newline. +.TP +.B \fBindent\fP +A variable that does not produce any code, but has a side\-effect of +increasing indentation level. +.TP +.B \fBdedent\fP +A variable that does not produce any code, but has a side\-effect of +decreasing indentation level. +.TP +.B \fBtopindent\fP +Indentation string for the current statement. Indentation level is +tracked and automatically updated by the code generator. +.UNINDENT +.sp +Here\(aqs a list of all global conditionals that are allowed in syntax files: +.INDENT 0.0 +.TP +.B \fB\&.api.simple\fP +True if simple API is used (\fB\-\-api simple\fP or \fBre2c:api = simple\fP). +.TP +.B \fB\&.api.generic\fP +True if generic API is used (\fB\-\-api generic\fP or +\fBre2c:api = generic\fP). +.TP +.B \fB\&.api.record\fP +True if record API is used (\fB\-\-api record\fP or \fBre2c:api = record\fP). +.TP +.B \fB\&.api_style.functions\fP +True if function\-like API style is used +(\fBre2c:api\-style = functions\fP). +.TP +.B \fB\&.api_style.freeform\fP +True if free\-form API style is used (\fBre2c:api\-style = free\-form\fP). +.TP +.B \fB\&.case_ranges\fP +True if case ranges feature is enabled (\fB\-\-case\-ranges\fP or +\fBre2c:case\-ranges = 1\fP). +.TP +.B \fB\&.code_model.goto_label\fP +True if code model based on goto/label is used (\fB\-\-goto\-label\fP). +.TP +.B \fB\&.code_model.loop_switch\fP +True if code model based on loop/switch is used (\fB\-\-loop\-switch\fP). 
+.TP +.B \fB\&.code_model.recursive_functions\fP +True if code model based on recursive functions is used +(\fB\-\-recursive\-functions\fP). +.TP +.B \fB\&.date\fP +True if the generated fingerprint should contain generation date. +.TP +.B \fB\&.loop_label\fP +True if re2js generated loops must have a label (\fBre2c:label:yyloop\fP +is set to a nonempty string). +.TP +.B \fB\&.monadic\fP +True if the generated code should be monadic (\fBre2c:monadic = 1\fP). +This is only relevant for pure functional languages. +.TP +.B \fB\&.start_conditions\fP +True if start conditions are enabled (\fB\-\-start\-conditions\fP). +.TP +.B \fB\&.storable_state\fP +True if storable state is enabled (\fB\-\-storable\-state\fP). +.TP +.B \fB\&.unsafe\fP +True if re2js should use \(dqunsafe\(dq blocks in order to generate faster +code (\fB\-\-unsafe\fP, \fBre2c:unsafe = 1\fP). This is only relevant for +languages that have \(dqunsafe\(dq feature. +.TP +.B \fB\&.version\fP +True if the generated fingerprint should contain re2js version. +.UNINDENT +.UNINDENT +.UNINDENT +.SH HANDLING THE END OF INPUT +.sp +One of the main problems for the lexer is to know when to stop. +There are a few terminating conditions: +.INDENT 0.0 +.IP \(bu 2 +the lexer may match some rule (including default rule \fB*\fP) and come to a +final state +.IP \(bu 2 +the lexer may fail to match any rule and come to a default state +.IP \(bu 2 +the lexer may reach the end of input +.UNINDENT +.sp +The first two conditions terminate the lexer in a \(dqnatural\(dq way: it comes to a +state with no outgoing transitions, and the matching automatically stops. The +third condition, end of input, is different: it may happen in any state, and the +lexer should be able to handle it. Checking for the end of input interrupts the +normal lexer workflow and adds conditional branches to the generated program, +therefore it is necessary to minimize the number of such checks.
re2js supports +a few different methods for handling the end of input. Which one to use depends +on the complexity of regular expressions, the need for buffering, performance +considerations and other factors. Here is a list of methods: +.INDENT 0.0 +.IP \(bu 2 +\fBSentinel.\fP +This method eliminates the need for the end of input checks altogether. It is +simple and efficient, but limited to the case when there is a natural +\(dqsentinel\(dq character that can never occur in valid input. This character may +still occur in invalid input, but it should not be allowed by the regular +expressions, except perhaps as the last character of a rule. The sentinel is +appended at the end of input and serves as a stop signal: when the lexer reads +this character, it is either a syntax error or the end of input. In both +cases the lexer should stop. This method is used if \fBYYFILL\fP is disabled +with \fBre2c:yyfill:enable = 0;\fP and \fBre2c:eof\fP has the default value +\fB\-1\fP\&. +.nf + +.fi +.sp +.IP \(bu 2 +\fBSentinel with bounds checks.\fP +This method is generic: it allows one to handle any input without restrictions on +the regular expressions. The idea is to reduce the number of end of input +checks by performing them only on certain characters. Similar to the +\(dqsentinel\(dq method, one of the characters is chosen as a \(dqsentinel\(dq and +appended at the end of input. However, there is no restriction on where the +sentinel may occur (in fact, any character can be chosen for a sentinel). +When the lexer reads this character, it additionally performs a bounds check. +If the current position is within bounds, the lexer resumes matching and +handles the sentinel as a regular character. Otherwise it invokes \fBYYFILL\fP +(unless it is disabled). If more input is supplied, the lexer will rematch the +last character and continue as if the sentinel wasn\(aqt there. Otherwise it must +be the real end of input, and the lexer stops. 
This method is used when +\fBre2c:eof\fP has a non\-negative value (it should be set to the numeric value of +the sentinel). \fBYYFILL\fP is optional. +.nf + +.fi +.sp +.IP \(bu 2 +\fBBounds checks with padding.\fP +This method is generic, and it may be faster than the \(dqsentinel with bounds +checks\(dq method, but it is also more complex. The idea is to partition DFA +states into strongly connected components (SCCs) and generate a single check +per SCC for enough characters to cover the longest non\-looping path in this +SCC. This reduces the number of checks, but there is a problem with short +lexemes at the end of input, as the check requires enough characters to cover +the longest lexeme. This can be fixed by padding the input with a few fake +characters that do not form a valid lexeme suffix (so that the lexer cannot +match them). The length of padding should be \fBYYMAXFILL\fP, generated with +a \fBmax\fP block. If there is not enough input, the lexer invokes \fBYYFILL\fP +which should supply at least the required number of characters or not return. +This method is used if \fBYYFILL\fP is enabled and \fBre2c:eof\fP is \fB\-1\fP +(this is the default configuration). +.nf + +.fi +.sp +.IP \(bu 2 +\fBCustom checks.\fP +Generic API allows one to override basic operations like reading a character, +which makes it possible to include the end\-of\-input checks as part of them. +This approach is error\-prone and should be used with caution. To use a custom +method, enable generic API with \fB\-\-api generic\fP or \fBre2c:api = generic;\fP and +disable default bounds checks with \fBre2c:yyfill:enable = 0;\fP or +\fBre2c:yyfill:check = 0;\fP\&. +.UNINDENT +.sp +The following subsections contain an example of each method. +.SS Sentinel +.sp +This example uses a sentinel character to handle the end of input. The program +counts space\-separated words in a null\-terminated string.
The sentinel is null: +it is the last character of each input string, and it is not allowed in the +middle of a lexeme by any of the rules (in particular, it is not included in +character ranges where it is easy to overlook). If a null occurs in the middle +of a string, it is a syntax error and the lexer will match default rule \fB*\fP, +but it won\(aqt read past the end of input or crash (use +\fI\%\-Wsentinel\-in\-midrule\fP +warning and \fBre2c:sentinel\fP configuration to verify this). Configuration +\fBre2c:yyfill:enable = 0;\fP suppresses the generation of bounds checks and +\fBYYFILL\fP invocations. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +// expects a null\-terminated string +function lex(yyinput) { + let yycursor = 0; + let count = 0 + + loop: while (true) { + /*!re2c + re2c:yyfill:enable = 0; + + * { return \-1 } + [\ex00] { return count } + [ ]+ { continue loop } + [a\-z]+ { count += 1; continue loop } + */ + } +} + +function test(s, n) { if (lex(s) != n) throw \(dqerror!\(dq; } +test(\(dq\e0\(dq, 0) +test(\(dqone two three\e0\(dq, 3) +test(\(dqf0ur\e0\(dq, \-1) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Sentinel with bounds checks +.sp +This example uses sentinel with bounds checks to handle the end of input (this +method was added in version 1.2). The program counts space\-separated +single\-quoted strings. The sentinel character is null, which is specified with +\fBre2c:eof = 0;\fP configuration. As in the \fI\%sentinel\fP method, null is the last +character of each input string, but it is allowed in the middle of a rule (for +example, \fB\(aqaaa\e0aa\(aq\e0\fP is valid input, but \fB\(aqaaa\e0\fP is a syntax error). +Bounds checks are generated in each state that matches an input character, but +they are scoped to the branch that handles null. Bounds checks are of the form +\fBYYLIMIT <= YYCURSOR\fP or \fBYYLESSTHAN(1)\fP with generic API. 
If the check +condition is true, lexer has reached the end of input and should stop +(\fBYYFILL\fP is disabled with \fBre2c:yyfill:enable = 0;\fP as the input fits into +one buffer, see the \fI\%YYFILL with sentinel\fP section for an example that uses +\fBYYFILL\fP). Reaching the end of input opens three possibilities: if the lexer +is in the initial state it will match the end\-of\-input rule \fB$\fP, otherwise it +may fallback to a previously matched rule (including default rule \fB*\fP) or go +to a default state, causing +\fI\%\-Wundefined\-control\-flow\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +// expects a null\-terminated string +function lex(yyinput) { + let yycursor = 0; + let yylimit = yyinput.length \- 1 // terminating null not included + let count = 0 + + loop: while (true) { + /*!re2c + re2c:yyfill:enable = 0; + re2c:eof = 0; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + * { return \-1 } + $ { return count } + [ ]+ { continue loop } + str { count += 1; continue loop } + */ + } +} + +function test(s, n) { if (lex(s) != n) throw \(dqerror!\(dq; } +test(\(dq\e0\(dq, 0) +test(\(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \e0\(dq, 3) +test(\(dq\(aqunterminated\e\e\(aq\e0\(dq, \-1) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Bounds checks with padding +.sp +This example uses bounds checks with padding to handle the end of input (this +method is enabled by default). The program counts space\-separated single\-quoted +strings. There is a padding of \fBYYMAXFILL\fP null characters appended at the end +of input, where \fBYYMAXFILL\fP value is autogenerated with a \fBmax\fP block. It +is not necessary to use null for padding \-\-\- any characters can be used as long +as they do not form a valid lexeme suffix (in this example padding should not +contain single quotes, as they may be mistaken for a suffix of a single\-quoted +string). 
There is a \(dqstop\(dq rule that matches the first padding character (null) +and terminates the lexer (note that it checks if null is at the beginning of +padding, otherwise it is a syntax error). Bounds checks are generated only in +some states that are determined by the strongly connected components of the +underlying automaton. Checks have the form \fB(YYLIMIT \- YYCURSOR) < n\fP or +\fBYYLESSTHAN(n)\fP with generic API, where \fBn\fP is the minimum number of +characters that are needed for the lexer to proceed (it also means that the next +bounds check will occur in at most \fBn\fP characters). If the check condition is +true, the lexer has reached the end of input and will invoke \fBYYFILL(n)\fP that +should either supply at least \fBn\fP input characters or not return. In this +example \fBYYFILL\fP always fails and terminates the lexer with an error (which is +fine because the input fits into one buffer). See the \fI\%YYFILL with padding\fP +section for an example that refills the input buffer with \fBYYFILL\fP\&. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +/*!max:re2c*/ + +function lex(yyinput) { + let yycursor = 0; + let yylimit = yyinput.length + let count = 0 + + loop: while (true) { + /*!re2c + re2c:YYFILL = \(dqreturn \-1\(dq; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + [\ex00] { + // check that it is the sentinel, not some unexpected null + return (yycursor == yylimit \- YYMAXFILL + 1) ? 
count : \-1 + } + str { count += 1; continue loop } + [ ]+ { continue loop } + * { return \-1 } + */ + } +} + +function test(s, n) { + let padded_s = s + \(dq\e0\(dq.repeat(YYMAXFILL) + if (lex(padded_s) != n) throw \(dqerror!\(dq +} + +test(\(dq\(dq, 0) +test(\(dq\(aqunterminated\e\e\(aq\(dq, \-1) +test(\(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq, 3) +test(\(dq\(aqunexpected \e0 null\(dq, \-1) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Custom checks +.sp +This example uses a custom end\-of\-input handling method based on generic API. +The program counts space\-separated words. It is the same as the +\fI\%sentinel\fP example, except that the input is not null\-terminated. To cover up +for the absence of a sentinel character at the end of input, \fBYYPEEK\fP is +redefined to perform a bounds check before it reads the next input character. +This is inefficient because checks are done very often. If the current position +is within bounds, \fBYYPEEK\fP returns the real character, otherwise it returns a fake +sentinel character. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +// expects a string without terminating null +function lex(str) { + let cur = 0; + let lim = str.length + let count = 0 + + loop: while (true) { + /*!re2c + re2c:api = generic; + re2c:YYPEEK = \(dqcur < lim ? str.charCodeAt(cur) : 0\(dq; + re2c:YYSKIP = \(dqcur += 1\(dq; + re2c:yyfill:enable = 0; + + * { return \-1 } + [\ex00] { return count } + [ ]+ { continue loop } + [a\-z]+ { count += 1; continue loop } + */ + } +} + +function test(s, n) { if (lex(s) != n) throw \(dqerror!\(dq; } +test(\(dq\(dq, 0) +test(\(dqone two three\(dq, 3) +test(\(dqf0ur\(dq, \-1) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH BUFFER REFILLING +.sp +The need for buffering arises when the input cannot be mapped in memory all at +once: either it is too large, or it comes in a streaming fashion (like reading +from a socket).
The usual technique in such cases is to allocate a fixed\-sized +memory buffer and process input in chunks that fit into the buffer. When the +current chunk is processed, it is moved out and new data is moved in. In +practice it is somewhat more complex, because lexer state consists not of a +single input position, but a set of interrelated positions: +.INDENT 0.0 +.IP \(bu 2 +cursor: the next input character to be read (\fBYYCURSOR\fP in C pointer API or +\fBYYSKIP\fP/\fBYYPEEK\fP in generic API) +.IP \(bu 2 +limit: the position after the last available input character (\fBYYLIMIT\fP in +C pointer API, implicitly handled by \fBYYLESSTHAN\fP in generic API) +.IP \(bu 2 +marker: the position of the most recent match, if any (\fBYYMARKER\fP in default +API or \fBYYBACKUP\fP/\fBYYRESTORE\fP in generic API) +.IP \(bu 2 +token: the start of the current lexeme (implicit in re2js API, as it is not +needed for the normal lexer operation and can be defined and updated by the +user) +.IP \(bu 2 +context marker: the position of the trailing context (\fBYYCTXMARKER\fP in +C pointer API or \fBYYBACKUPCTX\fP/\fBYYRESTORECTX\fP in generic API) +.IP \(bu 2 +tag variables: submatch positions (defined with \fBstags\fP and \fBmtags\fP blocks +and generic API primitives \fBYYSTAGP\fP/\fBYYSTAGN\fP/\fBYYMTAGP\fP/\fBYYMTAGN\fP) +.UNINDENT +.sp +Not all these are used in every case, but if used, they must be updated by +\fBYYFILL\fP\&. All active positions are contained in the segment between token and +cursor, therefore everything between buffer start and token can be discarded, +the segment from token and up to limit should be moved to the beginning of +buffer, and the free space at the end of buffer should be filled with new data. +In order to avoid frequent \fBYYFILL\fP calls it is best to fill in as many input +characters as possible (even though fewer characters might suffice to resume the +lexer). 
The details of \fBYYFILL\fP implementation are slightly different +depending on which EOF handling method is used: the case of EOF rule is somewhat +simpler than the case of bounds\-checking with padding. Also note that if +\fB\-f \-\-storable\-state\fP option is used, \fBYYFILL\fP has slightly different +semantics (described in the section about storable state). +.SS YYFILL with sentinel +.sp +If EOF rule is used, \fBYYFILL\fP is a function\-like primitive that accepts +no arguments and returns a value which is checked against zero. \fBYYFILL\fP +invocation is triggered by condition \fBYYLIMIT <= YYCURSOR\fP in C pointer API and +\fBYYLESSTHAN()\fP in generic API. A non\-zero return value means that \fBYYFILL\fP +has failed. A successful \fBYYFILL\fP call must supply at least one character and +adjust input positions accordingly. Limit must always be set to one after the +last input position in buffer, and the character at the limit position must be +the sentinel symbol specified by \fBre2c:eof\fP configuration. The pictures below +show the relative locations of input positions in buffer before and after +\fBYYFILL\fP call (sentinel symbol is marked with \fB#\fP, and the second picture +shows the case when there is not enough input to fill the whole buffer). +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C + <\-\- shift \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D#\-\-\-\-\-\-\-\-\-\-\-E\-> + buffer token marker limit, + cursor +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D\-\-\-\-\-\-\-\-\-\-\-\-E#\-> + buffer, marker cursor limit + token + + <\-\- shift \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D#\-\-E (EOF) + buffer token marker limit, + cursor +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-\-\-\-\-\-\-\-\-D\-\-\-E#........ 
+ buffer, marker cursor limit + token +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of a program that reads input file \fBinput.txt\fP in chunks of +4096 bytes and uses EOF rule. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +const fs = require(\(aqfs\(aq) + +const BUFSIZE = 4096 +const OK = 0 +const EOF = 1 +const LONG_LEXEME = 2 + +function fill(st) { + if (st.eof) return EOF + + // Error: lexeme too long. In real life could reallocate a larger buffer. + if (st.token < 1) return LONG_LEXEME + + // Shift buffer contents (discard everything up to the current token). + st.yyinput.copy(st.yyinput, 0, st.token, st.yylimit) + st.yycursor \-= st.token; + st.yymarker \-= st.token; + st.yylimit \-= st.token; + st.token = 0; + + // Read a new chunk of data from file and append it to \(gayyinput\(ga. + let want = BUFSIZE \- st.yylimit \- 1 // \-1 for sentinel + let nread = fs.readSync(st.file, st.yyinput, st.yylimit, want) + st.eof = nread < want // end of file? + st.yylimit += nread + st.yyinput.writeUInt8(0, st.yylimit) // sentinel + + return OK +} + +function lex(yyrecord, count) { + loop: while (true) { + yyrecord.token = yyrecord.yycursor + /*!re2c + re2c:api = record; + re2c:YYPEEK = \(dqreadUInt8\(dq; + re2c:YYFILL = \(dqfill(yyrecord) == OK\(dq; + re2c:eof = 0; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + * { return \-1 } + $ { return count } + [ ]+ { continue loop } + str { count += 1; continue loop } + */ + } +} + +function main() { + let fname = \(dqinput\(dq + + // Create input file. + let content = \(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq.repeat(BUFSIZE) + fs.writeFileSync(fname, content, function(err) { if (err) throw err; }) + + // Init lexer state. 
+ let limit = BUFSIZE \- 1 // exclude terminating null + let st = { + file: fs.openSync(fname, \(aqr\(aq), + yyinput: Buffer.alloc(BUFSIZE), + yylimit: limit, + yycursor: limit, + yymarker: limit, + token: limit, + eof: false + } + + // Run lexer on the prepared file. + if (lex(st, 0) != 3 * BUFSIZE) { throw \(dqerror :[\(dq } + + // Cleanup. + fs.unlink(fname, function(err){ if (err) throw err; }) +} + +main() + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS YYFILL with padding +.sp +In the default case (when EOF rule is not used) \fBYYFILL\fP is a function\-like +primitive that accepts a single argument and does not return any value. +\fBYYFILL\fP invocation is triggered by condition \fB(YYLIMIT \- YYCURSOR) < n\fP in +C pointer API and \fBYYLESSTHAN(n)\fP in generic API. The argument passed to +\fBYYFILL\fP is the minimal number of characters that must be supplied. If it +fails to do so, \fBYYFILL\fP must not return to the lexer (for that reason it is +best implemented as a macro that returns from the calling function on failure). +In case of a successful \fBYYFILL\fP invocation the limit position must be set +either to one after the last input position in buffer, or to the end of +\fBYYMAXFILL\fP padding (in case \fBYYFILL\fP has successfully read at least \fBn\fP +characters, but not enough to fill the entire buffer). The pictures below show +the relative locations of input positions in buffer before and after \fBYYFILL\fP +invocation (\fBYYMAXFILL\fP padding on the second picture is marked with \fB#\fP +symbols). 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C + <\-\- shift \-\-> <\-\- need \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-\-\-F\-\-\-\-\-\-\-\-G\-> + buffer token marker cursor limit + +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-\-\-F\-\-\-\-\-\-\-\-G\-> + buffer, marker cursor limit + token + + <\-\- shift \-\-> <\-\- need \-\-> + >\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-F (EOF) + buffer token marker cursor limit + +>\-A\-\-\-\-\-\-\-\-\-\-\-\-B\-\-\-\-\-\-\-\-\-C\-\-\-\-\-D\-\-\-\-\-\-\-E\-F############### + buffer, marker cursor limit + token <\- YYMAXFILL \-> +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of a program that reads input file \fBinput.txt\fP in chunks of +4096 bytes and uses bounds\-checking with padding. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +const fs = require(\(aqfs\(aq) + +const BUFSIZE = 4096 +const OK = 0 +const EOF = 1 +const LONG_LEXEME = 2 +/*!max:re2c*/ + +function fill(st, need) { + if (st.eof) return EOF + + // Error: lexeme too long. In real life could reallocate a larger buffer. + if (st.token < need) return LONG_LEXEME + + // Shift buffer contents (discard everything up to the current token). + st.yyinput.copy(st.yyinput, 0, st.token, st.yylimit) + st.yycursor \-= st.token; + st.yylimit \-= st.token; + st.token = 0; + + // Read a new chunk of data from file and append it to \(gayyinput\(ga. 
+ let want = BUFSIZE \- st.yylimit \- 1 // \-1 for sentinel + let nread = fs.readSync(st.file, st.yyinput, st.yylimit, want) + st.yylimit += nread + if (nread < want) { + st.eof = true // end of file + st.yyinput.write(\(dq\e0\(dq.repeat(YYMAXFILL), st.yylimit) + st.yylimit += YYMAXFILL + } + + return OK +} + +function lex(yyrecord, count) { + loop: while (true) { + yyrecord.token = yyrecord.yycursor + /*!re2c + re2c:api = record; + re2c:YYPEEK = \(dqreadUInt8\(dq; + re2c:YYFILL = \(dqif (fill(yyrecord, @@) != OK) return \-1;\(dq; + + str = [\(aq] ([^\(aq\e\e] | [\e\e][^])* [\(aq]; + + [\ex00] { + // Check that it is the sentinel, not some unexpected null. + return yyrecord.token == yyrecord.yylimit \- YYMAXFILL ? count : \-1 + } + str { count += 1; continue loop } + [ ]+ { continue loop } + * { return \-1 } + */ + } +} + +function main() { + let fname = \(dqinput\(dq + + // Create input file. + let content = \(dq\(aqqu\e0tes\(aq \(aqare\(aq \(aqfine: \e\e\(aq\(aq \(dq.repeat(BUFSIZE) + fs.writeFileSync(fname, content, function(err) { if (err) throw err; }) + + // Init lexer state. + let limit = BUFSIZE \- 1 // exclude terminating null + let st = { + file: fs.openSync(fname, \(aqr\(aq), + yyinput: Buffer.alloc(BUFSIZE), + yylimit: limit, + yycursor: limit, + token: limit, + eof: false + } + + // Run lexer on the prepared file. + if (lex(st, 0) != 3 * BUFSIZE) { throw \(dqerror :[\(dq } + + // Cleanup. + fs.unlink(fname, function(err){ if (err) throw err; }) +} + +main() + +.ft P +.fi +.UNINDENT +.UNINDENT +.SH FEATURES +.SS Multiple blocks +.sp +Sometimes it is necessary to have multiple interrelated lexers (for example, if +there is a high\-level state machine that transitions between lexer modes). This +can be implemented using multiple connected re2js blocks. Another option is to +use \fI\%start conditions\fP\&. +.sp +The implementation of connections between blocks depends on the target language. 
+In languages that have \fBgoto\fP statement (such as C/C++ and Go) one can have +all blocks in one function, each of them prefixed with a label. Transition from +one block to another is a simple \fBgoto\fP\&. +In languages that do not have \fBgoto\fP (such as Rust) it is necessary to use a +loop with a switch on a state variable, similar to the \fByystate\fP loop/switch +generated by re2js, or else wrap each block in a function and use function calls. +.sp +The example below uses multiple blocks to parse binary, octal, decimal and +hexadecimal numbers. Each base has its own block. The initial block determines +base and dispatches to other blocks. Common configurations are defined in a +separate block at the beginning of the program; they are inherited by the other +blocks. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +/*!re2c // Common re2c definitions shared between all functions. + re2c:api = record; + re2c:yyrecord = st; + re2c:yyfill:enable = 0; +*/ + +function parse_u32(str) { + let st = { + yyinput: str, + yycursor: 0, + yymarker: 0 + } + /*!re2c + \(aq0b\(aq / [01] { return parse_bin(st) } + \(dq0\(dq { return parse_oct(st) } + \(dq\(dq / [1\-9] { return parse_dec(st) } + \(aq0x\(aq / [0\-9a\-fA\-F] { return parse_hex(st) } + * { return null } + */ +} + +function parse_bin(st) { + n = 0 + loop: while (true) { + /*!re2c + [01] { n = n * 2 + (st.yyinput.charCodeAt(st.yycursor \- 1) \- 48); continue loop } + * { return n } + */ + } +} + +function parse_oct(st) { + n = 0 + loop: while (true) { + /*!re2c + [0\-7] { n = n * 8 + (st.yyinput.charCodeAt(st.yycursor \- 1) \- 48); continue loop } + * { return n } + */ + } +} + +function parse_dec(st) { + n = 0 + loop: while (true) { + /*!re2c + [0\-9] { n = n * 10 + (st.yyinput.charCodeAt(st.yycursor \- 1) \- 48); continue loop } + * { return n } + */ + } +} + +function parse_hex(st) { + n = 0 + loop: while (true) { + /*!re2c + [0\-9] { n = n * 16 + (st.yyinput.charCodeAt(st.yycursor \- 1) 
\- 48); continue loop } + [a\-f] { n = n * 16 + (st.yyinput.charCodeAt(st.yycursor \- 1) \- 87); continue loop } + [A\-F] { n = n * 16 + (st.yyinput.charCodeAt(st.yycursor \- 1) \- 55); continue loop } + * { return n } + */ + } +} + +function test(s, n) { + if (parse_u32(s) != n) throw \(dqerror!\(dq +} + +test(\(dq\e0\(dq, null) +test(\(dq1234567890\e0\(dq, 1234567890) +test(\(dq0b1101\e0\(dq, 13) +test(\(dq0x7Fe\e0\(dq, 2046) +test(\(dq0644\e0\(dq, 420) +test(\(dq9999999999\e0\(dq, 9999999999) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Start conditions +.sp +Start conditions are enabled with \fB\-\-start\-conditions\fP option. They provide a +way to encode multiple interrelated automata within the same re2js block. +.sp +Each condition corresponds to a single automaton and has a unique name specified +by the user and a unique internal number defined by re2js\&. The numbers are used +to switch between conditions: the generated code uses \fBYYGETCOND\fP and +\fBYYSETCOND\fP primitives to get the current condition or set it to the +given number. Use \fBconditions\fP block, \fB\-\-header\fP option or \fBre2c:header\fP +configuration to generate numeric condition identifiers. Configuration +\fBre2c:cond:enumprefix\fP specifies the generated identifier prefix. +.sp +In condition mode every rule must be prefixed with a list of comma\-separated +condition names in angle brackets, or a wildcard \fB<*>\fP to denote all +conditions. The rule syntax is extended as follows: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.TP +.B \fB< cond\-list > regexp action\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP and executes the associated \fBaction\fP\&. +.TP +.B \fB< cond\-list > regexp => cond action\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP, sets the current condition to \fBcond\fP and +executes the associated \fBaction\fP\&. 
+.TP +.B \fB< cond\-list > regexp :=> cond\fP +A rule that is merged to every condition on the \fBcond\-list\fP\&. +It matches \fBregexp\fP and immediately transitions to \fBcond\fP (there is +no semantic action). +.TP +.B \fB<! cond\-list > action\fP +The \fBaction\fP is prepended to semantic actions of all rules for every +condition on the \fBcond\-list\fP\&. This may be used to deduplicate common +code. +.TP +.B \fB< > action\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string and executes the \fBaction\fP\&. +.TP +.B \fB< > => cond action\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string, sets the current condition to +\fBcond\fP and executes the \fBaction\fP\&. +.TP +.B \fB< > :=> cond\fP +A rule that is merged to a special entry condition with number zero +and name \fB\(dq0\(dq\fP\&. It matches empty string and immediately transitions to +\fBcond\fP\&. +.UNINDENT +.UNINDENT +.UNINDENT +.sp +The code re2js generates for conditions depends on whether re2js uses goto/label +approach or loop/switch approach to encode the automata. +.sp +In languages that have \fBgoto\fP statement (such as C/C++ and Go) conditions are +naturally implemented as blocks of code prefixed with labels of the form +\fByyc_<cond>\fP, where \fBcond\fP is a condition name (label prefix can be changed +with \fBre2c:cond:prefix\fP). Transitions between conditions are implemented using +\fBgoto\fP and condition labels. Before all conditions re2js generates an initial +switch on \fBYYGETCOND\fP that jumps to the start state of the current condition. +The shortcut rules \fB:=>\fP bypass the initial switch and jump directly to the +specified condition (\fBre2c:cond:goto\fP can be used to change the default +behavior). The rules with semantic actions do not automatically jump to the next +condition; this should be done by the user\-defined action code.
+.sp +In languages that do not have \fBgoto\fP (such as Rust) re2js reuses the +\fByystate\fP variable to store condition numbers. Each condition gets a numeric +identifier equal to the number of its start state, and a switch between +conditions is no different than a switch between DFA states of a single +condition. There is no need for a separate initial condition switch. +(Since the same approach is used to implement storable states, +\fBYYGETCOND\fP/\fBYYSETCOND\fP are redundant if both storable states and +conditions are used). +.sp +The program below uses start conditions to parse binary, octal, decimal and +hexadecimal numbers. There is a single block where each base has its own +condition, and the initial condition is connected to all of them. User\-defined +variable \fByycond\fP stores the current condition number; it is initialized to the +number of the initial condition generated with \fBconditions\fP block. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT \-c + +/*!conditions:re2c*/ + +function parse_u32(yyinput) { + let yycursor = 0 + let yycond = YYC_INIT + let n = 0 + + loop: while (true) { + /*!re2c + re2c:yyfill:enable = 0; + re2c:indent:top = 2; + + <INIT> \(aq0b\(aq / [01] :=> BIN + <INIT> \(dq0\(dq :=> OCT + <INIT> \(dq\(dq / [1\-9] :=> DEC + <INIT> \(aq0x\(aq / [0\-9a\-fA\-F] :=> HEX + <INIT> * { return null } + + <BIN> [01] { n = n * 2 + (yyinput.charCodeAt(yycursor \- 1) \- 48); continue loop } + <OCT> [0\-7] { n = n * 8 + (yyinput.charCodeAt(yycursor \- 1) \- 48); continue loop } + <DEC> [0\-9] { n = n * 10 + (yyinput.charCodeAt(yycursor \- 1) \- 48); continue loop } + <HEX> [0\-9] { n = n * 16 + (yyinput.charCodeAt(yycursor \- 1) \- 48); continue loop } + <HEX> [a\-f] { n = n * 16 + (yyinput.charCodeAt(yycursor \- 1) \- 87); continue loop } + <HEX> [A\-F] { n = n * 16 + (yyinput.charCodeAt(yycursor \- 1) \- 55); continue loop } + + <BIN, OCT, DEC, HEX> * { return n } + */ + } +} + +function test(s, n) { + if (parse_u32(s) != n) throw \(dqerror!\(dq +} + +test(\(dq\e0\(dq, null) +test(\(dq1234567890\e0\(dq,
1234567890) +test(\(dq0b1101\e0\(dq, 13) +test(\(dq0x7Fe\e0\(dq, 2046) +test(\(dq0644\e0\(dq, 420) +test(\(dq9999999999\e0\(dq, 9999999999) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Storable state +.sp +With \fB\-\-storable\-state\fP option re2js generates a lexer that can store +its current state, return to the caller, and later resume operations exactly +where it left off. The default mode of operation in re2js is a \(dqpull\(dq model, +in which the lexer \(dqpulls\(dq more input whenever it needs it. This may be +unacceptable in cases when the input becomes available piece by piece (for +example, if the lexer is invoked by the parser, or if the lexer program +communicates via a socket protocol with some other program that must wait for a +reply from the lexer before it transmits the next message). Storable state +feature is intended exactly for such cases: it allows one to generate lexers that +work in a \(dqpush\(dq model. When the lexer needs more input, it stores its state and +returns to the caller. Later, when more input becomes available, the caller +resumes the lexer exactly where it stopped. There are a few changes necessary +compared to the \(dqpull\(dq model: +.INDENT 0.0 +.IP \(bu 2 +Define \fBYYGETSTATE()\fP and \fBYYSETSTATE(state)\fP primitives. +.IP \(bu 2 +Define \fByych\fP, \fByyaccept\fP (if used) and \fBstate\fP variables as a part of +persistent lexer state. The \fBstate\fP variable should be initialized to \fB\-1\fP\&. +.IP \(bu 2 +\fBYYFILL\fP should return to the outer program instead of trying to supply more +input. Return code should indicate that lexer needs more input. +.IP \(bu 2 +The outer program should recognize situations when lexer needs more input and +respond appropriately. +.IP \(bu 2 +Optionally use \fBgetstate\fP block to generate \fBYYGETSTATE\fP switch detached +from the main lexer. This only works for languages that have \fBgoto\fP (not in +\fB\-\-loop\-switch\fP mode).
+.IP \(bu 2 +Use \fBre2c:eof\fP and the \fI\%sentinel with bounds checks\fP method to handle the +end of input. Padding\-based method may not work because it is unclear when to +append padding: the current end of input may not be the ultimate end of input, +and appending padding too early may cut off a partially read greedy lexeme. +Furthermore, due to high\-level program logic getting more input may depend on +processing the lexeme at the end of buffer (which already is blocked due to +the end\-of\-input condition). +.UNINDENT +.sp +Here is an example of a \(dqpush\(dq model lexer that simulates reading packets from a +socket. The lexer loops until it encounters the end of input and returns to the +calling function. The calling function provides more input by \(dqsending\(dq the next +packet and resumes lexing. This process stops when all the packets have been +sent, or when there is an error. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT \-f + +const fs = require(\(aqfs\(aq) + +// Use a small buffer to cover the case when a lexeme doesn\(aqt fit. +// In real world use a larger buffer. +const BUFSIZE = 10 +const DEBUG = false +const END = 0 +const READY = 1 +const WAITING = 2 +const BIG_PACKET = 3 +const BAD_PACKET = 4 + +function log() { + if (DEBUG) console.log.apply(console, arguments) +} + +function fill(st) { + // Error: lexeme too long. In real life could reallocate a larger buffer. + if (st.token < 1) return BIG_PACKET + + // Shift buffer contents (discard everything up to the current token). + st.yyinput.copy(st.yyinput, 0, st.token, st.yylimit) + st.yycursor \-= st.token; + st.yymarker \-= st.token; + st.yylimit \-= st.token; + st.token = 0; + + // Read a new chunk of data from file and append it to \(gayyinput\(ga. 
+ let want = BUFSIZE \- st.yylimit \- 1 // \-1 for sentinel + let nread = fs.readSync(st.file, st.yyinput, st.yylimit, want) + st.yylimit += nread + st.yyinput.writeUInt8(0, st.yylimit) // sentinel + + return READY +} + +function lex(yyrecord) { + loop: while (true) { + yyrecord.token = yyrecord.yycursor + /*!re2c + re2c:api = record; + re2c:YYPEEK = \(dqreadUInt8\(dq; + re2c:YYFILL = \(dqreturn WAITING\(dq; + re2c:eof = 0; + + packet = [a\-z]+[;]; + + * { return BAD_PACKET } + $ { return END } + packet { yyrecord.received += 1; continue loop } + */ + } +} + +function test(packets, expect) { + // Emulate a \(dqpipe\(dq by opening the same file for reading and writing. + let fname = \(dqinput\(dq + let fw = fs.openSync(fname, \(aqw\(aq); + let fr = fs.openSync(fname, \(aqr\(aq); + + // Init lexer state. + let limit = BUFSIZE \- 1 // exclude terminating null + let st = { + file: fr, + yyinput: Buffer.alloc(BUFSIZE), + yylimit: limit, + yycursor: limit, + yymarker: limit, + token: limit, + yystate: \-1, + received: 0 + } + + // Main loop. The buffer contains incomplete data which appears packet by + // packet. When the lexer needs more input it saves its internal state and + // returns to the caller which should provide more input and resume lexing. + let send = 0 + let status + loop: while (true) { + status = lex(st) + + if (status == END) { + log(\(dqdone: got\(dq, st.received, \(dqpackets\(dq) + break loop + } else if (status == WAITING) { + log(\(dqwaiting...\(dq); + + if (send < packets.length) { + log(\(dqsent packet\(dq, send, packets[send]) + fs.writeFileSync(fw, packets[send]) + send += 1 + } + + status = fill(st) + log(\(dqqueue:\(dq, st.yyinput.toString()) + if (status == BIG_PACKET) { + log(\(dqerror: packet too big\(dq) + break loop + } + + if (status != READY) throw \(dqexpected READY\(dq + } else { + if (status != BAD_PACKET) throw \(dqexpected BAD_PACKET\(dq + log(\(dqerror: ill\-formed packet\(dq) + break loop + } + } + + // Check results. 
+ if (status != expect) throw \(dqunexpected status\(dq + if (status == END && st.received != send) \(dqunexpected packet count\(dq + + // Cleanup. + fs.unlinkSync(fname, function(err){ if (err) throw err; }) +} + +function main() { + test([], END) + test([\(dqzero;\(dq, \(dqone;\(dq, \(dqtwo;\(dq, \(dqthree;\(dq, \(dqfour;\(dq], END) + test([\(dqzer0;\(dq], BAD_PACKET) + test([\(dqgoooooooooogle;\(dq], BIG_PACKET) +} + +main() + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Reusable blocks +.sp +Reusable blocks of the form \fB/*!rules:re2c[:<name>] ... */\fP or +\fB%{rules[:<name>] ... %}\fP can be reused any number of times and combined with +other re2js blocks. The \fB<name>\fP is optional. A rules block can be used in a +\fBuse\fP block or directive. The code for a rules block is generated at every +point of use. +.sp +Use blocks are defined with \fB/*!use:re2c[:<name>] ... */\fP or +\fB%{use[:<name>] ... %}\fP\&. The \fB<name>\fP is optional: if it\(aqs not specified, +the associated rules block is the most recent one (whether named or unnamed). +A use block can add named definitions, configurations and rules of its own. +An important use case for use blocks is a lexer that supports multiple input +encodings: the same rules block is reused multiple times with encoding\-specific +configurations (see the example below). +.sp +In\-block use directive \fB!use:<name>;\fP can be used from inside of a re2js +block. It merges the referenced block \fB<name>\fP into the current one. If some +of the merged rules and configurations overlap with the previously defined ones, +conflicts are resolved in the usual way: the earliest rule takes priority, and +latest configuration overrides preceding ones. One exception are the special +rules \fB*\fP, \fB$\fP and (in condition mode) \fB<!>\fP, for which a block\-local +definition overrides any inherited ones. Use directive allows one to combine +different re2js blocks together in one block (see the example below).
+.sp +Named blocks and in\-block use directive were added in re2js version 2.2. +Since that version reusable blocks are allowed by default (no special option +is needed). Before version 2.2 reuse mode was enabled with \fB\-r \-\-reusable\fP +option. Before version 1.2 reusable blocks could not be mixed with normal +blocks. +.SS Example of a \fB!use\fP directive +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +// This example shows how to combine reusable re2c blocks: two blocks +// (\(aqcolors\(aq and \(aqfish\(aq) are merged into one. The \(aqsalmon\(aq rule occurs +// in both blocks; the \(aqfish\(aq block takes priority because it is used +// earlier. Default rule * occurs in all three blocks; the local (not +// inherited) definition takes priority. + +const COLOR = 1 +const FISH = 2 +const DUNNO = 3 + +/*!rules:re2c:colors + * { throw \(dqah\(dq } + \(dqred\(dq | \(dqsalmon\(dq | \(dqmagenta\(dq { return COLOR } +*/ + +/*!rules:re2c:fish + * { throw \(dqoh\(dq } + \(dqhaddock\(dq | \(dqsalmon\(dq | \(dqeel\(dq { return FISH } +*/ + +function lex(yyinput) { + let yycursor = 0 + /*!re2c + re2c:yyfill:enable = 0; + + !use:fish; + !use:colors; + * { return DUNNO } // overrides inherited \(aq*\(aq rules + */ +} + +function test(s, n) { if (lex(s) != n) throw \(dqerror!\(dq; } + +test(\(dqsalmon\(dq, FISH) +test(\(dqwhat?\(dq, DUNNO) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Example of a \fB/*!use:re2c ... */\fP block +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT \-\-input\-encoding utf8 + +// This example supports multiple input encodings: UTF\-8 and UTF\-32. +// Both lexers are generated from the same rules block, and the use +// blocks add only encoding\-specific configurations. 
+/*!rules:re2c + re2c:yyfill:enable = 0; + re2c:YYPEEK = \(dqat\(dq; + + \(dq∀x ∃y\(dq { return yycursor } + * { return null } +*/ + +function lex_utf8(yyinput) { + let yycursor = 0 + /*!use:re2c + re2c:encoding:utf8 = 1; + */ +} + +function lex_utf32(yyinput) { + let yycursor = 0 + /*!use:re2c + re2c:encoding:utf32 = 1; + */ +} + +function test(f, s) { + if (f(s) != s.length) throw \(dqerror!\(dq +} + +test(lex_utf8, [0xe2, 0x88, 0x80, 0x78, 0x20, 0xe2, 0x88, 0x83, 0x79]) +test(lex_utf32, [0x2200, 0x78, 0x20, 0x2203, 0x79]) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Submatch extraction +.sp +re2js has two options for submatch extraction. +.INDENT 0.0 +.TP +.B \fBTags\fP +The first option is to use standalone \fItags\fP of the form \fB@stag\fP or +\fB#mtag\fP, where \fBstag\fP and \fBmtag\fP are arbitrary user\-defined names. +Tags are enabled with \fB\-T \-\-tags\fP option or \fBre2c:tags = 1\fP +configuration. Semantically tags are position markers: they can be +inserted anywhere in a regular expression, and they bind to the +corresponding position (or multiple positions) in the input string. +\fIS\-tags\fP bind to the last matching position, and \fIm\-tags\fP bind to a list of +positions (they may be used in repetition subexpressions, where a single +position in a regular expression corresponds to multiple positions in the +input string). All tags should be defined by the user, either manually or +with the help of \fBsvars\fP and \fBmvars\fP blocks. If there is more than one +way tags can be matched against the input, ambiguity is resolved using +leftmost greedy disambiguation strategy. +.TP +.B \fBCaptures\fP +The second option is to use \fIcapturing groups\fP\&. They are enabled with +\fB\-\-captures\fP option or \fBre2c:captures = 1\fP configuration. There are two +flavours for different disambiguation policies: \fB\-\-leftmost\-captures\fP +(the default) is for leftmost greedy policy, and \fB\-\-posix\-captures\fP is +for POSIX longest\-match policy.
In this mode all parenthesized +subexpressions are considered capturing groups, and a bang can be used to +mark non\-capturing groups: \fB(! ... )\fP\&. With \fB\-\-invert\-captures\fP option or +\fBre2c:invert\-captures = 1\fP configuration the meaning of bang is inverted. +The number of groups for the matching rule is stored in a variable +\fByynmatch\fP (the whole regular expression is group number zero), and +submatch results are stored in \fByypmatch\fP array. Both \fByynmatch\fP and +\fByypmatch\fP should be defined by the user, and \fByypmatch\fP size must be at +least \fB[yynmatch * 2]\fP\&. Use \fBmaxnmatch\fP block to define \fBYYMAXNMATCH\fP, +a constant equal to the maximum value of \fByynmatch\fP among all rules. +.TP +.B \fBCaptvars\fP +Another way to use capturing groups is the \fB\-\-captvars\fP option or +\fBre2c:captvars = 1\fP configuration. The only difference with \fB\-\-captures\fP +is in the way the generated code stores submatch results: instead of +\fByynmatch\fP and \fByypmatch\fP re2js generates variables \fByytl<k>\fP and +\fByytr<k>\fP for \fIk\fP\-th capturing group (the user should declare these using +an \fBsvars\fP block). Captures with variables support two disambiguation +policies: \fB\-\-leftmost\-captvars\fP or \fBre2c:leftmost\-captvars = 1\fP for +leftmost greedy policy (the default one) and \fB\-\-posix\-captvars\fP or +\fBre2c:posix\-captvars\fP for POSIX longest\-match policy. +.UNINDENT +.sp +Under the hood all these options translate into tags and +\fI\%Tagged Deterministic Finite Automata with Lookahead\fP\&. +The core idea of TDFA is to minimize the overhead on submatch extraction. +In the extreme, if there\(aqre no tags or captures in a regular expression, TDFA is +just an ordinary DFA. If the number of tags is moderate, the overhead is barely +noticeable.
The generated TDFA uses a number of \fItag variables\fP which do not map +directly to tags: a single variable may be used for different tags, and a tag +may require multiple variables to hold all its possible values. Eventually +ambiguity is resolved, and only one final variable per tag survives. Tag +variables should be defined using \fBstags\fP or \fBmtags\fP blocks. If lexer state +is stored, tag variables should be part of it. They also need to be updated by +\fBYYFILL\fP\&. +.sp +S\-tags support the following operations: +.INDENT 0.0 +.IP \(bu 2 +save input position to an s\-tag: \fBt = YYCURSOR\fP with C pointer API or a +user\-defined operation \fBYYSTAGP(t)\fP with generic API +.IP \(bu 2 +save default value to an s\-tag: \fBt = NULL\fP with C pointer API or a +user\-defined operation \fBYYSTAGN(t)\fP with generic API +.IP \(bu 2 +copy one s\-tag to another: \fBt1 = t2\fP +.UNINDENT +.sp +M\-tags support the following operations: +.INDENT 0.0 +.IP \(bu 2 +append input position to an m\-tag: a user\-defined operation \fBYYMTAGP(t)\fP +with both default and generic API +.IP \(bu 2 +append default value to an m\-tag: a user\-defined operation \fBYYMTAGN(t)\fP +with both default and generic API +.IP \(bu 2 +copy one m\-tag to another: \fBt1 = t2\fP +.UNINDENT +.sp +S\-tags can be implemented as scalar values (pointers or offsets). M\-tags need a +more complex representation, as they need to store a sequence of tag values. The +most naive and inefficient representation of an m\-tag is a list (array, vector) +of tag values; a more efficient representation is to store all m\-tags in a +prefix\-tree represented as array of nodes \fB(v, p)\fP, where \fBv\fP is tag value +and \fBp\fP is a pointer to parent node. +.sp +Here is a simple example of using s\-tags to parse semantic versions consisting +of three numeric components: major, minor, patch (the latter is optional). +See below for a more complex example that uses \fBYYFILL\fP\&. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +const assert = require(\(aqassert\(aq); + +function parse(yyinput) { + let yycursor = 0 + + // Final tag variables available in semantic action. + /*!svars:re2c format = \(dqlet @@\en\(dq; */ + + // Intermediate tag variables used by the lexer (must be autogenerated). + /*!stags:re2c format = \(dqlet @@\en\(dq; */ + + /*!re2c + re2c:yyfill:enable = 0; + re2c:tags = 1; + + num = [0\-9]+; + + @t1 num @t2 \(dq.\(dq @t3 num @t4 (\(dq.\(dq @t5 num)? [\ex00] { + return { + major: Number(yyinput.substring(t1, t2)), + minor: Number(yyinput.substring(t3, t4)), + patch: t5 == \-1 ? 0 : Number(yyinput.substring(t5, yycursor \- 1)) + } + } + * { return null } + */ +} + +assert.deepEqual(parse(\(dq23.34\e0\(dq), {major: 23, minor: 34, patch: 0}) +assert.deepEqual(parse(\(dq1.2.99999\e0\(dq), {major: 1, minor: 2, patch: 99999}) +assert.deepEqual(parse(\(dq1.a\e0\(dq), null) + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is a more complex example of using s\-tags with \fBYYFILL\fP to parse a file +with newline\-separated semantic versions. Tag variables are part of the lexer +state, and they are adjusted in \fBYYFILL\fP like other input positions. +Note that it is necessary for s\-tags because their values are invalidated after +shifting buffer contents. It may not be necessary in a custom implementation +where tag variables store offsets relative to the start of the input string +rather than the buffer, which may be the case with m\-tags. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +const assert = require(\(aqassert\(aq); +const fs = require(\(aqfs\(aq) + +const BUFSIZE = 4096 +const OK = 0 +const EOF = 1 +const LONG_LEXEME = 2 + +function fill(st) { + if (st.eof) return EOF + + // Error: lexeme too long. In real life could reallocate a larger buffer. + if (st.token < 1) return LONG_LEXEME + + // Shift buffer contents (discard everything up to the current token). 
+ st.yyinput.copy(st.yyinput, 0, st.token, st.yylimit) + st.yycursor \-= st.token; + st.yymarker \-= st.token; + st.yylimit \-= st.token; + /*!stags:re2c format = \(dqif (st.@@ != \-1) st.@@ \-= st.token\en\(dq; */ + st.token = 0; + + // Read a new chunk of data from file and append it to \(gayyinput\(ga. + let want = BUFSIZE \- st.yylimit \- 1 // \-1 for sentinel + let nread = fs.readSync(st.file, st.yyinput, st.yylimit, want) + st.eof = nread < want // end of file? + st.yylimit += nread + st.yyinput.writeUInt8(0, st.yylimit) // sentinel + + return OK +} + +function lex(st) { + let vers = [] + loop: while (true) { + st.token = st.yycursor + + // Final tag variables available in semantic action. + /*!svars:re2c format = \(dqlet @@\en\(dq; */ + + /*!re2c + re2c:api = record; + re2c:yyrecord = st; + re2c:YYPEEK = \(dqreadUInt8\(dq; + re2c:YYFILL = \(dqfill(st) == OK\(dq; + re2c:eof = 0; + re2c:tags = 1; + + num = [0\-9]+; + + num @t1 \(dq.\(dq @t2 num @t3 (\(dq.\(dq @t4 num)? [\en] { + vers.push({ + major: Number(st.yyinput.subarray(st.token, t1)), + minor: Number(st.yyinput.subarray(t2, t3)), + patch: t4 == \-1 ? 0 : Number(st.yyinput.subarray(t4, st.yycursor \- 1)) + }) + continue loop + } + $ { return vers } + * { return null } + */ + } +} + +function main() { + let fname = \(dqinput\(dq + + // Create input file. + let content = \(dq1.22.333\en\(dq.repeat(BUFSIZE) + fs.writeFileSync(fname, content, function(err) { if (err) throw err; }) + + // Init lexer state. + let limit = BUFSIZE \- 1 // exclude terminating null + let st = { + file: fs.openSync(fname, \(aqr\(aq), + yyinput: Buffer.alloc(BUFSIZE), + yylimit: limit, + yycursor: limit, + yymarker: limit, + token: limit, + // Intermediate tag variables used by the lexer (must be autogenerated). + /*!stags:re2c format = \(dq@@: \-1,\en\(dq; */ + eof: false + } + + // Run lexer on the prepared file. + assert.deepEqual(lex(st), Array(BUFSIZE).fill({major: 1, minor: 22, patch: 333})) + + // Cleanup. 
+ fs.unlink(fname, function(err){ if (err) throw err; }) +} + +main() + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of using capturing groups to parse semantic versions. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +const assert = require(\(aqassert\(aq); + +function parse(yyinput) { + let yycursor = 0 + + // Final tag variables available in semantic action. + /*!svars:re2c format = \(dqlet @@\en\(dq; */ + + // Intermediate tag variables used by the lexer (must be autogenerated). + /*!stags:re2c format = \(dqlet @@\en\(dq; */ + + /*!re2c + re2c:yyfill:enable = 0; + re2c:captvars = 1; + + num = [0\-9]+; + + (num) \(dq.\(dq (num) (\(dq.\(dq num)? [\ex00] { + return { + major: Number(yyinput.substring(yytl1, yytr1)), + minor: Number(yyinput.substring(yytl2, yytr2)), + patch: yytl3 == \-1 ? 0 : Number(yyinput.substring(yytl3 + 1, yytr3)) + } + } + * { return null } + */ +} + +assert.deepEqual(parse(\(dq23.34\e0\(dq), {major: 23, minor: 34, patch: 0}) +assert.deepEqual(parse(\(dq1.2.99999\e0\(dq), {major: 1, minor: 2, patch: 99999}) +assert.deepEqual(parse(\(dq1.a\e0\(dq), null) + +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Here is an example of using m\-tags to parse a version with a variable number of +components. Tag variables are stored in a trie. +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +const assert = require(\(aqassert\(aq) + +function parse(yyinput) { + let yycursor = 0 + + // Final tag variables available in semantic action. + /*!svars:re2c format = \(dqlet @@\en\(dq; */ + /*!mvars:re2c format = \(dqlet @@\en\(dq; */ + + // Intermediate tag variables used by the lexer (must be autogenerated). 
+ /*!stags:re2c format = \(dqlet @@\en\(dq; */ + /*!mtags:re2c format = \(dqlet @@ = []\en\(dq; */ + + /*!re2c + re2c:YYMTAGP = \(dq@@.push(yycursor)\(dq; + re2c:YYMTAGN = \(dq\(dq; // do nothing + re2c:yyfill:enable = 0; + re2c:tags = 1; + + num = [0\-9]+; + + @t1 num @t2 (\(dq.\(dq #t3 num #t4)* [\ex00] { + let vers = [Number(yyinput.substring(t1, t2))] + for (let i = 0; i < t3.length; ++i) { + vers.push(Number(yyinput.substring(t3[i], t4[i]))) + } + return vers + } + * { return null } + */ +} + +assert.deepEqual(parse(\(dq1\e0\(dq), [1]) +assert.deepEqual(parse(\(dq1.2.3.4.5.6.7\e0\(dq), [1, 2, 3, 4, 5, 6, 7]) +assert.deepEqual(parse(\(dq1.2.\e0\(dq), null) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Encoding support +.sp +It is necessary to understand the difference between \fBcode points\fP and +\fBcode units\fP\&. A code point is a numeric identifier of a symbol. A code unit is +the smallest unit of storage in the encoded text. A single code point may be +represented with one or more code units. In a fixed\-length encoding all code +points are represented with the same number of code units. In a variable\-length +encoding code points may be represented with a different number of code units. +Note that the \(dqany\(dq rule \fB[^]\fP matches any code point, but not necessarily +any code unit (the only way to match any code unit regardless of the encoding +is the default rule \fB*\fP). +The generated lexer works with a stream of code units: \fByych\fP stores a code +unit, and \fBYYCTYPE\fP is the code unit type. Regular expressions, on the other +hand, are specified in terms of code points. When re2js compiles regular +expressions to automata it translates code points to code units. This is +generally not a simple mapping: in variable\-length encodings a single code point +range may get translated to a complex code unit graph. +The following encodings are supported: +.INDENT 0.0 +.IP \(bu 2 +\fBASCII\fP (enabled by default). 
It is a fixed\-length encoding with code space +\fB[0\-255]\fP and 1\-byte code points and code units. +.IP \(bu 2 +\fBEBCDIC\fP (enabled with \fB\-\-ebcdic\fP or \fBre2c:encoding:ebcdic\fP). It is a +fixed\-length encoding with code space \fB[0\-255]\fP and 1\-byte code points and +code units. +.IP \(bu 2 +\fBUCS2\fP (enabled with \fB\-\-ucs2\fP or \fBre2c:encoding:ucs2\fP). It is a +fixed\-length encoding with code space \fB[0\-0xFFFF]\fP and 2\-byte code points +and code units. +.IP \(bu 2 +\fBUTF8\fP (enabled with \fB\-\-utf8\fP or \fBre2c:encoding:utf8\fP). It is a +variable\-length Unicode encoding. Code unit size is 1 byte. Code points are +represented with 1 \-\- 4 code units. +.IP \(bu 2 +\fBUTF16\fP (enabled with \fB\-\-utf16\fP or \fBre2c:encoding:utf16\fP). It is a +variable\-length Unicode encoding. Code unit size is 2 bytes. Code points are +represented with 1 \-\- 2 code units. +.IP \(bu 2 +\fBUTF32\fP (enabled with \fB\-\-utf32\fP or \fBre2c:encoding:utf32\fP). It is a +fixed\-length Unicode encoding with code space \fB[0\-0x10FFFF]\fP and 4\-byte code +points and code units. +.UNINDENT +.sp +Include file \fBinclude/unicode_categories.re\fP provides re2js definitions for the +standard Unicode categories. +.sp +Option \fB\-\-input\-encoding\fP specifies source file encoding, which can be used to +enable Unicode literals in regular expressions. For example +\fB\-\-input\-encoding utf8\fP tells re2js that the source file is in UTF8 (it differs +from \fB\-\-utf8\fP which sets input text encoding). Option \fB\-\-encoding\-policy\fP +specifies the way re2js handles Unicode surrogates (code points in range +\fB[0xD800\-0xDFFF]\fP). +.sp +Below is an example of a lexer for UTF8 encoded Unicode identifiers. 
+.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT \-\-utf8 \-s + +/*!include:re2c \(dqunicode_categories.re\(dq */ + +function lex(yyinput) { + let yycursor = 0 + /*!re2c + re2c:yyfill:enable = 0; + + // Simplified \(dqUnicode Identifier and Pattern Syntax\(dq + // (see https://unicode.org/reports/tr31) + id_start = L | Nl | [$_]; + id_continue = id_start | Mn | Mc | Nd | Pc | [\eu200D\eu05F3]; + identifier = id_start id_continue*; + + identifier { return true } + * { return false } + */ +} + +if (!lex(\(dq_Ыдентификатор\e0\(dq)) throw \(dqerror!\(dq + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Include files +.sp +re2js allows one to include other files using a block of the form +\fB/*!include:re2c FILE */\fP or \fB%{include FILE %}\fP, or an in\-block directive +\fB!include FILE ;\fP, where \fBFILE\fP is a path to the file to be included. +re2js looks for include files in the directory of the including file and in +include locations, which can be specified with the \fB\-I\fP option. Include +blocks/directives in re2js work in the same way as C/C++ \fB#include\fP: \fBFILE\fP +contents are copy\-pasted verbatim in place of the block/directive. Include files +may have further includes of their own. Use \fB\-\-depfile\fP option to track build +dependencies of the output file on include files. +re2js provides some predefined include files that can be found in the +\fBinclude/\fP subdirectory of the project. These files contain definitions that +may be useful to other projects (such as Unicode categories) and form something +like a standard library for re2js\&. Below is an example of using include files. 
+.SS Include file 1 (definitions.js) +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +const INT = 1 +const FLOAT = 2 +const NAN = 3 + +/*!re2c + number = [1\-9][0\-9]*; +*/ + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Include file 2 (extra_rules.re.inc) +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// floating\-point numbers +frac = [0\-9]* \(dq.\(dq [0\-9]+ | [0\-9]+ \(dq.\(dq; +exp = \(aqe\(aq [+\-]? [0\-9]+; +float = frac exp? | [0\-9]+ exp; + +float { return FLOAT } + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Input file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT + +/*!include:re2c \(dqdefinitions.js\(dq */ + +function lex(yyinput) { + let yycursor = 0 + /*!re2c + re2c:yyfill:enable = 0; + + * { return NAN } + number { return INT } + !include \(dqextra_rules.re.inc\(dq; + */ +} + +function test(s, n) { + if (lex(s) != n) throw \(dqerror!\(dq +} + +test(\(dq123\e0\(dq, INT) +test(\(dq123.4567\e0\(dq, FLOAT) + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Header files +.sp +re2js allows one to generate a header file from the input \fB\&.re\fP file using +\fB\-\-header\fP option or \fBre2c:header\fP configuration and block pairs of the form +\fB/*!header:re2c:on*/\fP and \fB/*!header:re2c:off*/\fP, or \fB%{header:on%}\fP and +\fB%{header:off%}\fP\&. The first block marks the beginning of the header file, and the +second block marks the end of it. Everything between these blocks is processed by +re2js, and the generated code is written to the file specified with \fB\-\-header\fP +option or \fBre2c:header\fP configuration (or \fBstdout\fP if neither option nor +configuration is used). An autogenerated header file may be needed in cases when +re2js is used to generate definitions that must be visible from other +translation units. +.sp +Here is an example of generating a header file that contains a definition of the +lexer state with tag variables (the number of variables depends on the regular +grammar and is unknown to the programmer).
+.SS Input file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// re2js $INPUT \-o $OUTPUT \-\-header lexer/state.js + +let state = require(\(aq./lexer/state.js\(aq); + +/*!header:re2c:on*/ +exports.mk_state = function(str) { + return { + yyinput: str, + /*!stags:re2c format = \(dq@@: 0,\en\(dq; */ + yycursor: 0 + } +} +/*!header:re2c:off*/ + +function lex(yyrecord) { + let t + /*!re2c + re2c:api = record; + re2c:tags = 1; + re2c:yyfill:enable = 0; + re2c:header = \(dqlexer/state.js\(dq; + + [a]* @t [b]* { return t } + */ +} + +if (lex(state.mk_state(\(dqab\e0\(dq)) != 1) { + throw \(dqerror!\(dq +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Header file +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +// Generated by re2c + +exports.mk_state = function(str) { + return { + yyinput: str, + yyt1: 0, + + yycursor: 0 + } +} + +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Skeleton programs +.sp +With the \fB\-S, \-\-skeleton\fP option, re2js ignores all non\-re2js code and +generates a self\-contained C program that can be further compiled and executed. +The program consists of lexer code and input data. For each constructed DFA +(block or condition) re2js generates a standalone lexer and two files: an +\fB\&.input\fP file with strings derived from the DFA and a \fB\&.keys\fP file with +expected match results. The program runs each lexer on the corresponding +\fB\&.input\fP file and compares results with the expectations. +Skeleton programs are very useful for a number of reasons: +.INDENT 0.0 +.IP \(bu 2 +They can check correctness of various re2js optimizations (the data is +generated early in the process, before any DFA transformations have taken +place). +.IP \(bu 2 +Generating a set of input data with good coverage may be useful for both +testing and benchmarking. +.IP \(bu 2 +Generating self\-contained executable programs allows one to get minimized test +cases (the original code may be large or have a lot of dependencies). 
+.UNINDENT +.sp +The difficulty with generating input data is that for all but the most trivial +cases the number of possible input strings is too large (even if the string +length is limited). re2js solves this difficulty by generating sufficiently +many strings to cover almost all DFA transitions. It uses the following +algorithm. First, it constructs a skeleton of the DFA. For encodings with 1\-byte +code unit size (such as ASCII, UTF\-8 and EBCDIC) skeleton is just an exact copy +of the original DFA. For encodings with multibyte code units skeleton is a copy +of DFA with certain transitions omitted: namely, re2js takes at most 256 code +units for each disjoint continuous range that corresponds to a DFA transition. +The chosen values are evenly distributed and include range bounds. Instead of +trying to cover all possible paths in the skeleton (which is infeasible) re2js +generates sufficiently many paths to cover all skeleton transitions, and thus +trigger the corresponding conditional jumps in the lexer. +The algorithm implementation is limited by ~1Gb of transitions and consumes +constant amount of memory (re2js writes data to file as soon as it is +generated). +.SS Visualization and debug +.sp +With the \fB\-D, \-\-emit\-dot\fP option, re2js does not generate code. Instead, +it dumps the generated DFA in DOT format. +One can convert this dump to an image of the DFA using Graphviz or another library. +Note that this option shows the final DFA after it has gone through a number of +optimizations and transformations. Earlier stages can be dumped with various debug +options, such as \fB\-\-dump\-nfa\fP, \fB\-\-dump\-dfa\-raw\fP etc. (see the full list of options). +.SH SEE ALSO +.sp +You can find more information about re2c at the official website: \fI\%http://re2c.org\fP\&. +Similar programs are flex(1), lex(1), quex(\fI\%http://quex.sourceforge.net\fP). +.SH AUTHORS +.sp +re2js was originally written by Peter Bumbulis (\fI\%peter@csg.uwaterloo.ca\fP) in 1993. 
+Marcus Boerger and Dan Nuffer spent several years to turn the original idea into +a production ready code generator. Since then it has been maintained and +developed by multiple volunteers, most notably, +Brian Young (\fI\%bayoung@acm.org\fP), +\fI\%Marcus Boerger\fP, +Dan Nuffer (\fI\%nuffer@users.sourceforge.net\fP), +\fI\%Ulya Trofimovich\fP (\fI\%skvadrik@gmail.com\fP), +\fI\%Serghei Iakovlev\fP, +\fI\%Sergei Trofimovich\fP, +\fI\%Petr Skocik\fP, +\fI\%ligfx\fP +\fI\%raekye\fP +and \fI\%PolarGoose\fP\&. +.\" Generated by docutils manpage writer. +. diff --git a/bootstrap/src/default_syntax_php.h b/bootstrap/src/default_syntax_php.h new file mode 100644 index 000000000..f9d38a07a --- /dev/null +++ b/bootstrap/src/default_syntax_php.h @@ -0,0 +1,382 @@ +static constexpr const char* DEFAULT_SYNTAX_PHP = + "// supported feature lists -----------------------------------------------------\n" + "\n" + "supported_apis = [\"simple\", \"generic\", \"record\"];\n" + "supported_api_styles = [\"functions\", \"free-form\"];\n" + "supported_code_models = [\"goto-label\", \"loop-switch\", \"recursive-functions\"];\n" + "supported_targets = [\"code\", \"dot\", \"skeleton\"];\n" + "supported_features = [\"nested-ifs\", \"bitmaps\", \"computed-gotos\", \"case-ranges\",\n" + " \"tags\", \"captures\", \"captvars\"];\n" + "\n" + "\n" + "// language-specific options ---------------------------------------------------\n" + "\n" + "semicolons = 1;\n" + "backtick_quoted_strings = 0;\n" + "single_quoted_strings = 1;\n" + "indentation_sensitive = 0;\n" + "wrap_blocks_in_braces = 0;\n" + "\n" + "\n" + "// immutable configurations (command-line only options) ------------------------\n" + "\n" + "re2c:target = code;\n" + "re2c:code-model = loop-switch;\n" + "re2c:input-encoding = ascii;\n" + "re2c:date = 1;\n" + "re2c:version = 1;\n" + "re2c:conditions = 0;\n" + "re2c:storable-state = 0;\n" + "re2c:flex-syntax = 0;\n" + "re2c:verbose = 0;\n" + "re2c:line-dirs = 0;\n" + "\n" + "\n" + "// 
mutable configurations ------------------------------------------------------\n" + "\n" + "re2c:api = simple;\n" + "re2c:api:style = functions;\n" + "re2c:api:sigil = \"@@\";\n" + "re2c:YYGETCOND:naked = 0;\n" + "re2c:YYSETCOND:naked = 0;\n" + "re2c:YYSETCOND@cond = \"@@\";\n" + "re2c:YYGETSTATE:naked = 0;\n" + "re2c:YYSETSTATE:naked = 0;\n" + "re2c:YYSETSTATE@state = \"@@\";\n" + "re2c:YYFILL@len = \"@@\";\n" + "re2c:YYFILL:naked = 0;\n" + "re2c:YYFN = [\";\"];\n" + "re2c:yyfn:sep = \";\";\n" + "re2c:yycond = \"$yycond\";\n" + "re2c:yyctable = \"$yyctable\";\n" + "re2c:yyaccept = \"$yyaccept\";\n" + "re2c:yytarget = \"$yytarget\";\n" + "re2c:yystate = \"$yystate\";\n" + "re2c:yynmatch = \"$yynmatch\";\n" + "re2c:yypmatch = \"$yypmatch\";\n" + "re2c:yyrecord = \"$yyrecord\";\n" + "re2c:yych = \"$yych\";\n" + "re2c:yych:conversion = 0;\n" + "re2c:yych:literals = char-or-hex;\n" + "re2c:yych:emit = 1;\n" + "re2c:yybm = \"yybm\";\n" + "re2c:yybm:hex = 0;\n" + "re2c:yyfill = \"\";\n" + "re2c:yystable = \"\"; // deprecated\n" + "re2c:header = \"\";\n" + "re2c:eof = -1;\n" + "re2c:sentinel = -1;\n" + "re2c:yyfill:enable = 1;\n" + "re2c:yyfill:parameter = 1;\n" + "re2c:yyfill:check = 1;\n" + "re2c:tags = 0;\n" + "re2c:tags:prefix = \"yyt\";\n" + "re2c:captures = 0;\n" + "re2c:captvars = 0;\n" + "re2c:posix-captures = 0;\n" + "re2c:posix-captvars = 0;\n" + "re2c:invert-captures = 0;\n" + "re2c:cond:abort = 0;\n" + "re2c:cond:prefix = \"yyc_\";\n" + "re2c:cond:enumprefix = \"YYC_\";\n" + "re2c:cond:divider@cond = \"@@\";\n" + "re2c:cond:goto@cond = \"@@\";\n" + "re2c:state:abort = 1;\n" + "re2c:state:nextlabel = 0;\n" + "re2c:bit-vectors = 0;\n" + "re2c:debug-output = 0;\n" + "re2c:computed-gotos = 0;\n" + "re2c:computed-gotos:threshold = 9;\n" + "re2c:nested-ifs = 0;\n" + "re2c:case-insensitive = 0;\n" + "re2c:case-inverted = 0;\n" + "re2c:case-ranges = 0;\n" + "re2c:unsafe = 0;\n" + "re2c:monadic = 0;\n" + "re2c:encoding:ebcdic = 0;\n" + "re2c:encoding:utf32 = 0;\n" + 
"re2c:encoding:ucs2 = 0;\n" + "re2c:encoding:utf16 = 0;\n" + "re2c:encoding:utf8 = 0;\n" + "re2c:encoding-policy = ignore;\n" + "re2c:empty-class = match-empty;\n" + "re2c:indent:string = \" \";\n" + "re2c:indent:top = 0;\n" + "re2c:label:prefix = \"\";\n" + "re2c:label:yyfill = \"\";\n" + "re2c:label:yyloop = \"yyl\";\n" + "re2c:label:yyNext = \"\";\n" + "re2c:label:start = 0;\n" + "\n" + "\n" + "// mutable code configuration --------------------------------------------------\n" + "\n" + "re2c:YYBACKUP = \"$yybackup\";\n" + "re2c:YYBACKUPCTX = \"$yybackupctx\";\n" + "re2c:YYCONDTYPE = \"$YYCond\";\n" + "re2c:YYCOPYMTAG = sigil \"{lhs} = \" sigil \"{rhs}\";\n" + "re2c:YYCOPYSTAG = sigil \"{lhs} = \" sigil \"{rhs}\";\n" + "re2c:YYCTYPE = \"$YYChar\";\n" + "re2c:YYCTXMARKER = (.api.record ? yyrecord \"->yyctxmarker\" : \"$yyctxmarker\");\n" + "re2c:YYCURSOR = (.api.record ? yyrecord \"->yycursor\" : \"$yycursor\");\n" + "re2c:YYDEBUG = \"yydebug\";\n" + "re2c:YYFILL = \"YYFILL\";\n" + "re2c:YYGETACCEPT = sigil \"{var}\";\n" + "re2c:YYGETCOND = \"yygetcond\";\n" + "re2c:YYGETSTATE = \"yygetstate\";\n" + "re2c:YYINPUT = (.api.record ? yyrecord \"->yyinput\" : \"$yyinput\");\n" + "re2c:YYLESSTHAN = \"yylessthan\";\n" + "re2c:YYLIMIT = (.api.record ? yyrecord \"->yylimit\" : \"$yylimit\");\n" + "re2c:YYMARKER = (.api.record ? yyrecord \"->yymarker\" : \"$yymarker\");\n" + "re2c:YYMAXFILL = \"YYMAXFILL\";\n" + "re2c:YYMAXNMATCH = \"$YYMAXNMATCH\";\n" + "re2c:YYMTAGN = \"$yymtagn\";\n" + "re2c:YYMTAGP = \"$yymtagp\";\n" + "re2c:YYPEEK = (.api.generic ? 
\"$yypeek\" : \"\");\n" + "re2c:YYRESTORE = \"$yyrestore\";\n" + "re2c:YYRESTORECTX = \"$yyrestorectx\";\n" + "re2c:YYRESTORETAG = \"$yyrestoretag\";\n" + "re2c:YYSETACCEPT = sigil \"{var} = \" sigil \"{val}\";\n" + "re2c:YYSETCOND = \"$yysetcond\";\n" + "re2c:YYSETSTATE = \"$yysetstate\";\n" + "re2c:YYSHIFT = \"$yyshift\";\n" + "re2c:YYSHIFTSTAG = \"$yyshiftstag\";\n" + "re2c:YYSHIFTMTAG = \"$yyshiftmtag\";\n" + "re2c:YYSKIP = \"$yyskip\";\n" + "re2c:YYSTAGN = \"$yystagn\";\n" + "re2c:YYSTAGP = \"$yystagp\";\n" + "re2c:tags:expression = (.api.record ? yyrecord \"->\") sigil;\n" + "re2c:tags:negative = (.api.generic ? \"@@\" : \"-1\");\n" + "re2c:cond:divider = \"\";\n" + "re2c:cond:goto = \"goto \" sigil \";\";\n" + "\n" + "\n" + "// code templates --------------------------------------------------------------\n" + "\n" + "code:var_local = topindent name \" = \" init \";\" nl;\n" + "code:var_global = code:var_local;\n" + "\n" + "code:const_local = topindent \"const \" name \" = \" init \";\" nl;\n" + "code:const_global = code:const_local;\n" + "\n" + "code:array_local =\n" + " topindent \"const \" name \" = [\" nl indent\n" + " [row: topindent [elem{0:-2}: elem \", \"] [elem{-1}: elem \",\"] nl]\n" + " dedent topindent \"];\" nl;\n" + "\n" + "code:array_global = ;\n" + "\n" + "code:array_elem = array \"[\" index \"]\";\n" + "\n" + "code:enum = [elem: topindent \"const \" elem \" = \" init \";\" nl];\n" + "\n" + "code:enum_elem = name;\n" + "\n" + "code:assign = topindent lhs \" = \" rhs \";\" nl;\n" + "\n" + "code:type_int = \"int\";\n" + "code:type_uint = \"int\";\n" + "code:type_yybm = \"string\";\n" + "code:type_yytarget = ;\n" + "\n" + "code:cmp_eq = \"===\";\n" + "code:cmp_ne = \"!==\";\n" + "code:cmp_lt = \"<\";\n" + "code:cmp_gt = \">\";\n" + "code:cmp_le = \"<=\";\n" + "code:cmp_ge = \">=\";\n" + "\n" + "code:if_then_else =\n" + " [branch{0}: topindent \"if (\" cond \") {\" nl\n" + " indent [stmt: stmt] dedent]\n" + " [branch{1:-1}: topindent \"} else \" 
(.cond ? \"if (\" cond \") \") \"{\" nl\n" + " indent [stmt: stmt] dedent]\n" + " topindent \"}\" nl;\n" + "\n" + "code:if_then_else_oneline = ;\n" + "\n" + "code:switch =\n" + " topindent \"switch (\" expr \") {\" nl\n" + " indent [case: case] dedent\n" + " topindent \"}\" nl;\n" + "\n" + "code:switch_cases =\n" + " [case: case nl]\n" + " indent [stmt: stmt] dedent;\n" + "\n" + "code:switch_cases_oneline = ;\n" + "\n" + "code:switch_case_range =\n" + " [val{0:-2}: topindent \"case \" val \":\" nl]\n" + " [val{-1}: topindent \"case \" val \":\"];\n" + "\n" + "code:switch_case_default =\n" + " topindent \"default:\";\n" + "\n" + "code:loop =\n" + " topindent \"while (true) {\" nl\n" + " indent [stmt: stmt] dedent\n" + " topindent \"}\" nl;\n" + "\n" + "code:continue = topindent \"break 2;\" nl;\n" + "\n" + "code:goto = topindent \"goto \" label \";\" nl;\n" + "\n" + "code:fndecl = ;\n" + "code:fndef = ;\n" + "code:fncall = ;\n" + "code:tailcall = ;\n" + "code:recursive_functions = ;\n" + "\n" + "code:line_info = ;\n" + "\n" + "code:fingerprint = \"\";\n" + "code:abort = topindent \"throw new \\\\Exception(\\\"internal lexer error\\\");\" nl;\n" + "\n" + "code:yydebug =\n" + " topindent (.api.generic\n" + " ? YYDEBUG\n" + " : (.api.record\n" + " ? YYDEBUG \"(\" yyrecord \");\"\n" + " : YYDEBUG \"(\" state \", \" yych \");\"\n" + " )) nl;\n" + "\n" + "code:yypeek =\n" + " topindent (.code_model.recursive_functions ? YYCTYPE \" \") yych \" = \" (.api.generic\n" + " ? YYPEEK\n" + " : YYPEEK YYINPUT \"[\" YYCURSOR \"];\"\n" + " ) nl;\n" + "\n" + "code:yyskip =\n" + " topindent (.api.generic\n" + " ? YYSKIP\n" + " : YYCURSOR \" += 1;\"\n" + " ) nl;\n" + "\n" + "code:yybackup =\n" + " topindent (.api.generic\n" + " ? YYBACKUP\n" + " : YYMARKER \" = \" YYCURSOR \";\"\n" + " ) nl;\n" + "\n" + "code:yybackupctx =\n" + " topindent (.api.generic\n" + " ? 
YYBACKUPCTX\n" + " : YYCTXMARKER \" = \" YYCURSOR \";\"\n" + " ) nl;\n" + "\n" + "code:yyskip_yypeek = ;\n" + "code:yypeek_yyskip = ;\n" + "code:yyskip_yybackup = ;\n" + "code:yybackup_yyskip = ;\n" + "code:yybackup_yypeek = ;\n" + "code:yyskip_yybackup_yypeek = ;\n" + "code:yybackup_yypeek_yyskip = ;\n" + "\n" + "code:yyrestore =\n" + " topindent (.api.generic\n" + " ? YYRESTORE\n" + " : YYCURSOR \" = \" YYMARKER \";\"\n" + " ) nl;\n" + "\n" + "code:yyrestorectx =\n" + " topindent (.api.generic\n" + " ? YYRESTORECTX\n" + " : YYCURSOR \" = \" YYCTXMARKER \";\"\n" + " ) nl;\n" + "\n" + "code:yyrestoretag =\n" + " topindent (.api.generic\n" + " ? YYRESTORETAG\n" + " : YYCURSOR \" = \" tag \";\"\n" + " ) nl;\n" + "\n" + "code:yyshift =\n" + " topindent (.api.generic\n" + " ? YYSHIFT\n" + " : YYCURSOR \" -= \" offset \";\"\n" + " ) nl;\n" + "\n" + "code:yyshiftstag =\n" + " topindent (.nested ? \"if (\" tag \" != \" neg \") \") (.api.generic\n" + " ? YYSHIFTSTAG\n" + " : tag \" -= \" offset \";\"\n" + " ) nl;\n" + "\n" + "code:yyshiftmtag =\n" + " topindent YYSHIFTMTAG nl;\n" + "\n" + "code:yystagp =\n" + " topindent (.api.generic\n" + " ? YYSTAGP\n" + " : tag \" = \" YYCURSOR \";\"\n" + " ) nl;\n" + "\n" + "code:yymtagp =\n" + " topindent YYMTAGP nl;\n" + "\n" + "code:yystagn =\n" + " topindent (.api.generic\n" + " ? YYSTAGN\n" + " : tag \" = \" neg \";\"\n" + " ) nl;\n" + "\n" + "code:yymtagn =\n" + " topindent YYMTAGN nl;\n" + "\n" + "code:yycopystag =\n" + " topindent (.api.generic\n" + " ? YYCOPYSTAG\n" + " : lhs \" = \" rhs \";\"\n" + " ) nl;\n" + "\n" + "code:yycopymtag =\n" + " topindent (.api.generic\n" + " ? YYCOPYMTAG\n" + " : lhs \" = \" rhs \";\"\n" + " ) nl;\n" + "\n" + "code:yygetaccept =\n" + " (.api.generic\n" + " ? YYGETACCEPT\n" + " : (.api.record & .storable_state ? yyrecord \".\") var);\n" + "\n" + "code:yysetaccept =\n" + " topindent (.api.generic\n" + " ? YYSETACCEPT\n" + " : (.api.record & .storable_state ? 
yyrecord \".\") var \" = \" val \";\"\n" + " ) nl;\n" + "\n" + "code:yygetcond =\n" + " (.api.generic\n" + " ? YYGETCOND\n" + " : (.api.record ? yyrecord \".\") var);\n" + "\n" + "code:yysetcond =\n" + " topindent (.api.generic\n" + " ? YYSETCOND\n" + " : (.api.record ? yyrecord \".\") var \" = \" val \";\"\n" + " ) nl;\n" + "\n" + "code:yygetstate =\n" + " (.api.generic\n" + " ? YYGETSTATE\n" + " : (.api.record ? yyrecord \".\") var);\n" + "\n" + "code:yysetstate =\n" + " topindent (.api.generic\n" + " ? YYSETSTATE\n" + " : (.api.record ? yyrecord \".\") var \" = \" val \";\"\n" + " ) nl;\n" + "\n" + "code:yylessthan =\n" + " (.api.generic\n" + " ? YYLESSTHAN\n" + " : (.many\n" + " ? \"(\" YYLIMIT \" - \" YYCURSOR \") < \" need\n" + " : YYLIMIT \" <= \" YYCURSOR));\n" + "\n" + "code:yybm_filter = yych \" & ~0xFF\";\n" + "\n" + "code:yybm_match = \"(\" yybm \"[\" offset \"+\" yych \"] & \" mask \") !== 0\";\n" + ; diff --git a/bootstrap/src/msg/help_re2php.cc b/bootstrap/src/msg/help_re2php.cc new file mode 100644 index 000000000..8b6eccf92 --- /dev/null +++ b/bootstrap/src/msg/help_re2php.cc @@ -0,0 +1,512 @@ +extern const char* help; +const char* help = +"USAGE\n" +"\n" +" re2js [ OPTIONS ] [ WARNINGS ] INPUT\n" +"\n" +" Input can be either a file or - for stdin.\n" +"\n" +"OPTIONS\n" +"\n" +" -? --help -h\n" +"\n" +" Show help message.\n" +"\n" +" --api \n" +"\n" +" Specify the API used by the generated code to interface with used-\n" +" defined code. Option simple shold be used in simple cases when there's\n" +" no need for buffer refilling and storing lexer state. Option record\n" +" should be used when lexer state needs to be stored in a record (struct,\n" +" class, etc.). Option generic should be used in complex cases when the\n" +" other two APIs are not flexible enough.\n" +"\n" +" --bit-vectors -b\n" +"\n" +" Optimize conditional jumps using bit masks. 
This option implies\n" +" --nested-ifs.\n" +"\n" +" --captures, --leftmost-captures\n" +"\n" +" Enable submatch extraction with leftmost greedy capturing groups. The\n" +" result is collected into an array yybmatch of capacity 2 * YYMAXNMATCH,\n" +" and yynmatch is set to the number of groups for the matching rule.\n" +"\n" +" --captvars, --leftmost-captvars\n" +"\n" +" Enable submatch extraction with leftmost greedy capturing groups. The\n" +" result is collected into variables yytl, yytr for k-th capturing\n" +" group.\n" +"\n" +" --case-insensitive\n" +"\n" +" Treat single-quoted and double-quoted strings as case-insensitive.\n" +"\n" +" --case-inverted\n" +"\n" +" Invert the meaning of single-quoted and double-quoted strings: treat\n" +" single-quoted strings as case-sensitive and double-quoted strings as\n" +" case-insensitive.\n" +"\n" +" --case-ranges\n" +"\n" +" Collapse consecutive cases in a switch statements into a range of the\n" +" form low ... high. This syntax is a C/C++ language extension that is\n" +" supported by compilers like GCC, Clang and Tcc. The main advantage over\n" +" using single cases is smaller generated code and faster generation\n" +" time, although for some compilers like Tcc it also results in smaller\n" +" binary size. This option is supported only for C.\n" +"\n" +" --computed-gotos -g\n" +"\n" +" Optimize conditional jumps using non-standard \"computed goto\" extension\n" +" (which must be supported by the compiler). re2js generates jump tables\n" +" only in complex cases with a lot of conditional branches. Complexity\n" +" threshold can be configured with cgoto:threshold configuration. This\n" +" option implies --bit-vectors. It is supported only for C.\n" +"\n" +" --conditions --start-conditions -c\n" +"\n" +" Enable support of Flex-like \"conditions\": multiple interrelated lexers\n" +" within one block. 
This is an alternative to manually specifying\n" +" different re2js blocks connected with goto or function calls.\n" +"\n" +" --depfile FILE\n" +"\n" +" Write dependency information to FILE in the form of a Makefile rule\n" +" : [include-file ...]. This allows one to\n" +" track build dependencies in the presence of include blocks/directives,\n" +" so that updating include files triggers regeneration of the output\n" +" file. This option depends on the --output option.\n" +"\n" +" --ebcdic --ecb -e\n" +"\n" +" Generate a lexer that reads input in EBCDIC encoding. re2js assumes\n" +" that the character range is 0 -- 0xFF and character size is 1 byte.\n" +"\n" +" --empty-class \n" +"\n" +" Define the way re2js treats empty character classes. With match-empty\n" +" (the default) empty class matches empty input (which is illogical, but\n" +" backwards-compatible). With match-none empty class always fails to\n" +" match. With error empty class raises a compilation error.\n" +"\n" +" --encoding-policy \n" +"\n" +" Define the way re2js treats Unicode surrogates. With fail re2js aborts\n" +" with an error when a surrogate is encountered. With substitute re2js\n" +" silently replaces surrogates with the error code point 0xFFFD. With\n" +" ignore (the default) re2js treats surrogates as normal code points. The\n" +" Unicode standard says that standalone surrogates are invalid, but real-\n" +" world libraries and programs behave in different ways.\n" +"\n" +" --flex-syntax -F\n" +"\n" +" Partial support for Flex syntax: in this mode named definitions don't\n" +" need the equal sign and the terminating semicolon, and when used they\n" +" must be surrounded with curly braces. Names without curly braces are\n" +" treated as double-quoted strings.\n" +"\n" +" --goto-label\n" +"\n" +" Use \"goto/label\" code model: encode DFA in form of labeled code blocks\n" +" connected with goto transitions across blocks. 
This is only supported\n" +" for languages that have a goto statement.\n" +"\n" +" --header --type-header -t HEADER\n" +"\n" +" Generate a HEADER file. The contents of the file can be specified using\n" +" special blocks header:on and header:off. If conditions are used, the\n" +" generated header will have a condition enum automatically appended to\n" +" it (unless there is an explicit conditions block).\n" +"\n" +" -I PATH\n" +"\n" +" Add PATH to the list of locations which are used when searching for\n" +" include files. This option is useful in combination with include block\n" +" or directive. re2js looks for FILE in the directory of the parent file\n" +" and in the include locations specified with -I option.\n" +"\n" +" --input \n" +"\n" +" Deprecated alias for --api. Option default corresponds to simple (it is\n" +" indeed the default for most backends, but not for all). Option custom\n" +" corresponds to generic.\n" +"\n" +" --input-encoding \n" +"\n" +" Specify the way re2js parses regular expressions. With ascii (the\n" +" default) re2js handles input as ASCII-encoded: any sequence of code\n" +" units is a sequence of standalone 1-byte characters. With utf8 re2js\n" +" handles input as UTF8-encoded and recognizes multibyte characters.\n" +"\n" +" --invert-captures\n" +"\n" +" Invert the meaning of capturing and non-capturing groups. By default\n" +" (...) is capturing and (! ...) is non-capturing. With this option (!\n" +" ...) is capturing and (...) is non-capturing.\n" +"\n" +" --lang \n" +"\n" +" Specify the target language. Supported languages are C, D, Go, Haskell,\n" +" Java, JS, OCaml, Python, Rust, V, Zig (more languages can be added via\n" +" user-defined syntax files, see the --syntax option). Option none\n" +" disables default suntax configs, so that the target language is\n" +" undefined.\n" +"\n" +" --location-format \n" +"\n" +" Specify location format in messages. With gnu locations are printed as\n" +" 'filename:line:column: ...'. 
With msvc locations are printed as\n" +" 'filename(line,column) ...'. The default is gnu.\n" +"\n" +" --loop-switch\n" +"\n" +" Use \"loop/switch\" code model: encode DFA in form of a loop over a\n" +" switch statement, where individual states are switch cases. State is\n" +" stored in a variable yystate. Transitions between states update yystate\n" +" to the case label of the destination state and continue execution to\n" +" the head of the loop.\n" +"\n" +" --nested-ifs -s\n" +"\n" +" Use nested if statements instead of switch statements in conditional\n" +" jumps. This usually results in more efficient code with non-optimizing\n" +" compilers.\n" +"\n" +" --no-debug-info -i\n" +"\n" +" Do not output line directives. This may be useful when the generated\n" +" code is stored in a version control system (to avoid huge autogenerated\n" +" diffs on small changes).\n" +"\n" +" --no-generation-date\n" +"\n" +" Suppress date output in the generated file.\n" +"\n" +" --no-version\n" +"\n" +" Suppress version output in the generated file.\n" +"\n" +" --no-unsafe\n" +"\n" +" Do not generate unsafe wrapper over YYPEEK (this option is specific to\n" +" Rust). For performance reasons YYPEEK should avoid bounds-checking, as\n" +" the lexer already performs end-of-input checks in a more efficient way.\n" +" The user may choose to provide a safe YYPEEK definition, or a\n" +" definition that is unsafe only in release builds, in which case the\n" +" --no-unsafe option helps to avoid warnings about redundant unsafe\n" +" blocks.\n" +"\n" +" --output -o OUTPUT\n" +"\n" +" Specify the OUTPUT file.\n" +"\n" +" --posix-captures, -P\n" +"\n" +" Enable submatch extraction with POSIX-style capturing groups. The\n" +" result is collected into an array yybmatch of capacity 2 * YYMAXNMATCH,\n" +" and yynmatch is set to the number of groups for the matching rule.\n" +"\n" +" --posix-captvars\n" +"\n" +" Enable submatch extraction with POSIX-style capturing groups. 
The\n" +" result is collected into variables yytl, yytr for k-th capturing\n" +" group.\n" +"\n" +" --recursive-functions\n" +"\n" +" Use code model based on co-recursive functions, where each DFA state is\n" +" a separate function that may call other state-functions or itself.\n" +"\n" +" --reusable -r\n" +"\n" +" Deprecated since version 2.2 (reusable blocks are allowed by default\n" +" now).\n" +"\n" +" --skeleton -S\n" +"\n" +" Ignore user-defined interface code and generate a self-contained\n" +" \"skeleton\" program. Additionally, generate input files with strings\n" +" derived from the regular grammar and compressed match results that are\n" +" used to verify \"skeleton\" behavior on all inputs. This option is useful\n" +" for finding bugs in optimizations and code generation. This option is\n" +" supported only for C.\n" +"\n" +" --storable-state -f\n" +"\n" +" Generate a lexer which can store its inner state. This is useful in\n" +" push-model lexers which are stopped by an outer program when there is\n" +" not enough input, and then resumed when more input becomes available.\n" +" In this mode users should additionally define YYGETSTATE and YYSETSTATE\n" +" primitives, and variables yych, yyaccept and state should be part of\n" +" the stored lexer state.\n" +"\n" +" --syntax FILE\n" +"\n" +" Load configurations from the specified FILE and apply them on top of\n" +" the default syntax file. Note that FILE can define only a few\n" +" configurations (if it's used to amend the default syntax file), or it\n" +" can define a whole new language backend (in the latter case it is\n" +" recommended to use --lang none option).\n" +"\n" +" --tags -T\n" +"\n" +" Enable submatch extraction with tags.\n" +"\n" +" --ucs2 --wide-chars -w\n" +"\n" +" Generate a lexer that reads UCS2-encoded input. re2js assumes that the\n" +" character range is 0 -- 0xFFFF and character size is 2 bytes. 
This\n" +" option implies --nested-ifs.\n" +"\n" +" --utf8 --utf-8 -8\n" +"\n" +" Generate a lexer that reads input in UTF-8 encoding. re2js assumes that\n" +" the character range is 0 -- 0x10FFFF and character size is 1 byte.\n" +"\n" +" --utf16 --utf-16 -x\n" +"\n" +" Generate a lexer that reads UTF16-encoded input. re2js assumes that the\n" +" character range is 0 -- 0x10FFFF and character size is 2 bytes. This\n" +" option implies --nested-ifs.\n" +"\n" +" --utf32 --unicode -u\n" +"\n" +" Generate a lexer that reads UTF32-encoded input. re2js assumes that the\n" +" character range is 0 -- 0x10FFFF and character size is 4 bytes. This\n" +" option implies --nested-ifs.\n" +"\n" +" --verbose\n" +"\n" +" Output a short message in case of success.\n" +"\n" +" --vernum -V\n" +"\n" +" Show version information in MMmmpp format (major, minor, patch).\n" +"\n" +" --version -v\n" +"\n" +" Show version information.\n" +"\n" +" --single-pass -1\n" +"\n" +" Deprecated. Does nothing (single pass is the default now).\n" +"\n" +" --debug-output -d\n" +"\n" +" Emit YYDEBUG invocations in the generated code. 
This is useful to trace\n" +" lexer execution.\n" +"\n" +" --dump-adfa\n" +"\n" +" Debug option: output DFA after tunneling (in .dot format).\n" +"\n" +" --dump-cfg\n" +"\n" +" Debug option: output control flow graph of tag variables (in .dot\n" +" format).\n" +"\n" +" --dump-closure-stats\n" +"\n" +" Debug option: output statistics on the number of states in closure.\n" +"\n" +" --dump-dfa-det\n" +"\n" +" Debug option: output DFA immediately after determinization (in .dot\n" +" format).\n" +"\n" +" --dump-dfa-min\n" +"\n" +" Debug option: output DFA after minimization (in .dot format).\n" +"\n" +" --dump-dfa-tagopt\n" +"\n" +" Debug option: output DFA after tag optimizations (in .dot format).\n" +"\n" +" --dump-dfa-tree\n" +"\n" +" Debug option: output DFA under construction with states represented as\n" +" tag history trees (in .dot format).\n" +"\n" +" --dump-dfa-raw\n" +"\n" +" Debug option: output DFA under construction with expanded state-sets\n" +" (in .dot format).\n" +"\n" +" --dump-interf\n" +"\n" +" Debug option: output interference table produced by liveness analysis\n" +" of tag variables.\n" +"\n" +" --dump-nfa\n" +"\n" +" Debug option: output NFA (in .dot format).\n" +"\n" +" --emit-dot -D\n" +"\n" +" Instead of normal output generate lexer graph in .dot format. The\n" +" output can be converted to an image with the help of Graphviz (e.g.\n" +" something like dot -Tpng -odfa.png dfa.dot).\n" +"\n" +" --dfa-minimization \n" +"\n" +" Internal option: DFA minimization algorithm used by re2js. The moore\n" +" option is the Moore algorithm (it is the default). The table option is\n" +" the \"table filling\" algorithm. Both algorithms should produce the same\n" +" DFA up to states relabeling; table filling is simpler and much slower\n" +" and serves as a reference implementation.\n" +"\n" +" --eager-skip\n" +"\n" +" Internal option: make the generated lexer advance the input position\n" +" eagerly -- immediately after reading the input symbol. 
This changes the\n" +" default behavior when the input position is advanced lazily -- after\n" +" transition to the next state.\n" +"\n" +" --no-lookahead\n" +"\n" +" Internal option, deprecated. It used to enable TDFA(0) algorithm.\n" +" Unlike TDFA(1), TDFA(0) algorithm does not use one-symbol lookahead. It\n" +" applies register operations to the incoming transitions rather than the\n" +" outgoing ones. Benchmarks showed that TDFA(0) algorithm is less\n" +" efficient than TDFA(1).\n" +"\n" +" --no-optimize-tags\n" +"\n" +" Internal option: suppress optimization of tag variables (useful for\n" +" debugging).\n" +"\n" +" --posix-closure \n" +"\n" +" Internal option: specify shortest-path algorithm used for the\n" +" construction of epsilon-closure with POSIX disambiguation semantics:\n" +" gor1 (the default) stands for Goldberg-Radzik algorithm, and gtop\n" +" stands for \"global topological order\" algorithm.\n" +"\n" +" --posix-prectable \n" +"\n" +" Internal option: specify the algorithm used to compute POSIX precedence\n" +" table. The complex algorithm computes precedence table in one traversal\n" +" of tag history tree and has quadratic complexity in the number of TNFA\n" +" states; it is the default. The naive algorithm has worst-case cubic\n" +" complexity in the number of TNFA states, but it is much simpler than\n" +" complex and may be slightly faster in non-pathological cases.\n" +"\n" +" --stadfa\n" +"\n" +" Internal option, deprecated. It used to enable staDFA algorithm, which\n" +" differs from TDFA in that register operations are placed in states\n" +" rather than on transitions. Benchmarks showed that staDFA algorithm is\n" +" less efficient than TDFA.\n" +"\n" +" --fixed-tags \n" +"\n" +" Internal option: specify whether the fixed-tag optimization should be\n" +" applied to all tags (all), none of them (none), or only those in\n" +" toplevel concatenation (toplevel). The default is all. 
\"Fixed\" tags are\n" +" those that are located within a fixed distance to some other tag\n" +" (called \"base\"). In such cases only the base tag needs to be tracked,\n" +" and the value of the fixed tag can be computed as the value of the base\n" +" tag plus a static offset. For tags that are under alternative or\n" +" repetition it is also necessary to check if the base tag has a no-match\n" +" value (in that case fixed tag should also be set to no-match,\n" +" disregarding the offset). For tags in top-level concatenation the check\n" +" is not needed, because they always match.\n" +"\n" +"WARNINGS\n" +"\n" +" -W\n" +"\n" +" Turn on all warnings.\n" +"\n" +" -Werror\n" +"\n" +" Turn warnings into errors. Note that this option alone doesn't turn on\n" +" any warnings; it only affects those warnings that have been turned on\n" +" so far or will be turned on later.\n" +"\n" +" -W\n" +"\n" +" Turn on warning.\n" +"\n" +" -Wno-\n" +"\n" +" Turn off warning.\n" +"\n" +" -Werror-\n" +"\n" +" Turn on warning and treat it as an error (this implies -W).\n" +"\n" +" -Wno-error-\n" +"\n" +" Don't treat this particular warning as an error. This doesn't turn off\n" +" the warning itself.\n" +"\n" +" -Wcondition-order\n" +"\n" +" Warn if the generated program makes implicit assumptions about\n" +" condition numbering. One should use either --header option or\n" +" conditions block to generate a mapping of condition names to numbers\n" +" and then use the autogenerated condition names.\n" +"\n" +" -Wempty-character-class\n" +"\n" +" Warn if a regular expression contains an empty character class. Trying\n" +" to match an empty character class makes no sense: it should always\n" +" fail. However, for backwards compatibility reasons re2js permits empty\n" +" character classes and treats them as empty strings. Use the --empty-\n" +" class option to change the default behavior.\n" +"\n" +" -Wmatch-empty-string\n" +"\n" +" Warn if a rule is nullable (matches an empty string). 
If the lexer runs\n" +" in a loop and the empty match is unintentional, the lexer may\n" +" unexpectedly hang in an infinite loop.\n" +"\n" +" -Wswapped-range\n" +"\n" +" Warn if the lower bound of a range is greater than its upper bound. The\n" +" default behavior is to silently swap the range bounds.\n" +"\n" +" -Wundefined-control-flow\n" +"\n" +" Warn if some input strings cause undefined control flow in the lexer\n" +" (the faulty patterns are reported). This is a dangerous and common\n" +" mistake. It can be easily fixed by adding the default rule * which has\n" +" the lowest priority, matches any code unit, and always consumes a\n" +" single code unit.\n" +"\n" +" -Wunreachable-rules\n" +"\n" +" Warn about rules that are shadowed by other rules and will never match.\n" +"\n" +" -Wuseless-escape\n" +"\n" +" Warn if a symbol is escaped when it shouldn't be. By default, re2js\n" +" silently ignores such escapes, but this may as well indicate a typo or\n" +" an error in the escape sequence.\n" +"\n" +" -Wnondeterministic-tags\n" +"\n" +" Warn if a tag has n-th degree of nondeterminism, where n is greater\n" +" than 1.\n" +"\n" +" -Wsentinel-in-midrule\n" +"\n" +" Warn if the sentinel symbol occurs in the middle of a rule --- this may\n" +" cause reads past the end of buffer, crashes or memory corruption in the\n" +" generated lexer. This warning is only applicable if the sentinel method\n" +" of checking for the end of input is used. It is set to an error if\n" +" re2c:sentinel configuration is used.\n" +"\n" +" -Wundefined-syntax-config\n" +"\n" +" Warn if the syntax file specified with --syntax option is missing\n" +" definitions of some configurations. This helps to maintain user-defined\n" +" syntax files: if a new release adds configurations, old syntax file\n" +" will raise a warning, and the user will be notified. 
If some\n" +" configurations are unused and do not need a definition, they should be\n" +" explicitly set to .\n" +; diff --git a/bootstrap/src/parse/conf_parser.cc b/bootstrap/src/parse/conf_parser.cc index 996887ad0..0e5af6504 100644 --- a/bootstrap/src/parse/conf_parser.cc +++ b/bootstrap/src/parse/conf_parser.cc @@ -1597,6 +1597,7 @@ Ret Input::load_syntax_config(Opt& opts, Lang lang) { case Lang::JAVA: src = DEFAULT_SYNTAX_JAVA; break; case Lang::JS: src = DEFAULT_SYNTAX_JS; break; case Lang::OCAML: src = DEFAULT_SYNTAX_OCAML; break; + case Lang::PHP: src = DEFAULT_SYNTAX_PHP; break; case Lang::PYTHON: src = DEFAULT_SYNTAX_PYTHON; break; case Lang::RUST: src = DEFAULT_SYNTAX_RUST; break; case Lang::V: src = DEFAULT_SYNTAX_V; break; diff --git a/bootstrap/src/parse/conf_parser.h b/bootstrap/src/parse/conf_parser.h index caf3a7622..1ee022d46 100644 --- a/bootstrap/src/parse/conf_parser.h +++ b/bootstrap/src/parse/conf_parser.h @@ -66,6 +66,7 @@ extern int conf_debug; #include "src/default_syntax_java.h" #include "src/default_syntax_js.h" #include "src/default_syntax_ocaml.h" +#include "src/default_syntax_php.h" #include "src/default_syntax_python.h" #include "src/default_syntax_rust.h" #include "src/default_syntax_v.h" diff --git a/build/split_man.py b/build/split_man.py index e83223bdf..a396a48b7 100644 --- a/build/split_man.py +++ b/build/split_man.py @@ -46,6 +46,9 @@ elif lang == b'ocaml': src_ext = b'ml' lang_name = b'OCaml' +elif lang == b'php': + src_ext = b'php' + lang_name = b'PHP' elif lang == b'py': lang = b'python' src_ext = b'py' diff --git a/configure.ac b/configure.ac index 3a0b66bec..b0841af87 100644 --- a/configure.ac +++ b/configure.ac @@ -92,6 +92,9 @@ AM_CONDITIONAL([WITH_JS], [test "x$enable_js" != "xno"]) # --enable-ocaml AC_ARG_ENABLE([ocaml], [AS_HELP_STRING([--enable-ocaml], [build re2ocaml executable])]) AM_CONDITIONAL([WITH_OCAML], [test "x$enable_ocaml" != "xno"]) +# --enable-php +AC_ARG_ENABLE([php], [AS_HELP_STRING([--enable-php], 
[build re2php executable])]) +AM_CONDITIONAL([WITH_PHP], [test "x$enable_php" != "xno"]) # --enable-python AC_ARG_ENABLE([python], [AS_HELP_STRING([--enable-python], [build re2py executable])]) AM_CONDITIONAL([WITH_PYTHON], [test "x$enable_python" != "xno"]) @@ -224,6 +227,7 @@ AC_CONFIG_FILES([ src/msg/help_re2java.rst:doc/help.rst.in src/msg/help_re2js.rst:doc/help.rst.in src/msg/help_re2ocaml.rst:doc/help.rst.in + src/msg/help_re2php.rst:doc/help.rst.in src/msg/help_re2py.rst:doc/help.rst.in src/msg/help_re2rust.rst:doc/help.rst.in src/msg/help_re2v.rst:doc/help.rst.in @@ -236,6 +240,7 @@ AC_CONFIG_FILES([ doc/re2java.rst:doc/manpage.rst.in doc/re2js.rst:doc/manpage.rst.in doc/re2ocaml.rst:doc/manpage.rst.in + doc/re2php.rst:doc/manpage.rst.in doc/re2py.rst:doc/manpage.rst.in doc/re2rust.rst:doc/manpage.rst.in doc/re2v.rst:doc/manpage.rst.in @@ -251,6 +256,7 @@ AC_CONFIG_LINKS([ src/default_syntax_java.stx:include/syntax/java src/default_syntax_js.stx:include/syntax/js src/default_syntax_ocaml.stx:include/syntax/ocaml + src/default_syntax_php.stx:include/syntax/php src/default_syntax_python.stx:include/syntax/python src/default_syntax_rust.stx:include/syntax/rust src/default_syntax_v.stx:include/syntax/v diff --git a/examples/php/01_basic.php b/examples/php/01_basic.php new file mode 100644 index 000000000..fdab0926c --- /dev/null +++ b/examples/php/01_basic.php @@ -0,0 +1,63 @@ + "$phptest" + + # If the autogenerated message appears more than once in the file, then + # it must have autogenerated header appended at the end. Cut it off. + msg='Generated by re2php' + if [ $(grep -c "$msg" "$phptest") -gt 1 ]; then + # Get the line of the second message occurrence. + l=$(grep -n "$msg" "$phptest" | tail -n +2 | cut -d : -f 1) + # Cut off everything past that line. 
+ head -n $l "$phptest" > "$phptest".mod && mv "$phptest".mod "$phptest" + fi + + echo "$f" + php "$phptest" || { echo "*** error ***"; exit 1; } + + rm -f "$phptest" + cd $root_dir +done + +echo "All good." diff --git a/examples/php/conditions/parse_u32_blocks.php b/examples/php/conditions/parse_u32_blocks.php new file mode 100644 index 000000000..8d7c4ea83 --- /dev/null +++ b/examples/php/conditions/parse_u32_blocks.php @@ -0,0 +1,309 @@ +yyinput = $str; + $st->yycursor = 0; + $st->yymarker = 0; + + + $yych = 0; + $yystate = 0; + while (true) { + switch ($yystate) { + case 0: + $yych = $st->yyinput[$st->yycursor]; + $st->yycursor += 1; + switch ($yych) { + case '0': + $yystate = 2; + break 2; + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + $yystate = 4; + break 2; + default: + $yystate = 1; + break 2; + } + case 1: + return null; + case 2: + $st->yymarker = $st->yycursor; + $yych = $st->yyinput[$st->yycursor]; + switch ($yych) { + case 'B': + case 'b': + $st->yycursor += 1; + $yystate = 5; + break 2; + case 'X': + case 'x': + $st->yycursor += 1; + $yystate = 7; + break 2; + default: + $yystate = 3; + break 2; + } + case 3: + return parse_oct($st); + case 4: + $st->yycursor -= 1; + return parse_dec($st); + case 5: + $yych = $st->yyinput[$st->yycursor]; + switch ($yych) { + case '0': + case '1': + $st->yycursor += 1; + $yystate = 8; + break 2; + default: + $yystate = 6; + break 2; + } + case 6: + $st->yycursor = $st->yymarker; + $yystate = 3; + break 2; + case 7: + $yych = $st->yyinput[$st->yycursor]; + switch ($yych) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + $st->yycursor += 1; + $yystate = 9; + break 2; + default: + $yystate = 6; + break 2; + } + case 8: + $st->yycursor -= 1; + 
return parse_bin($st); + case 9: + $st->yycursor -= 1; + return parse_hex($st); + default: + throw new \Exception("internal lexer error"); + } + } + +} + +function parse_bin($st): int { + $n = 0; + while (true) { + + $yych = 0; + $yystate = 0; + while (true) { + switch ($yystate) { + case 0: + $yych = $st->yyinput[$st->yycursor]; + $st->yycursor += 1; + switch ($yych) { + case '0': + case '1': + $yystate = 2; + break 2; + default: + $yystate = 1; + break 2; + } + case 1: + return $n; + case 2: + $n = $n * 2 + (ord($st->yyinput[$st->yycursor - 1]) - 48); break 2; + default: + throw new \Exception("internal lexer error"); + } + } + + } +} + +function parse_oct($st): int { + $n = 0; + while (true) { + + $yych = 0; + $yystate = 0; + while (true) { + switch ($yystate) { + case 0: + $yych = $st->yyinput[$st->yycursor]; + $st->yycursor += 1; + switch ($yych) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + $yystate = 2; + break 2; + default: + $yystate = 1; + break 2; + } + case 1: + return $n; + case 2: + $n = $n * 8 + (ord($st->yyinput[$st->yycursor - 1]) - 48); break 2; + default: + throw new \Exception("internal lexer error"); + } + } + + } +} + +function parse_dec($st): int { + $n = 0; + while (true) { + + $yych = 0; + $yystate = 0; + while (true) { + switch ($yystate) { + case 0: + $yych = $st->yyinput[$st->yycursor]; + $st->yycursor += 1; + switch ($yych) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + $yystate = 2; + break 2; + default: + $yystate = 1; + break 2; + } + case 1: + return $n; + case 2: + + $n = $n * 10 + (ord($st->yyinput[$st->yycursor - 1]) - 48); + break 2; + + default: + throw new \Exception("internal lexer error"); + } + } + + } +} + +function parse_hex($st): int { + $n = 0; + while (true) { + + $yych = 0; + $yystate = 0; + while (true) { + switch ($yystate) { + case 0: + $yych = $st->yyinput[$st->yycursor]; + 
$st->yycursor += 1; + switch ($yych) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + $yystate = 2; + break 2; + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + $yystate = 3; + break 2; + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + $yystate = 4; + break 2; + default: + $yystate = 1; + break 2; + } + case 1: + return $n; + case 2: + $n = $n * 16 + (ord($st->yyinput[$st->yycursor - 1]) - 48); break 2; + case 3: + $n = $n * 16 + (ord($st->yyinput[$st->yycursor - 1]) - 55); break 2; + case 4: + $n = $n * 16 + (ord($st->yyinput[$st->yycursor - 1]) - 87); break 2; + default: + throw new \Exception("internal lexer error"); + } + } + + } +} + +function test($s, $n): void { + if (parse_u32($s) != $n) { + throw new \Exception("error!"); + } +} + +test("\0", null); +test("1234567890\0", 1234567890); +test("0b1101\0", 13); +test("0x7Fe\0", 2046); +test("0644\0", 420); +test("9999999999\0", 9999999999); diff --git a/examples/php/conditions/parse_u32_blocks.re b/examples/php/conditions/parse_u32_blocks.re new file mode 100644 index 000000000..fd8aa5097 --- /dev/null +++ b/examples/php/conditions/parse_u32_blocks.re @@ -0,0 +1,81 @@ +yyinput = $str; + $st->yycursor = 0; + $st->yymarker = 0; + + /*!re2c + '0b' / [01] { return parse_bin($st); } + "0" { return parse_oct($st); } + "" / [1-9] { return parse_dec($st); } + '0x' / [0-9a-fA-F] { return parse_hex($st); } + * { return null; } + */ +} + +function parse_bin($st): int { + $n = 0; + while (true) { + /*!re2c + [01] { $n = $n * 2 + (ord($st->yyinput[$st->yycursor - 1]) - 48); break 2; } + * { return $n; } + */ + } +} + +function parse_oct($st): int { + $n = 0; + while (true) { + /*!re2c + [0-7] { $n = $n * 8 + (ord($st->yyinput[$st->yycursor - 1]) - 48); break 2; } + * { return $n; } + */ + } +} + +function parse_dec($st): int { + $n = 0; + while (true) { + /*!re2c + [0-9] { + $n = $n * 10 + 
(ord($st->yyinput[$st->yycursor - 1]) - 48); + break 2; + } + * { return $n; } + */ + } +} + +function parse_hex($st): int { + $n = 0; + while (true) { + /*!re2c + [0-9] { $n = $n * 16 + (ord($st->yyinput[$st->yycursor - 1]) - 48); break 2; } + [a-f] { $n = $n * 16 + (ord($st->yyinput[$st->yycursor - 1]) - 87); break 2; } + [A-F] { $n = $n * 16 + (ord($st->yyinput[$st->yycursor - 1]) - 55); break 2; } + * { return $n; } + */ + } +} + +function test($s, $n): void { + if (parse_u32($s) != $n) { + throw new \Exception("error!"); + } +} + +test("\0", null); +test("1234567890\0", 1234567890); +test("0b1101\0", 13); +test("0x7Fe\0", 2046); +test("0644\0", 420); +test("9999999999\0", 9999999999); diff --git a/examples/php/conditions/parse_u32_conditions.php b/examples/php/conditions/parse_u32_conditions.php new file mode 100644 index 000000000..8ec3c5345 --- /dev/null +++ b/examples/php/conditions/parse_u32_conditions.php @@ -0,0 +1,256 @@ + '0b' / [01] :=> BIN + "0" :=> OCT + "" / [1-9] :=> DEC + '0x' / [0-9a-fA-F] :=> HEX + * { return null; } + + [01] { $n = $n * 2 + (ord($yyinput[$yycursor - 1]) - 48); break 2; } + [0-7] { $n = $n * 8 + (ord($yyinput[$yycursor - 1]) - 48); break 2; } + [0-9] { $n = $n * 10 + (ord($yyinput[$yycursor - 1]) - 48); break 2; } + [0-9] { $n = $n * 16 + (ord($yyinput[$yycursor - 1]) - 48); break 2; } + [a-f] { $n = $n * 16 + (ord($yyinput[$yycursor - 1]) - 87); break 2; } + [A-F] { $n = $n * 16 + (ord($yyinput[$yycursor - 1]) - 55); break 2; } + + * { return $n; } + */ + } +} + +function test($s, $n) { + if (parse_u32($s) != $n) { + throw new \Exception("error!"); + } +} + +test("\0", null); +test("1234567890\0", 1234567890); +test("0b1101\0", 13); +test("0x7Fe\0", 2046); +test("0644\0", 420); +test("9999999999\0", 9999999999); diff --git a/examples/php/encodings/unicode_identifier.php b/examples/php/encodings/unicode_identifier.php new file mode 100644 index 000000000..5bbd5b895 --- /dev/null +++ 
b/examples/php/encodings/unicode_identifier.php @@ -0,0 +1,22526 @@ += 0x01) { + $yystate = 7; + break 2; + } + if ($yylimit <= $yycursor) { + $yystate = 2; + break 2; + } + $yycursor += 1; + $yystate = 6; + break 2; + case 6: + $yych = $yyinput[$yycursor]; + $yystate = 7; + break 2; + case 7: + switch ($yych) { + case '\'': + $yycursor += 1; + $yystate = 8; + break 2; + case '\\': + $yycursor += 1; + $yystate = 9; + break 2; + default: + if ($yylimit <= $yycursor) { + $yystate = 11; + break 2; + } + $yycursor += 1; + $yystate = 6; + break 2; + } + case 8: + $count += 1; break 2; + case 9: + $yych = $yyinput[$yycursor]; + if ($yych <= 0x00) { + if ($yylimit <= $yycursor) { + $yystate = 11; + break 2; + } + $yycursor += 1; + $yystate = 6; + break 2; + } + $yycursor += 1; + $yystate = 6; + break 2; + case 10: + return $count; + case 11: + $yycursor = $yymarker; + $yystate = 2; + break 2; + default: + throw new \Exception("internal lexer error"); + } + } + + } +} + +function test($s, $n) { + if (lex($s) != $n) { + throw new \Exception("error!"); + } +} + +test("\0", 0); +test("'qu\0tes' 'are' 'fine: \\'' \0", 3); +test("'unterminated\\'\0", -1); diff --git a/examples/php/eof/03_eof_rule.re b/examples/php/eof/03_eof_rule.re new file mode 100644 index 000000000..ee68445d8 --- /dev/null +++ b/examples/php/eof/03_eof_rule.re @@ -0,0 +1,33 @@ += 0x01) { + yystate = 7 + continue yyl + } + if (yyrecord.yylimit <= yyrecord.yycursor) { + if (fill(yyrecord) == OK) { + yystate = 5 + continue yyl + } + yystate = 2 + continue yyl + } + yyrecord.yycursor += 1; + yystate = 6 + continue yyl + case 6: + yych = yyrecord.yyinput.readUInt8(yyrecord.yycursor) + yystate = 7 + continue yyl + case 7: + switch (yych) { + case 0x27: + yyrecord.yycursor += 1; + yystate = 8 + continue yyl + case 0x5C: + yyrecord.yycursor += 1; + yystate = 9 + continue yyl + default: + if (yyrecord.yylimit <= yyrecord.yycursor) { + if (fill(yyrecord) == OK) { + yystate = 6 + continue yyl + } + yystate = 11 + 
continue yyl + } + yyrecord.yycursor += 1; + yystate = 6 + continue yyl + } + case 8: + { count += 1; continue loop } + case 9: + yych = yyrecord.yyinput.readUInt8(yyrecord.yycursor) + if (yych <= 0x00) { + if (yyrecord.yylimit <= yyrecord.yycursor) { + if (fill(yyrecord) == OK) { + yystate = 9 + continue yyl + } + yystate = 11 + continue yyl + } + yyrecord.yycursor += 1; + yystate = 6 + continue yyl + } + yyrecord.yycursor += 1; + yystate = 6 + continue yyl + case 10: + { return count } + case 11: + yyrecord.yycursor = yyrecord.yymarker; + yystate = 2 + continue yyl + default: + throw "internal lexer error" + } + } +} + + } +} + +function main() { + let fname = "input" + + // Create input file. + let content = "'qu\0tes' 'are' 'fine: \\'' ".repeat(BUFSIZE) + fs.writeFileSync(fname, content, function(err) { if (err) throw err; }) + + // Init lexer state. + let limit = BUFSIZE - 1 // exclude terminating null + let st = { + file: fs.openSync(fname, 'r'), + yyinput: Buffer.alloc(BUFSIZE), + yylimit: limit, + yycursor: limit, + yymarker: limit, + token: limit, + eof: false + } + + // Run lexer on the prepared file. + if (lex(st, 0) != 3 * BUFSIZE) { throw "error :[" } + + // Cleanup. + fs.unlink(fname, function(err){ if (err) throw err; }) +} + +main() diff --git a/examples/php/fill/01_fill.re b/examples/php/fill/01_fill.re new file mode 100644 index 000000000..3539c7db3 --- /dev/null +++ b/examples/php/fill/01_fill.re @@ -0,0 +1,83 @@ +file = fopen($fname, 'r'); + $st->yyinput = str_repeat("\0", BUFSIZE); + $st->yylimit = $limit; + $st->yycursor = $limit; + $st->yymarker = $limit; + $st->token = $limit; + $st->eof = false; + + // Run lexer on the prepared file. + if (lex($st, 0) != 3 * BUFSIZE) { + throw new \Exception("error :["); + } + + // Cleanup. 
+ fclose($st->file); + unlink($fname); +} + +main(); diff --git a/examples/php/fill/02_fill.js b/examples/php/fill/02_fill.js new file mode 100644 index 000000000..cbdd8c510 --- /dev/null +++ b/examples/php/fill/02_fill.js @@ -0,0 +1,150 @@ +// Generated by re2js +// re2js $INPUT -o $OUTPUT + +const fs = require('fs') + +const BUFSIZE = 4096 +const OK = 0 +const EOF = 1 +const LONG_LEXEME = 2 +const YYMAXFILL = 1 + + +function fill(st, need) { + if (st.eof) return EOF + + // Error: lexeme too long. In real life could reallocate a larger buffer. + if (st.token < need) return LONG_LEXEME + + // Shift buffer contents (discard everything up to the current token). + st.yyinput.copy(st.yyinput, 0, st.token, st.yylimit) + st.yycursor -= st.token; + st.yylimit -= st.token; + st.token = 0; + + // Read a new chunk of data from file and append it to `yyinput`. + let want = BUFSIZE - st.yylimit - 1 // -1 for sentinel + let nread = fs.readSync(st.file, st.yyinput, st.yylimit, want) + st.yylimit += nread + if (nread < want) { + st.eof = true // end of file + st.yyinput.write("\0".repeat(YYMAXFILL), st.yylimit) + st.yylimit += YYMAXFILL + } + + return OK +} + +function lex(yyrecord, count) { + loop: while (true) { + yyrecord.token = yyrecord.yycursor + +{ + let yych = 0 + let yystate = 0 + yyl: while (true) { + switch (yystate) { + case 0: + if (yyrecord.yylimit <= yyrecord.yycursor) { + if (fill(yyrecord, 1) != OK) return -1; + } + yych = yyrecord.yyinput.readUInt8(yyrecord.yycursor) + yyrecord.yycursor += 1; + switch (yych) { + case 0x00: + yystate = 1 + continue yyl + case 0x20: + yystate = 3 + continue yyl + case 0x27: + yystate = 5 + continue yyl + default: + yystate = 2 + continue yyl + } + case 1: + { + // Check that it is the sentinel, not some unexpected null. + return yyrecord.token == yyrecord.yylimit - YYMAXFILL ? 
count : -1 + } + case 2: + { return -1 } + case 3: + if (yyrecord.yylimit <= yyrecord.yycursor) { + if (fill(yyrecord, 1) != OK) return -1; + } + yych = yyrecord.yyinput.readUInt8(yyrecord.yycursor) + switch (yych) { + case 0x20: + yyrecord.yycursor += 1; + yystate = 3 + continue yyl + default: + yystate = 4 + continue yyl + } + case 4: + { continue loop } + case 5: + if (yyrecord.yylimit <= yyrecord.yycursor) { + if (fill(yyrecord, 1) != OK) return -1; + } + yych = yyrecord.yyinput.readUInt8(yyrecord.yycursor) + yyrecord.yycursor += 1; + switch (yych) { + case 0x27: + yystate = 6 + continue yyl + case 0x5C: + yystate = 7 + continue yyl + default: + yystate = 5 + continue yyl + } + case 6: + { count += 1; continue loop } + case 7: + if (yyrecord.yylimit <= yyrecord.yycursor) { + if (fill(yyrecord, 1) != OK) return -1; + } + yyrecord.yycursor += 1; + yystate = 5 + continue yyl + default: + throw "internal lexer error" + } + } +} + + } +} + +function main() { + let fname = "input" + + // Create input file. + let content = "'qu\0tes' 'are' 'fine: \\'' ".repeat(BUFSIZE) + fs.writeFileSync(fname, content, function(err) { if (err) throw err; }) + + // Init lexer state. + let limit = BUFSIZE - 1 // exclude terminating null + let st = { + file: fs.openSync(fname, 'r'), + yyinput: Buffer.alloc(BUFSIZE), + yylimit: limit, + yycursor: limit, + token: limit, + eof: false + } + + // Run lexer on the prepared file. + if (lex(st, 0) != 3 * BUFSIZE) { throw "error :[" } + + // Cleanup. + fs.unlink(fname, function(err){ if (err) throw err; }) +} + +main() diff --git a/examples/php/fill/02_fill.re b/examples/php/fill/02_fill.re new file mode 100644 index 000000000..ead7a26e2 --- /dev/null +++ b/examples/php/fill/02_fill.re @@ -0,0 +1,82 @@ +// re2js $INPUT -o $OUTPUT + +const fs = require('fs') + +const BUFSIZE = 4096 +const OK = 0 +const EOF = 1 +const LONG_LEXEME = 2 +/*!max:re2c*/ + +function fill(st, need) { + if (st.eof) return EOF + + // Error: lexeme too long. 
In real life could reallocate a larger buffer. + if (st.token < need) return LONG_LEXEME + + // Shift buffer contents (discard everything up to the current token). + st.yyinput.copy(st.yyinput, 0, st.token, st.yylimit) + st.yycursor -= st.token; + st.yylimit -= st.token; + st.token = 0; + + // Read a new chunk of data from file and append it to `yyinput`. + let want = BUFSIZE - st.yylimit - 1 // -1 for sentinel + let nread = fs.readSync(st.file, st.yyinput, st.yylimit, want) + st.yylimit += nread + if (nread < want) { + st.eof = true // end of file + st.yyinput.write("\0".repeat(YYMAXFILL), st.yylimit) + st.yylimit += YYMAXFILL + } + + return OK +} + +function lex(yyrecord, count) { + loop: while (true) { + yyrecord.token = yyrecord.yycursor + /*!re2c + re2c:api = record; + re2c:YYPEEK = "readUInt8"; + re2c:YYFILL = "if (fill(yyrecord, @@) != OK) return -1;"; + + str = ['] ([^'\\] | [\\][^])* [']; + + [\x00] { + // Check that it is the sentinel, not some unexpected null. + return yyrecord.token == yyrecord.yylimit - YYMAXFILL ? count : -1 + } + str { count += 1; continue loop } + [ ]+ { continue loop } + * { return -1 } + */ + } +} + +function main() { + let fname = "input" + + // Create input file. + let content = "'qu\0tes' 'are' 'fine: \\'' ".repeat(BUFSIZE) + fs.writeFileSync(fname, content, function(err) { if (err) throw err; }) + + // Init lexer state. + let limit = BUFSIZE - 1 // exclude terminating null + let st = { + file: fs.openSync(fname, 'r'), + yyinput: Buffer.alloc(BUFSIZE), + yylimit: limit, + yycursor: limit, + token: limit, + eof: false + } + + // Run lexer on the prepared file. + if (lex(st, 0) != 3 * BUFSIZE) { throw "error :[" } + + // Cleanup. 
+ fs.unlink(fname, function(err){ if (err) throw err; }) +} + +main() diff --git a/examples/php/headers/header.js b/examples/php/headers/header.js new file mode 100644 index 000000000..a57ab501a --- /dev/null +++ b/examples/php/headers/header.js @@ -0,0 +1,68 @@ +// Generated by re2js +// re2js $INPUT -o $OUTPUT --header lexer/state.js + +let state = require('./lexer/state.js'); + + + +function lex(yyrecord) { + let t + +{ + let yych = 0 + let yystate = 0 + yyl: while (true) { + switch (yystate) { + case 0: + yych = yyrecord.yyinput.charCodeAt(yyrecord.yycursor) + switch (yych) { + case 0x61: + yyrecord.yycursor += 1; + yystate = 0 + continue yyl + case 0x62: + yyrecord.yyt1 = yyrecord.yycursor; + yyrecord.yycursor += 1; + yystate = 2 + continue yyl + default: + yyrecord.yyt1 = yyrecord.yycursor; + yystate = 1 + continue yyl + } + case 1: + t = yyrecord.yyt1; + { return t } + case 2: + yych = yyrecord.yyinput.charCodeAt(yyrecord.yycursor) + switch (yych) { + case 0x62: + yyrecord.yycursor += 1; + yystate = 2 + continue yyl + default: + yystate = 1 + continue yyl + } + default: + throw "internal lexer error" + } + } +} + +} + +if (lex(state.mk_state("ab\0")) != 1) { + throw "error!" 
+} +// Generated by re2js + +exports.mk_state = function(str) { + return { + yyinput: str, + yyt1: 0, + + yycursor: 0 + } +} +js/headers/header.re:23:21: warning: rule matches empty string [-Wmatch-empty-string] diff --git a/examples/php/headers/header.re b/examples/php/headers/header.re new file mode 100644 index 000000000..d7b43d179 --- /dev/null +++ b/examples/php/headers/header.re @@ -0,0 +1,29 @@ +// re2js $INPUT -o $OUTPUT --header lexer/state.js + +let state = require('./lexer/state.js'); + +/*!header:re2c:on*/ +exports.mk_state = function(str) { + return { + yyinput: str, + /*!stags:re2c format = "@@: 0,\n"; */ + yycursor: 0 + } +} +/*!header:re2c:off*/ + +function lex(yyrecord) { + let t + /*!re2c + re2c:api = record; + re2c:tags = 1; + re2c:yyfill:enable = 0; + re2c:header = "lexer/state.js"; + + [a]* @t [b]* { return t } + */ +} + +if (lex(state.mk_state("ab\0")) != 1) { + throw "error!" +} diff --git a/examples/php/headers/lexer/state.js b/examples/php/headers/lexer/state.js new file mode 100644 index 000000000..0c419af00 --- /dev/null +++ b/examples/php/headers/lexer/state.js @@ -0,0 +1,10 @@ +// Generated by re2c + +exports.mk_state = function(str) { + return { + yyinput: str, + yyt1: 0, + + yycursor: 0 + } +} diff --git a/examples/php/includes/definitions.js b/examples/php/includes/definitions.js new file mode 100644 index 000000000..92764d5d8 --- /dev/null +++ b/examples/php/includes/definitions.js @@ -0,0 +1,7 @@ +const INT = 1 +const FLOAT = 2 +const NAN = 3 + +/*!re2c + number = [1-9][0-9]*; +*/ diff --git a/examples/php/includes/extra_rules.re.inc b/examples/php/includes/extra_rules.re.inc new file mode 100644 index 000000000..983682218 --- /dev/null +++ b/examples/php/includes/extra_rules.re.inc @@ -0,0 +1,6 @@ +// floating-point numbers +frac = [0-9]* "." [0-9]+ | [0-9]+ "."; +exp = 'e' [+-]? [0-9]+; +float = frac exp? 
| [0-9]+ exp; + +float { return FLOAT } diff --git a/examples/php/includes/include.js b/examples/php/includes/include.js new file mode 100644 index 000000000..db3704dc9 --- /dev/null +++ b/examples/php/includes/include.js @@ -0,0 +1,283 @@ +// Generated by re2js +// re2js $INPUT -o $OUTPUT + +const INT = 1 +const FLOAT = 2 +const NAN = 3 + + + + +function lex(yyinput) { + let yycursor = 0 + +{ + let yych = 0 + let yyaccept = 0 + let yystate = 0 + yyl: while (true) { + switch (yystate) { + case 0: + yych = yyinput.charCodeAt(yycursor) + yycursor += 1; + switch (yych) { + case 0x2E: + yystate = 3 + continue yyl + case 0x30: + yystate = 4 + continue yyl + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yystate = 5 + continue yyl + default: + yystate = 1 + continue yyl + } + case 1: + yystate = 2 + continue yyl + case 2: + { return NAN } + case 3: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 7 + continue yyl + default: + yystate = 2 + continue yyl + } + case 4: + yyaccept = 0; + yymarker = yycursor; + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x2E: + yycursor += 1; + yystate = 7 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 9 + continue yyl + case 0x45: + case 0x65: + yycursor += 1; + yystate = 11 + continue yyl + default: + yystate = 2 + continue yyl + } + case 5: + yyaccept = 1; + yymarker = yycursor; + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x2E: + yycursor += 1; + yystate = 7 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 5 + continue yyl + 
case 0x45: + case 0x65: + yycursor += 1; + yystate = 11 + continue yyl + default: + yystate = 6 + continue yyl + } + case 6: + { return INT } + case 7: + yyaccept = 2; + yymarker = yycursor; + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 7 + continue yyl + case 0x45: + case 0x65: + yycursor += 1; + yystate = 11 + continue yyl + default: + yystate = 8 + continue yyl + } + case 8: + { return FLOAT } + case 9: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x2E: + yycursor += 1; + yystate = 7 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 9 + continue yyl + case 0x45: + case 0x65: + yycursor += 1; + yystate = 11 + continue yyl + default: + yystate = 10 + continue yyl + } + case 10: + yycursor = yymarker; + switch (yyaccept) { + case 0: + yystate = 2 + continue yyl + case 1: + yystate = 6 + continue yyl + default: + yystate = 8 + continue yyl + } + case 11: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x2B: + case 0x2D: + yycursor += 1; + yystate = 12 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 13 + continue yyl + default: + yystate = 10 + continue yyl + } + case 12: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 13 + continue yyl + default: + yystate = 10 + continue yyl + } + case 13: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 
0x38: + case 0x39: + yycursor += 1; + yystate = 13 + continue yyl + default: + yystate = 8 + continue yyl + } + default: + throw "internal lexer error" + } + } +} + +} + +function test(s, n) { + if (lex(s) != n) throw "error!" +} + +test("123\0", INT) +test("123.4567\0", FLOAT) diff --git a/examples/php/includes/include.re b/examples/php/includes/include.re new file mode 100644 index 000000000..211c67a2c --- /dev/null +++ b/examples/php/includes/include.re @@ -0,0 +1,21 @@ +// re2js $INPUT -o $OUTPUT + +/*!include:re2c "definitions.js" */ + +function lex(yyinput) { + let yycursor = 0 + /*!re2c + re2c:yyfill:enable = 0; + + * { return NAN } + number { return INT } + !include "extra_rules.re.inc"; + */ +} + +function test(s, n) { + if (lex(s) != n) throw "error!" +} + +test("123\0", INT) +test("123.4567\0", FLOAT) diff --git a/examples/php/reuse/reuse.js b/examples/php/reuse/reuse.js new file mode 100644 index 000000000..c189c766a --- /dev/null +++ b/examples/php/reuse/reuse.js @@ -0,0 +1,202 @@ +// Generated by re2js +// re2js $INPUT -o $OUTPUT --input-encoding utf8 + +// This example supports multiple input encodings: UTF-8 and UTF-32. +// Both lexers are generated from the same rules block, and the use +// blocks add only encoding-specific configurations. 
+ + +function lex_utf8(yyinput) { + let yycursor = 0 + +{ + let yych = 0 + let yystate = 0 + yyl: while (true) { + switch (yystate) { + case 0: + yych = yyinput.at(yycursor) + yycursor += 1; + switch (yych) { + case 0xE2: + yystate = 3 + continue yyl + default: + yystate = 1 + continue yyl + } + case 1: + yystate = 2 + continue yyl + case 2: + { return null } + case 3: + yymarker = yycursor; + yych = yyinput.at(yycursor) + switch (yych) { + case 0x88: + yycursor += 1; + yystate = 4 + continue yyl + default: + yystate = 2 + continue yyl + } + case 4: + yych = yyinput.at(yycursor) + switch (yych) { + case 0x80: + yycursor += 1; + yystate = 6 + continue yyl + default: + yystate = 5 + continue yyl + } + case 5: + yycursor = yymarker; + yystate = 2 + continue yyl + case 6: + yych = yyinput.at(yycursor) + switch (yych) { + case 0x78: + yycursor += 1; + yystate = 7 + continue yyl + default: + yystate = 5 + continue yyl + } + case 7: + yych = yyinput.at(yycursor) + switch (yych) { + case 0x20: + yycursor += 1; + yystate = 8 + continue yyl + default: + yystate = 5 + continue yyl + } + case 8: + yych = yyinput.at(yycursor) + switch (yych) { + case 0xE2: + yycursor += 1; + yystate = 9 + continue yyl + default: + yystate = 5 + continue yyl + } + case 9: + yych = yyinput.at(yycursor) + switch (yych) { + case 0x88: + yycursor += 1; + yystate = 10 + continue yyl + default: + yystate = 5 + continue yyl + } + case 10: + yych = yyinput.at(yycursor) + switch (yych) { + case 0x83: + yycursor += 1; + yystate = 11 + continue yyl + default: + yystate = 5 + continue yyl + } + case 11: + yych = yyinput.at(yycursor) + switch (yych) { + case 0x79: + yycursor += 1; + yystate = 12 + continue yyl + default: + yystate = 5 + continue yyl + } + case 12: + { return yycursor } + default: + throw "internal lexer error" + } + } +} + +} + +function lex_utf32(yyinput) { + let yycursor = 0 + +{ + let yych = 0 + let yystate = 0 + yyl: while (true) { + switch (yystate) { + case 0: + yych = 
yyinput.at(yycursor) + yycursor += 1; + if (yych === 0x00002200) { + yystate = 2 + continue yyl + } + yystate = 1 + continue yyl + case 1: + { return null } + case 2: + yymarker = yycursor; + yych = yyinput.at(yycursor) + if (yych !== 0x00000078) { + yystate = 1 + continue yyl + } + yycursor += 1; + yych = yyinput.at(yycursor) + if (yych === 0x00000020) { + yycursor += 1; + yystate = 4 + continue yyl + } + yystate = 3 + continue yyl + case 3: + yycursor = yymarker; + yystate = 1 + continue yyl + case 4: + yych = yyinput.at(yycursor) + if (yych !== 0x00002203) { + yystate = 3 + continue yyl + } + yycursor += 1; + yych = yyinput.at(yycursor) + if (yych !== 0x00000079) { + yystate = 3 + continue yyl + } + yycursor += 1; + { return yycursor } + default: + throw "internal lexer error" + } + } +} + +} + +function test(f, s) { + if (f(s) != s.length) throw "error!" +} + +test(lex_utf8, [0xe2, 0x88, 0x80, 0x78, 0x20, 0xe2, 0x88, 0x83, 0x79]) +test(lex_utf32, [0x2200, 0x78, 0x20, 0x2203, 0x79]) diff --git a/examples/php/reuse/reuse.re b/examples/php/reuse/reuse.re new file mode 100644 index 000000000..a802344f1 --- /dev/null +++ b/examples/php/reuse/reuse.re @@ -0,0 +1,33 @@ +// re2js $INPUT -o $OUTPUT --input-encoding utf8 + +// This example supports multiple input encodings: UTF-8 and UTF-32. +// Both lexers are generated from the same rules block, and the use +// blocks add only encoding-specific configurations. +/*!rules:re2c + re2c:yyfill:enable = 0; + re2c:YYPEEK = "at"; + + "∀x ∃y" { return yycursor } + * { return null } +*/ + +function lex_utf8(yyinput) { + let yycursor = 0 + /*!use:re2c + re2c:encoding:utf8 = 1; + */ +} + +function lex_utf32(yyinput) { + let yycursor = 0 + /*!use:re2c + re2c:encoding:utf32 = 1; + */ +} + +function test(f, s) { + if (f(s) != s.length) throw "error!" 
+} + +test(lex_utf8, [0xe2, 0x88, 0x80, 0x78, 0x20, 0xe2, 0x88, 0x83, 0x79]) +test(lex_utf32, [0x2200, 0x78, 0x20, 0x2203, 0x79]) diff --git a/examples/php/reuse/usedir.js b/examples/php/reuse/usedir.js new file mode 100644 index 000000000..c9e9a5746 --- /dev/null +++ b/examples/php/reuse/usedir.js @@ -0,0 +1,309 @@ +// Generated by re2js +// re2js $INPUT -o $OUTPUT + +// This example shows how to combine reusable re2c blocks: two blocks +// ('colors' and 'fish') are merged into one. The 'salmon' rule occurs +// in both blocks; the 'fish' block takes priority because it is used +// earlier. Default rule * occurs in all three blocks; the local (not +// inherited) definition takes priority. + +const COLOR = 1 +const FISH = 2 +const DUNNO = 3 + + + + + +function lex(yyinput) { + let yycursor = 0 + +{ + let yych = 0 + let yystate = 0 + yyl: while (true) { + switch (yystate) { + case 0: + yych = yyinput.charCodeAt(yycursor) + yycursor += 1; + switch (yych) { + case 0x65: + yystate = 3 + continue yyl + case 0x68: + yystate = 4 + continue yyl + case 0x6D: + yystate = 5 + continue yyl + case 0x72: + yystate = 6 + continue yyl + case 0x73: + yystate = 7 + continue yyl + default: + yystate = 1 + continue yyl + } + case 1: + yystate = 2 + continue yyl + case 2: + { return DUNNO } + case 3: + yymarker = yycursor; + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x65: + yycursor += 1; + yystate = 8 + continue yyl + default: + yystate = 2 + continue yyl + } + case 4: + yymarker = yycursor; + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x61: + yycursor += 1; + yystate = 10 + continue yyl + default: + yystate = 2 + continue yyl + } + case 5: + yymarker = yycursor; + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x61: + yycursor += 1; + yystate = 11 + continue yyl + default: + yystate = 2 + continue yyl + } + case 6: + yymarker = yycursor; + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x65: + yycursor += 1; + yystate = 
12 + continue yyl + default: + yystate = 2 + continue yyl + } + case 7: + yymarker = yycursor; + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x61: + yycursor += 1; + yystate = 13 + continue yyl + default: + yystate = 2 + continue yyl + } + case 8: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x6C: + yycursor += 1; + yystate = 14 + continue yyl + default: + yystate = 9 + continue yyl + } + case 9: + yycursor = yymarker; + yystate = 2 + continue yyl + case 10: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x64: + yycursor += 1; + yystate = 15 + continue yyl + default: + yystate = 9 + continue yyl + } + case 11: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x67: + yycursor += 1; + yystate = 16 + continue yyl + default: + yystate = 9 + continue yyl + } + case 12: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x64: + yycursor += 1; + yystate = 17 + continue yyl + default: + yystate = 9 + continue yyl + } + case 13: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x6C: + yycursor += 1; + yystate = 18 + continue yyl + default: + yystate = 9 + continue yyl + } + case 14: + { return FISH } + case 15: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x64: + yycursor += 1; + yystate = 19 + continue yyl + default: + yystate = 9 + continue yyl + } + case 16: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x65: + yycursor += 1; + yystate = 20 + continue yyl + default: + yystate = 9 + continue yyl + } + case 17: + { return COLOR } + case 18: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x6D: + yycursor += 1; + yystate = 21 + continue yyl + default: + yystate = 9 + continue yyl + } + case 19: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x6F: + yycursor += 1; + yystate = 22 + continue yyl + default: + yystate = 9 + continue yyl + } + case 20: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x6E: + 
yycursor += 1; + yystate = 23 + continue yyl + default: + yystate = 9 + continue yyl + } + case 21: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x6F: + yycursor += 1; + yystate = 24 + continue yyl + default: + yystate = 9 + continue yyl + } + case 22: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x63: + yycursor += 1; + yystate = 25 + continue yyl + default: + yystate = 9 + continue yyl + } + case 23: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x74: + yycursor += 1; + yystate = 26 + continue yyl + default: + yystate = 9 + continue yyl + } + case 24: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x6E: + yycursor += 1; + yystate = 14 + continue yyl + default: + yystate = 9 + continue yyl + } + case 25: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x6B: + yycursor += 1; + yystate = 14 + continue yyl + default: + yystate = 9 + continue yyl + } + case 26: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x61: + yycursor += 1; + yystate = 17 + continue yyl + default: + yystate = 9 + continue yyl + } + default: + throw "internal lexer error" + } + } +} + +} + +function test(s, n) { if (lex(s) != n) throw "error!"; } + +test("salmon", FISH) +test("what?", DUNNO) diff --git a/examples/php/reuse/usedir.re b/examples/php/reuse/usedir.re new file mode 100644 index 000000000..7d8656877 --- /dev/null +++ b/examples/php/reuse/usedir.re @@ -0,0 +1,37 @@ +// re2js $INPUT -o $OUTPUT + +// This example shows how to combine reusable re2c blocks: two blocks +// ('colors' and 'fish') are merged into one. The 'salmon' rule occurs +// in both blocks; the 'fish' block takes priority because it is used +// earlier. Default rule * occurs in all three blocks; the local (not +// inherited) definition takes priority. 
+ +const COLOR = 1 +const FISH = 2 +const DUNNO = 3 + +/*!rules:re2c:colors + * { throw "ah" } + "red" | "salmon" | "magenta" { return COLOR } +*/ + +/*!rules:re2c:fish + * { throw "oh" } + "haddock" | "salmon" | "eel" { return FISH } +*/ + +function lex(yyinput) { + let yycursor = 0 + /*!re2c + re2c:yyfill:enable = 0; + + !use:fish; + !use:colors; + * { return DUNNO } // overrides inherited '*' rules + */ +} + +function test(s, n) { if (lex(s) != n) throw "error!"; } + +test("salmon", FISH) +test("what?", DUNNO) diff --git a/examples/php/state/push.js b/examples/php/state/push.js new file mode 100644 index 000000000..105fb6c1a --- /dev/null +++ b/examples/php/state/push.js @@ -0,0 +1,294 @@ +// Generated by re2js +// re2js $INPUT -o $OUTPUT -f + +const fs = require('fs') + +// Use a small buffer to cover the case when a lexeme doesn't fit. +// In real world use a larger buffer. +const BUFSIZE = 10 +const DEBUG = false +const END = 0 +const READY = 1 +const WAITING = 2 +const BIG_PACKET = 3 +const BAD_PACKET = 4 + +function log() { + if (DEBUG) console.log.apply(console, arguments) +} + +function fill(st) { + // Error: lexeme too long. In real life could reallocate a larger buffer. + if (st.token < 1) return BIG_PACKET + + // Shift buffer contents (discard everything up to the current token). + st.yyinput.copy(st.yyinput, 0, st.token, st.yylimit) + st.yycursor -= st.token; + st.yymarker -= st.token; + st.yylimit -= st.token; + st.token = 0; + + // Read a new chunk of data from file and append it to `yyinput`. 
+ let want = BUFSIZE - st.yylimit - 1 // -1 for sentinel + let nread = fs.readSync(st.file, st.yyinput, st.yylimit, want) + st.yylimit += nread + st.yyinput.writeUInt8(0, st.yylimit) // sentinel + + return READY +} + +function lex(yyrecord) { + loop: while (true) { + yyrecord.token = yyrecord.yycursor + +{ + let yystate = yyrecord.yystate + yyl: while (true) { + switch (yystate) { + case -1: + case 0: + yych = yyrecord.yyinput.readUInt8(yyrecord.yycursor) + switch (yych) { + case 0x61: + case 0x62: + case 0x63: + case 0x64: + case 0x65: + case 0x66: + case 0x67: + case 0x68: + case 0x69: + case 0x6A: + case 0x6B: + case 0x6C: + case 0x6D: + case 0x6E: + case 0x6F: + case 0x70: + case 0x71: + case 0x72: + case 0x73: + case 0x74: + case 0x75: + case 0x76: + case 0x77: + case 0x78: + case 0x79: + case 0x7A: + yyrecord.yycursor += 1; + yystate = 3 + continue yyl + default: + if (yyrecord.yylimit <= yyrecord.yycursor) { + yyrecord.yystate = 8; + return WAITING + } + yyrecord.yycursor += 1; + yystate = 1 + continue yyl + } + case 1: + yystate = 2 + continue yyl + case 2: + yyrecord.yystate = -1; + { return BAD_PACKET } + case 3: + yyrecord.yymarker = yyrecord.yycursor; + yych = yyrecord.yyinput.readUInt8(yyrecord.yycursor) + switch (yych) { + case 0x3B: + yyrecord.yycursor += 1; + yystate = 4 + continue yyl + case 0x61: + case 0x62: + case 0x63: + case 0x64: + case 0x65: + case 0x66: + case 0x67: + case 0x68: + case 0x69: + case 0x6A: + case 0x6B: + case 0x6C: + case 0x6D: + case 0x6E: + case 0x6F: + case 0x70: + case 0x71: + case 0x72: + case 0x73: + case 0x74: + case 0x75: + case 0x76: + case 0x77: + case 0x78: + case 0x79: + case 0x7A: + yyrecord.yycursor += 1; + yystate = 5 + continue yyl + default: + if (yyrecord.yylimit <= yyrecord.yycursor) { + yyrecord.yystate = 9; + return WAITING + } + yystate = 2 + continue yyl + } + case 4: + yyrecord.yystate = -1; + { yyrecord.received += 1; continue loop } + case 5: + yych = yyrecord.yyinput.readUInt8(yyrecord.yycursor) + 
switch (yych) { + case 0x3B: + yyrecord.yycursor += 1; + yystate = 4 + continue yyl + case 0x61: + case 0x62: + case 0x63: + case 0x64: + case 0x65: + case 0x66: + case 0x67: + case 0x68: + case 0x69: + case 0x6A: + case 0x6B: + case 0x6C: + case 0x6D: + case 0x6E: + case 0x6F: + case 0x70: + case 0x71: + case 0x72: + case 0x73: + case 0x74: + case 0x75: + case 0x76: + case 0x77: + case 0x78: + case 0x79: + case 0x7A: + yyrecord.yycursor += 1; + yystate = 5 + continue yyl + default: + if (yyrecord.yylimit <= yyrecord.yycursor) { + yyrecord.yystate = 10; + return WAITING + } + yystate = 6 + continue yyl + } + case 6: + yyrecord.yycursor = yyrecord.yymarker; + yystate = 2 + continue yyl + case 7: + yyrecord.yystate = -1; + { return END } + case 8: + if (yyrecord.yylimit <= yyrecord.yycursor) { + yystate = 7 + continue yyl + } + yystate = 0 + continue yyl + case 9: + if (yyrecord.yylimit <= yyrecord.yycursor) { + yystate = 2 + continue yyl + } + yystate = 3 + continue yyl + case 10: + if (yyrecord.yylimit <= yyrecord.yycursor) { + yystate = 6 + continue yyl + } + yystate = 5 + continue yyl + default: + throw "internal lexer error" + } + } +} + + } +} + +function test(packets, expect) { + // Emulate a "pipe" by opening the same file for reading and writing. + let fname = "input" + let fw = fs.openSync(fname, 'w'); + let fr = fs.openSync(fname, 'r'); + + // Init lexer state. + let limit = BUFSIZE - 1 // exclude terminating null + let st = { + file: fr, + yyinput: Buffer.alloc(BUFSIZE), + yylimit: limit, + yycursor: limit, + yymarker: limit, + token: limit, + yystate: -1, + received: 0 + } + + // Main loop. The buffer contains incomplete data which appears packet by + // packet. When the lexer needs more input it saves its internal state and + // returns to the caller which should provide more input and resume lexing. 
+ let send = 0 + let status + loop: while (true) { + status = lex(st) + + if (status == END) { + log("done: got", st.received, "packets") + break loop + } else if (status == WAITING) { + log("waiting..."); + + if (send < packets.length) { + log("sent packet", send, packets[send]) + fs.writeFileSync(fw, packets[send]) + send += 1 + } + + status = fill(st) + log("queue:", st.yyinput.toString()) + if (status == BIG_PACKET) { + log("error: packet too big") + break loop + } + + if (status != READY) throw "expected READY" + } else { + if (status != BAD_PACKET) throw "expected BAD_PACKET" + log("error: ill-formed packet") + break loop + } + } + + // Check results. + if (status != expect) throw "unexpected status" + if (status == END && st.received != send) throw "unexpected packet count" + + // Cleanup. + fs.unlinkSync(fname, function(err){ if (err) throw err; }) +} + +function main() { + test([], END) + test(["zero;", "one;", "two;", "three;", "four;"], END) + test(["zer0;"], BAD_PACKET) + test(["goooooooooogle;"], BIG_PACKET) +} + +main() diff --git a/examples/php/state/push.re b/examples/php/state/push.re new file mode 100644 index 000000000..813d85904 --- /dev/null +++ b/examples/php/state/push.re @@ -0,0 +1,126 @@ +// re2js $INPUT -o $OUTPUT -f + +const fs = require('fs') + +// Use a small buffer to cover the case when a lexeme doesn't fit. +// In real world use a larger buffer. +const BUFSIZE = 10 +const DEBUG = false +const END = 0 +const READY = 1 +const WAITING = 2 +const BIG_PACKET = 3 +const BAD_PACKET = 4 + +function log() { + if (DEBUG) console.log.apply(console, arguments) +} + +function fill(st) { + // Error: lexeme too long. In real life could reallocate a larger buffer. + if (st.token < 1) return BIG_PACKET + + // Shift buffer contents (discard everything up to the current token).
+ st.yyinput.copy(st.yyinput, 0, st.token, st.yylimit) + st.yycursor -= st.token; + st.yymarker -= st.token; + st.yylimit -= st.token; + st.token = 0; + + // Read a new chunk of data from file and append it to `yyinput`. + let want = BUFSIZE - st.yylimit - 1 // -1 for sentinel + let nread = fs.readSync(st.file, st.yyinput, st.yylimit, want) + st.yylimit += nread + st.yyinput.writeUInt8(0, st.yylimit) // sentinel + + return READY +} + +function lex(yyrecord) { + loop: while (true) { + yyrecord.token = yyrecord.yycursor + /*!re2c + re2c:api = record; + re2c:YYPEEK = "readUInt8"; + re2c:YYFILL = "return WAITING"; + re2c:eof = 0; + + packet = [a-z]+[;]; + + * { return BAD_PACKET } + $ { return END } + packet { yyrecord.received += 1; continue loop } + */ + } +} + +function test(packets, expect) { + // Emulate a "pipe" by opening the same file for reading and writing. + let fname = "input" + let fw = fs.openSync(fname, 'w'); + let fr = fs.openSync(fname, 'r'); + + // Init lexer state. + let limit = BUFSIZE - 1 // exclude terminating null + let st = { + file: fr, + yyinput: Buffer.alloc(BUFSIZE), + yylimit: limit, + yycursor: limit, + yymarker: limit, + token: limit, + yystate: -1, + received: 0 + } + + // Main loop. The buffer contains incomplete data which appears packet by + // packet. When the lexer needs more input it saves its internal state and + // returns to the caller which should provide more input and resume lexing.
+ let send = 0 + let status + loop: while (true) { + status = lex(st) + + if (status == END) { + log("done: got", st.received, "packets") + break loop + } else if (status == WAITING) { + log("waiting..."); + + if (send < packets.length) { + log("sent packet", send, packets[send]) + fs.writeFileSync(fw, packets[send]) + send += 1 + } + + status = fill(st) + log("queue:", st.yyinput.toString()) + if (status == BIG_PACKET) { + log("error: packet too big") + break loop + } + + if (status != READY) throw "expected READY" + } else { + if (status != BAD_PACKET) throw "expected BAD_PACKET" + log("error: ill-formed packet") + break loop + } + } + + // Check results. + if (status != expect) throw "unexpected status" + if (status == END && st.received != send) throw "unexpected packet count" + + // Cleanup. + fs.unlinkSync(fname, function(err){ if (err) throw err; }) +} + +function main() { + test([], END) + test(["zero;", "one;", "two;", "three;", "four;"], END) + test(["zer0;"], BAD_PACKET) + test(["goooooooooogle;"], BIG_PACKET) +} + +main() diff --git a/examples/php/submatch/01_stags.js b/examples/php/submatch/01_stags.js new file mode 100644 index 000000000..e671755a1 --- /dev/null +++ b/examples/php/submatch/01_stags.js @@ -0,0 +1,231 @@ +// Generated by re2js +// re2js $INPUT -o $OUTPUT + +const assert = require('assert'); + +function parse(yyinput) { + let yycursor = 0 + + // Final tag variables available in semantic action. + let t1 +let t2 +let t3 +let t4 +let t5 + + + // Intermediate tag variables used by the lexer (must be autogenerated).
+ let yyt1 +let yyt2 +let yyt3 +let yyt4 + + + +{ + let yych = 0 + let yystate = 0 + yyl: while (true) { + switch (yystate) { + case 0: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yyt1 = yycursor; + yycursor += 1; + yystate = 3 + continue yyl + default: + yycursor += 1; + yystate = 1 + continue yyl + } + case 1: + yystate = 2 + continue yyl + case 2: + { return null } + case 3: + yymarker = yycursor; + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x2E: + yycursor += 1; + yystate = 4 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 6 + continue yyl + default: + yystate = 2 + continue yyl + } + case 4: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yyt2 = yycursor; + yycursor += 1; + yystate = 7 + continue yyl + default: + yystate = 5 + continue yyl + } + case 5: + yycursor = yymarker; + yystate = 2 + continue yyl + case 6: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x2E: + yycursor += 1; + yystate = 4 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 6 + continue yyl + default: + yystate = 5 + continue yyl + } + case 7: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x00: + yyt3 = yycursor; + yyt4 = -1; + yycursor += 1; + yystate = 8 + continue yyl + case 0x2E: + yyt3 = yycursor; + yycursor += 1; + yystate = 9 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 7 + 
continue yyl + default: + yystate = 5 + continue yyl + } + case 8: + t1 = yyt1; + t3 = yyt2; + t4 = yyt3; + t5 = yyt4; + t2 = yyt2; + t2 -= 1; + { + return { + major: Number(yyinput.substring(t1, t2)), + minor: Number(yyinput.substring(t3, t4)), + patch: t5 == -1 ? 0 : Number(yyinput.substring(t5, yycursor - 1)) + } + } + case 9: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yyt4 = yycursor; + yycursor += 1; + yystate = 10 + continue yyl + default: + yystate = 5 + continue yyl + } + case 10: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x00: + yycursor += 1; + yystate = 8 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 10 + continue yyl + default: + yystate = 5 + continue yyl + } + default: + throw "internal lexer error" + } + } +} + +} + +assert.deepEqual(parse("23.34\0"), {major: 23, minor: 34, patch: 0}) +assert.deepEqual(parse("1.2.99999\0"), {major: 1, minor: 2, patch: 99999}) +assert.deepEqual(parse("1.a\0"), null) diff --git a/examples/php/submatch/01_stags.re b/examples/php/submatch/01_stags.re new file mode 100644 index 000000000..964f3236f --- /dev/null +++ b/examples/php/submatch/01_stags.re @@ -0,0 +1,33 @@ +// re2js $INPUT -o $OUTPUT + +const assert = require('assert'); + +function parse(yyinput) { + let yycursor = 0 + + // Final tag variables available in semantic action. + /*!svars:re2c format = "let @@\n"; */ + + // Intermediate tag variables used by the lexer (must be autogenerated). + /*!stags:re2c format = "let @@\n"; */ + + /*!re2c + re2c:yyfill:enable = 0; + re2c:tags = 1; + + num = [0-9]+; + + @t1 num @t2 "." @t3 num @t4 ("." @t5 num)? [\x00] { + return { + major: Number(yyinput.substring(t1, t2)), + minor: Number(yyinput.substring(t3, t4)), + patch: t5 == -1 ? 
0 : Number(yyinput.substring(t5, yycursor - 1)) + } + } + * { return null } + */ +} + +assert.deepEqual(parse("23.34\0"), {major: 23, minor: 34, patch: 0}) +assert.deepEqual(parse("1.2.99999\0"), {major: 1, minor: 2, patch: 99999}) +assert.deepEqual(parse("1.a\0"), null) diff --git a/examples/php/submatch/01_stags_fill.js b/examples/php/submatch/01_stags_fill.js new file mode 100644 index 000000000..879a70963 --- /dev/null +++ b/examples/php/submatch/01_stags_fill.js @@ -0,0 +1,333 @@ +// Generated by re2js +// re2js $INPUT -o $OUTPUT + +const assert = require('assert'); +const fs = require('fs') + +const BUFSIZE = 4096 +const OK = 0 +const EOF = 1 +const LONG_LEXEME = 2 + +function fill(st) { + if (st.eof) return EOF + + // Error: lexeme too long. In real life could reallocate a larger buffer. + if (st.token < 1) return LONG_LEXEME + + // Shift buffer contents (discard everything up to the current token). + st.yyinput.copy(st.yyinput, 0, st.token, st.yylimit) + st.yycursor -= st.token; + st.yymarker -= st.token; + st.yylimit -= st.token; + if (st.yyt1 != -1) st.yyt1 -= st.token +if (st.yyt2 != -1) st.yyt2 -= st.token +if (st.yyt3 != -1) st.yyt3 -= st.token + + st.token = 0; + + // Read a new chunk of data from file and append it to `yyinput`. + let want = BUFSIZE - st.yylimit - 1 // -1 for sentinel + let nread = fs.readSync(st.file, st.yyinput, st.yylimit, want) + st.eof = nread < want // end of file? + st.yylimit += nread + st.yyinput.writeUInt8(0, st.yylimit) // sentinel + + return OK +} + +function lex(st) { + let vers = [] + loop: while (true) { + st.token = st.yycursor + + // Final tag variables available in semantic action. 
+ let t1 +let t2 +let t3 +let t4 + + + +{ + let yych = 0 + let yystate = 0 + yyl: while (true) { + switch (yystate) { + case 0: + yych = st.yyinput.readUInt8(st.yycursor) + switch (yych) { + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + st.yycursor += 1; + yystate = 3 + continue yyl + default: + if (st.yylimit <= st.yycursor) { + if (fill(st) == OK) { + yystate = 0 + continue yyl + } + yystate = 11 + continue yyl + } + st.yycursor += 1; + yystate = 1 + continue yyl + } + case 1: + yystate = 2 + continue yyl + case 2: + { return null } + case 3: + st.yymarker = st.yycursor; + yych = st.yyinput.readUInt8(st.yycursor) + switch (yych) { + case 0x2E: + st.yycursor += 1; + yystate = 4 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + st.yycursor += 1; + yystate = 6 + continue yyl + default: + if (st.yylimit <= st.yycursor) { + if (fill(st) == OK) { + yystate = 3 + continue yyl + } + } + yystate = 2 + continue yyl + } + case 4: + yych = st.yyinput.readUInt8(st.yycursor) + switch (yych) { + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + st.yyt1 = st.yycursor; + st.yycursor += 1; + yystate = 7 + continue yyl + default: + if (st.yylimit <= st.yycursor) { + if (fill(st) == OK) { + yystate = 4 + continue yyl + } + } + yystate = 5 + continue yyl + } + case 5: + st.yycursor = st.yymarker; + yystate = 2 + continue yyl + case 6: + yych = st.yyinput.readUInt8(st.yycursor) + switch (yych) { + case 0x2E: + st.yycursor += 1; + yystate = 4 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + st.yycursor += 1; + yystate = 6 + continue yyl + default: + if (st.yylimit <= st.yycursor) { + if (fill(st) == OK) { + yystate = 6 + continue yyl 
+ } + } + yystate = 5 + continue yyl + } + case 7: + yych = st.yyinput.readUInt8(st.yycursor) + switch (yych) { + case 0x0A: + st.yyt2 = st.yycursor; + st.yyt3 = -1; + st.yycursor += 1; + yystate = 8 + continue yyl + case 0x2E: + st.yyt2 = st.yycursor; + st.yycursor += 1; + yystate = 9 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + st.yycursor += 1; + yystate = 7 + continue yyl + default: + if (st.yylimit <= st.yycursor) { + if (fill(st) == OK) { + yystate = 7 + continue yyl + } + } + yystate = 5 + continue yyl + } + case 8: + t2 = st.yyt1; + t3 = st.yyt2; + t4 = st.yyt3; + t1 = st.yyt1; + t1 -= 1; + { + vers.push({ + major: Number(st.yyinput.subarray(st.token, t1)), + minor: Number(st.yyinput.subarray(t2, t3)), + patch: t4 == -1 ? 0 : Number(st.yyinput.subarray(t4, st.yycursor - 1)) + }) + continue loop + } + case 9: + yych = st.yyinput.readUInt8(st.yycursor) + switch (yych) { + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + st.yyt3 = st.yycursor; + st.yycursor += 1; + yystate = 10 + continue yyl + default: + if (st.yylimit <= st.yycursor) { + if (fill(st) == OK) { + yystate = 9 + continue yyl + } + } + yystate = 5 + continue yyl + } + case 10: + yych = st.yyinput.readUInt8(st.yycursor) + switch (yych) { + case 0x0A: + st.yycursor += 1; + yystate = 8 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + st.yycursor += 1; + yystate = 10 + continue yyl + default: + if (st.yylimit <= st.yycursor) { + if (fill(st) == OK) { + yystate = 10 + continue yyl + } + } + yystate = 5 + continue yyl + } + case 11: + { return vers } + default: + throw "internal lexer error" + } + } +} + + } +} + +function main() { + let fname = "input" + + // Create input file. 
+ let content = "1.22.333\n".repeat(BUFSIZE) + fs.writeFileSync(fname, content, function(err) { if (err) throw err; }) + + // Init lexer state. + let limit = BUFSIZE - 1 // exclude terminating null + let st = { + file: fs.openSync(fname, 'r'), + yyinput: Buffer.alloc(BUFSIZE), + yylimit: limit, + yycursor: limit, + yymarker: limit, + token: limit, + // Intermediate tag variables used by the lexer (must be autogenerated). + yyt1: -1, +yyt2: -1, +yyt3: -1, + + eof: false + } + + // Run lexer on the prepared file. + assert.deepEqual(lex(st), Array(BUFSIZE).fill({major: 1, minor: 22, patch: 333})) + + // Cleanup. + fs.unlink(fname, function(err){ if (err) throw err; }) +} + +main() diff --git a/examples/php/submatch/01_stags_fill.re b/examples/php/submatch/01_stags_fill.re new file mode 100644 index 000000000..ca4d73dfa --- /dev/null +++ b/examples/php/submatch/01_stags_fill.re @@ -0,0 +1,95 @@ +// re2js $INPUT -o $OUTPUT + +const assert = require('assert'); +const fs = require('fs') + +const BUFSIZE = 4096 +const OK = 0 +const EOF = 1 +const LONG_LEXEME = 2 + +function fill(st) { + if (st.eof) return EOF + + // Error: lexeme too long. In real life could reallocate a larger buffer. + if (st.token < 1) return LONG_LEXEME + + // Shift buffer contents (discard everything up to the current token). + st.yyinput.copy(st.yyinput, 0, st.token, st.yylimit) + st.yycursor -= st.token; + st.yymarker -= st.token; + st.yylimit -= st.token; + /*!stags:re2c format = "if (st.@@ != -1) st.@@ -= st.token\n"; */ + st.token = 0; + + // Read a new chunk of data from file and append it to `yyinput`. + let want = BUFSIZE - st.yylimit - 1 // -1 for sentinel + let nread = fs.readSync(st.file, st.yyinput, st.yylimit, want) + st.eof = nread < want // end of file? + st.yylimit += nread + st.yyinput.writeUInt8(0, st.yylimit) // sentinel + + return OK +} + +function lex(st) { + let vers = [] + loop: while (true) { + st.token = st.yycursor + + // Final tag variables available in semantic action. 
+ /*!svars:re2c format = "let @@\n"; */ + + /*!re2c + re2c:api = record; + re2c:yyrecord = st; + re2c:YYPEEK = "readUInt8"; + re2c:YYFILL = "fill(st) == OK"; + re2c:eof = 0; + re2c:tags = 1; + + num = [0-9]+; + + num @t1 "." @t2 num @t3 ("." @t4 num)? [\n] { + vers.push({ + major: Number(st.yyinput.subarray(st.token, t1)), + minor: Number(st.yyinput.subarray(t2, t3)), + patch: t4 == -1 ? 0 : Number(st.yyinput.subarray(t4, st.yycursor - 1)) + }) + continue loop + } + $ { return vers } + * { return null } + */ + } +} + +function main() { + let fname = "input" + + // Create input file. + let content = "1.22.333\n".repeat(BUFSIZE) + fs.writeFileSync(fname, content, function(err) { if (err) throw err; }) + + // Init lexer state. + let limit = BUFSIZE - 1 // exclude terminating null + let st = { + file: fs.openSync(fname, 'r'), + yyinput: Buffer.alloc(BUFSIZE), + yylimit: limit, + yycursor: limit, + yymarker: limit, + token: limit, + // Intermediate tag variables used by the lexer (must be autogenerated). + /*!stags:re2c format = "@@: -1,\n"; */ + eof: false + } + + // Run lexer on the prepared file. + assert.deepEqual(lex(st), Array(BUFSIZE).fill({major: 1, minor: 22, patch: 333})) + + // Cleanup. + fs.unlink(fname, function(err){ if (err) throw err; }) +} + +main() diff --git a/examples/php/submatch/02_mtags.js b/examples/php/submatch/02_mtags.js new file mode 100644 index 000000000..50514ed03 --- /dev/null +++ b/examples/php/submatch/02_mtags.js @@ -0,0 +1,200 @@ +// Generated by re2js +// re2js $INPUT -o $OUTPUT + +const assert = require('assert') + +function parse(yyinput) { + let yycursor = 0 + + // Final tag variables available in semantic action. + let t1 +let t2 + + let t3 +let t4 + + + // Intermediate tag variables used by the lexer (must be autogenerated). 
+ let yyt1 +let yyt2 + + let yytm3 = [] +let yytm4 = [] + + + +{ + let yych = 0 + let yystate = 0 + yyl: while (true) { + switch (yystate) { + case 0: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yyt1 = yycursor; + yycursor += 1; + yystate = 3 + continue yyl + default: + yycursor += 1; + yystate = 1 + continue yyl + } + case 1: + yystate = 2 + continue yyl + case 2: + { return null } + case 3: + yymarker = yycursor; + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x00: + + + yyt2 = yycursor; + yycursor += 1; + yystate = 4 + continue yyl + case 0x2E: + yyt2 = yycursor; + yycursor += 1; + yystate = 5 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 7 + continue yyl + default: + yystate = 2 + continue yyl + } + case 4: + t1 = yyt1; + t2 = yyt2; + t3 = yytm3; + t4 = yytm4; + { + let vers = [Number(yyinput.substring(t1, t2))] + for (let i = 0; i < t3.length; ++i) { + vers.push(Number(yyinput.substring(t3[i], t4[i]))) + } + return vers + } + case 5: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yytm3.push(yycursor) + yycursor += 1; + yystate = 8 + continue yyl + default: + yystate = 6 + continue yyl + } + case 6: + yycursor = yymarker; + yystate = 2 + continue yyl + case 7: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x00: + + + yyt2 = yycursor; + yycursor += 1; + yystate = 4 + continue yyl + case 0x2E: + yyt2 = yycursor; + yycursor += 1; + yystate = 5 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 7 + continue 
yyl + default: + yystate = 6 + continue yyl + } + case 8: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x00: + yytm4.push(yycursor) + yycursor += 1; + yystate = 4 + continue yyl + case 0x2E: + yytm4.push(yycursor) + yycursor += 1; + yystate = 5 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 8 + continue yyl + default: + yystate = 6 + continue yyl + } + default: + throw "internal lexer error" + } + } +} + +} + +assert.deepEqual(parse("1\0"), [1]) +assert.deepEqual(parse("1.2.3.4.5.6.7\0"), [1, 2, 3, 4, 5, 6, 7]) +assert.deepEqual(parse("1.2.\0"), null) diff --git a/examples/php/submatch/02_mtags.re b/examples/php/submatch/02_mtags.re new file mode 100644 index 000000000..425d1a418 --- /dev/null +++ b/examples/php/submatch/02_mtags.re @@ -0,0 +1,37 @@ +// re2js $INPUT -o $OUTPUT + +const assert = require('assert') + +function parse(yyinput) { + let yycursor = 0 + + // Final tag variables available in semantic action. + /*!svars:re2c format = "let @@\n"; */ + /*!mvars:re2c format = "let @@\n"; */ + + // Intermediate tag variables used by the lexer (must be autogenerated). + /*!stags:re2c format = "let @@\n"; */ + /*!mtags:re2c format = "let @@ = []\n"; */ + + /*!re2c + re2c:YYMTAGP = "@@.push(yycursor)"; + re2c:YYMTAGN = ""; // do nothing + re2c:yyfill:enable = 0; + re2c:tags = 1; + + num = [0-9]+; + + @t1 num @t2 ("." 
#t3 num #t4)* [\x00] { + let vers = [Number(yyinput.substring(t1, t2))] + for (let i = 0; i < t3.length; ++i) { + vers.push(Number(yyinput.substring(t3[i], t4[i]))) + } + return vers + } + * { return null } + */ +} + +assert.deepEqual(parse("1\0"), [1]) +assert.deepEqual(parse("1.2.3.4.5.6.7\0"), [1, 2, 3, 4, 5, 6, 7]) +assert.deepEqual(parse("1.2.\0"), null) diff --git a/examples/php/submatch/03_captures.js b/examples/php/submatch/03_captures.js new file mode 100644 index 000000000..0b0a210ee --- /dev/null +++ b/examples/php/submatch/03_captures.js @@ -0,0 +1,231 @@ +// Generated by re2js +// re2js $INPUT -o $OUTPUT + +const assert = require('assert'); + +function parse(yyinput) { + let yycursor = 0 + + // Final tag variables available in semantic action. + let yytl0 +let yytl1 +let yytl2 +let yytl3 +let yytr0 +let yytr1 +let yytr2 +let yytr3 + + + // Intermediate tag variables used by the lexer (must be autogenerated). + let yyt1 +let yyt2 +let yyt3 +let yyt4 +let yyt5 + + + +{ + let yych = 0 + let yystate = 0 + yyl: while (true) { + switch (yystate) { + case 0: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yyt1 = yycursor; + yycursor += 1; + yystate = 3 + continue yyl + default: + yycursor += 1; + yystate = 1 + continue yyl + } + case 1: + yystate = 2 + continue yyl + case 2: + { return null } + case 3: + yymarker = yycursor; + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x2E: + yycursor += 1; + yystate = 4 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 6 + continue yyl + default: + yystate = 2 + continue yyl + } + case 4: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + 
case 0x38: + case 0x39: + yyt2 = yycursor; + yycursor += 1; + yystate = 7 + continue yyl + default: + yystate = 5 + continue yyl + } + case 5: + yycursor = yymarker; + yystate = 2 + continue yyl + case 6: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x2E: + yycursor += 1; + yystate = 4 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 6 + continue yyl + default: + yystate = 5 + continue yyl + } + case 7: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x00: + yyt3 = yycursor; + yyt4 = -1; + yyt5 = -1; + yycursor += 1; + yystate = 8 + continue yyl + case 0x2E: + yyt3 = yycursor; + yyt5 = yycursor; + yycursor += 1; + yystate = 9 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 7 + continue yyl + default: + yystate = 5 + continue yyl + } + case 8: + yytl1 = yyt1; + yytl2 = yyt2; + yytr2 = yyt3; + yytl3 = yyt5; + yytr3 = yyt4; + yytl0 = yyt1; + yytr0 = yycursor; + yytr1 = yyt2; + yytr1 -= 1; + { + return { + major: Number(yyinput.substring(yytl1, yytr1)), + minor: Number(yyinput.substring(yytl2, yytr2)), + patch: yytl3 == -1 ? 
0 : Number(yyinput.substring(yytl3 + 1, yytr3)) + } + } + case 9: + yych = yyinput.charCodeAt(yycursor) + if (yych <= 0x00) { + yystate = 5 + continue yyl + } + yystate = 11 + continue yyl + case 10: + yych = yyinput.charCodeAt(yycursor) + yystate = 11 + continue yyl + case 11: + switch (yych) { + case 0x00: + yyt4 = yycursor; + yycursor += 1; + yystate = 8 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 10 + continue yyl + default: + yystate = 5 + continue yyl + } + default: + throw "internal lexer error" + } + } +} + +} + +assert.deepEqual(parse("23.34\0"), {major: 23, minor: 34, patch: 0}) +assert.deepEqual(parse("1.2.99999\0"), {major: 1, minor: 2, patch: 99999}) +assert.deepEqual(parse("1.a\0"), null) diff --git a/examples/php/submatch/03_captures.re b/examples/php/submatch/03_captures.re new file mode 100644 index 000000000..75302003a --- /dev/null +++ b/examples/php/submatch/03_captures.re @@ -0,0 +1,33 @@ +// re2js $INPUT -o $OUTPUT + +const assert = require('assert'); + +function parse(yyinput) { + let yycursor = 0 + + // Final tag variables available in semantic action. + /*!svars:re2c format = "let @@\n"; */ + + // Intermediate tag variables used by the lexer (must be autogenerated). + /*!stags:re2c format = "let @@\n"; */ + + /*!re2c + re2c:yyfill:enable = 0; + re2c:captvars = 1; + + num = [0-9]+; + + (num) "." (num) ("." num)? [\x00] { + return { + major: Number(yyinput.substring(yytl1, yytr1)), + minor: Number(yyinput.substring(yytl2, yytr2)), + patch: yytl3 == -1 ? 
0 : Number(yyinput.substring(yytl3 + 1, yytr3)) + } + } + * { return null } + */ +} + +assert.deepEqual(parse("23.34\0"), {major: 23, minor: 34, patch: 0}) +assert.deepEqual(parse("1.2.99999\0"), {major: 1, minor: 2, patch: 99999}) +assert.deepEqual(parse("1.a\0"), null) diff --git a/examples/php/submatch/04_posix_captures.js b/examples/php/submatch/04_posix_captures.js new file mode 100644 index 000000000..bfb1c0fd0 --- /dev/null +++ b/examples/php/submatch/04_posix_captures.js @@ -0,0 +1,235 @@ +// Generated by re2js +// re2js $INPUT -o $OUTPUT + +const assert = require('assert'); + +// Maximum number of capturing groups among all rules. +const YYMAXNMATCH = 4 + + +function parse(yyinput) { + let yycursor = 0 + + // A list for capturing parentheses (twice the number of groups). + let yynmatch + let yypmatch = Array(YYMAXNMATCH * 2).fill(null) + + // Intermediate tag variables used by the lexer (must be autogenerated). + let yyt1 +let yyt2 +let yyt3 +let yyt4 +let yyt5 + + + +{ + let yych = 0 + let yystate = 0 + yyl: while (true) { + switch (yystate) { + case 0: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yyt1 = yycursor; + yycursor += 1; + yystate = 3 + continue yyl + default: + yycursor += 1; + yystate = 1 + continue yyl + } + case 1: + yystate = 2 + continue yyl + case 2: + { return null } + case 3: + yymarker = yycursor; + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x2E: + yycursor += 1; + yystate = 4 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 6 + continue yyl + default: + yystate = 2 + continue yyl + } + case 4: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + 
case 0x38: + case 0x39: + yyt2 = yycursor; + yycursor += 1; + yystate = 7 + continue yyl + default: + yystate = 5 + continue yyl + } + case 5: + yycursor = yymarker; + yystate = 2 + continue yyl + case 6: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x2E: + yycursor += 1; + yystate = 4 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 6 + continue yyl + default: + yystate = 5 + continue yyl + } + case 7: + yych = yyinput.charCodeAt(yycursor) + switch (yych) { + case 0x00: + yyt3 = yycursor; + yyt4 = -1; + yyt5 = -1; + yycursor += 1; + yystate = 8 + continue yyl + case 0x2E: + yyt3 = yycursor; + yyt5 = yycursor; + yycursor += 1; + yystate = 9 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 7 + continue yyl + default: + yystate = 5 + continue yyl + } + case 8: + yynmatch = 4 + yypmatch[2] = yyt1; + yypmatch[4] = yyt2; + yypmatch[5] = yyt3; + yypmatch[6] = yyt5; + yypmatch[7] = yyt4; + yypmatch[0] = yyt1; + yypmatch[1] = yycursor; + yypmatch[3] = yyt2; + yypmatch[3] -= 1; + { + // `yynmatch` is the number of capturing groups + assert.equal(yynmatch, 4) + + // Even `yypmatch` values are for opening parentheses, odd values + // are for closing parentheses, the first group is the whole match. + return { + major: Number(yyinput.substring(yypmatch[2], yypmatch[3])), + minor: Number(yyinput.substring(yypmatch[4], yypmatch[5])), + patch: yypmatch[6] == -1 ? 
0 + : Number(yyinput.substring(yypmatch[6] + 1, yypmatch[7])) + } + } + case 9: + yych = yyinput.charCodeAt(yycursor) + if (yych <= 0x00) { + yystate = 5 + continue yyl + } + yystate = 11 + continue yyl + case 10: + yych = yyinput.charCodeAt(yycursor) + yystate = 11 + continue yyl + case 11: + switch (yych) { + case 0x00: + yyt4 = yycursor; + yycursor += 1; + yystate = 8 + continue yyl + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + yycursor += 1; + yystate = 10 + continue yyl + default: + yystate = 5 + continue yyl + } + default: + throw "internal lexer error" + } + } +} + +} + +assert.deepEqual(parse("23.34\0"), {major: 23, minor: 34, patch: 0}) +assert.deepEqual(parse("1.2.99999\0"), {major: 1, minor: 2, patch: 99999}) +assert.deepEqual(parse("1.a\0"), null) diff --git a/examples/php/submatch/04_posix_captures.re b/examples/php/submatch/04_posix_captures.re new file mode 100644 index 000000000..92b16640b --- /dev/null +++ b/examples/php/submatch/04_posix_captures.re @@ -0,0 +1,43 @@ +// re2js $INPUT -o $OUTPUT + +const assert = require('assert'); + +// Maximum number of capturing groups among all rules. +/*!maxnmatch:re2c*/ + +function parse(yyinput) { + let yycursor = 0 + + // A list for capturing parentheses (twice the number of groups). + let yynmatch + let yypmatch = Array(YYMAXNMATCH * 2).fill(null) + + // Intermediate tag variables used by the lexer (must be autogenerated). + /*!stags:re2c format = "let @@\n"; */ + + /*!re2c + re2c:yyfill:enable = 0; + re2c:posix-captures = 1; + + num = [0-9]+; + + (num) "." (num) ("." num)? [\x00] { + // `yynmatch` is the number of capturing groups + assert.equal(yynmatch, 4) + + // Even `yypmatch` values are for opening parentheses, odd values + // are for closing parentheses, the first group is the whole match. 
+ return { + major: Number(yyinput.substring(yypmatch[2], yypmatch[3])), + minor: Number(yyinput.substring(yypmatch[4], yypmatch[5])), + patch: yypmatch[6] == -1 ? 0 + : Number(yyinput.substring(yypmatch[6] + 1, yypmatch[7])) + } + } + * { return null } + */ +} + +assert.deepEqual(parse("23.34\0"), {major: 23, minor: 34, patch: 0}) +assert.deepEqual(parse("1.2.99999\0"), {major: 1, minor: 2, patch: 99999}) +assert.deepEqual(parse("1.a\0"), null) diff --git a/include/syntax/php b/include/syntax/php new file mode 100644 index 000000000..f84f1b13d --- /dev/null +++ b/include/syntax/php @@ -0,0 +1,381 @@ +// supported feature lists ----------------------------------------------------- + +supported_apis = ["simple", "generic", "record"]; +supported_api_styles = ["free-form"]; +supported_code_models = ["loop-switch"]; +supported_targets = ["code", "dot"]; +supported_features = ["nested-ifs", "bitmaps", "tags", "captures", "captvars"]; + + +// language-specific options --------------------------------------------------- + +semicolons = 0; +backtick_quoted_strings = 1; +single_quoted_strings = 1; +indentation_sensitive = 0; +wrap_blocks_in_braces = 1; + + +// immutable configurations (command-line only options) ------------------------ + +re2c:target = code; +re2c:code-model = loop-switch; +re2c:input-encoding = ascii; +re2c:date = 1; +re2c:version = 1; +re2c:conditions = 0; +re2c:storable-state = 0; +re2c:flex-syntax = 0; +re2c:verbose = 0; +re2c:line-dirs = 0; + + +// mutable configurations ------------------------------------------------------ + +re2c:api = simple; +re2c:api:style = free-form; +re2c:api:sigil = "@@"; +re2c:YYGETCOND:naked = 0; +re2c:YYSETCOND:naked = 0; +re2c:YYSETCOND@cond = "@@"; +re2c:YYGETSTATE:naked = 0; +re2c:YYSETSTATE:naked = 0; +re2c:YYSETSTATE@state = "@@"; +re2c:YYFILL@len = "@@"; +re2c:YYFILL:naked = 0; +re2c:YYFN = [";"]; +re2c:yyfn:sep = ";"; +re2c:yycond = "yycond"; +re2c:yyctable = "yyctable"; +re2c:yyaccept = "yyaccept"; +re2c:yytarget 
= "yytarget"; +re2c:yystate = "yystate"; +re2c:yynmatch = "yynmatch"; +re2c:yypmatch = "yypmatch"; +re2c:yyrecord = "yyrecord"; +re2c:yych = "yych"; +re2c:yych:conversion = 0; +re2c:yych:literals = hex; +re2c:yych:emit = 1; +re2c:yybm = "yybm"; +re2c:yybm:hex = 0; +re2c:yyfill = ""; +re2c:yystable = ""; // deprecated +re2c:header = ""; +re2c:eof = -1; +re2c:sentinel = -1; +re2c:yyfill:enable = 1; +re2c:yyfill:parameter = 1; +re2c:yyfill:check = 1; +re2c:tags = 0; +re2c:tags:prefix = "yyt"; +re2c:captures = 0; +re2c:captvars = 0; +re2c:posix-captures = 0; +re2c:posix-captvars = 0; +re2c:invert-captures = 0; +re2c:cond:abort = 0; +re2c:cond:prefix = "yyc_"; +re2c:cond:enumprefix = "YYC_"; +re2c:cond:divider@cond = "@@"; +re2c:cond:goto@cond = "@@"; +re2c:state:abort = 1; +re2c:state:nextlabel = 0; +re2c:bit-vectors = 0; +re2c:debug-output = 0; +re2c:computed-gotos = 0; +re2c:computed-gotos:threshold = 9; +re2c:nested-ifs = 0; +re2c:case-insensitive = 0; +re2c:case-inverted = 0; +re2c:case-ranges = 0; +re2c:unsafe = 0; +re2c:monadic = 0; +re2c:encoding:ebcdic = 0; +re2c:encoding:utf32 = 0; +re2c:encoding:ucs2 = 0; +re2c:encoding:utf16 = 0; +re2c:encoding:utf8 = 0; +re2c:encoding-policy = ignore; +re2c:empty-class = match-empty; +re2c:indent:string = " "; +re2c:indent:top = 0; +re2c:label:prefix = ""; +re2c:label:yyfill = ""; +re2c:label:yyloop = "yyl"; +re2c:label:yyNext = ""; +re2c:label:start = 0; + + +// mutable code configuration -------------------------------------------------- + +re2c:YYBACKUP = "yybackup"; +re2c:YYBACKUPCTX = "yybackupctx"; +re2c:YYCONDTYPE = "YYCond"; +re2c:YYCOPYMTAG = sigil "{lhs} = " sigil "{rhs}"; +re2c:YYCOPYSTAG = sigil "{lhs} = " sigil "{rhs}"; +re2c:YYCTYPE = "YYChar"; +re2c:YYCTXMARKER = (.api.record ? yyrecord ".") "yyctxmarker"; +re2c:YYCURSOR = (.api.record ? 
yyrecord ".") "yycursor"; +re2c:YYDEBUG = "yydebug"; +re2c:YYFILL = "yyfill"; +re2c:YYGETACCEPT = sigil "{var}"; +re2c:YYGETCOND = "yygetcond"; +re2c:YYGETSTATE = "yygetstate"; +re2c:YYINPUT = (.api.record ? yyrecord ".") "yyinput"; +re2c:YYLESSTHAN = "yylessthan"; +re2c:YYLIMIT = (.api.record ? yyrecord ".") "yylimit"; +re2c:YYMARKER = (.api.record ? yyrecord ".") "yymarker"; +re2c:YYMAXFILL = "YYMAXFILL"; +re2c:YYMAXNMATCH = "YYMAXNMATCH"; +re2c:YYMTAGN = "yymtagn"; +re2c:YYMTAGP = "yymtagp"; +re2c:YYPEEK = (.api.generic ? "yypeek" : "charCodeAt"); +re2c:YYRESTORE = "yyrestore"; +re2c:YYRESTORECTX = "yyrestorectx"; +re2c:YYRESTORETAG = "yyrestoretag"; +re2c:YYSETACCEPT = sigil "{var} = " sigil "{val}"; +re2c:YYSETCOND = "yysetcond"; +re2c:YYSETSTATE = "yysetstate"; +re2c:YYSHIFT = "yyshift"; +re2c:YYSHIFTSTAG = "yyshiftstag"; +re2c:YYSHIFTMTAG = "yyshiftmtag"; +re2c:YYSKIP = "yyskip"; +re2c:YYSTAGN = "yystagn"; +re2c:YYSTAGP = "yystagp"; +re2c:tags:expression = (.api.record ? yyrecord ".") sigil; +re2c:tags:negative = (.api.generic ? 
"@@" : "-1"); +re2c:cond:divider = ""; +re2c:cond:goto = ""; + + +// code templates -------------------------------------------------------------- + +code:var_local = topindent "let " name " = " init nl; +code:var_global = code:var_local; + +code:const_local = topindent "const " name " = " init nl; +code:const_global = code:const_local; + +code:array_local = + topindent "const " name " = [" nl indent + [row: topindent [elem{0:-2}: elem ", "] [elem{-1}: elem ","] nl] + dedent topindent "]" nl; + +code:array_global = ; + +code:array_elem = array "[" index "]"; + +code:enum = [elem: topindent "const " elem " = " init nl]; + +code:enum_elem = name; + +code:assign = topindent lhs " = " rhs nl; + +code:type_int = ; +code:type_uint = ; +code:type_yybm = ; +code:type_yytarget = ; + +code:cmp_eq = "==="; +code:cmp_ne = "!=="; +code:cmp_lt = "<"; +code:cmp_gt = ">"; +code:cmp_le = "<="; +code:cmp_ge = ">="; + +code:if_then_else = + [branch{0}: topindent "if (" cond ") {" nl + indent [stmt: stmt] dedent] + [branch{1:-1}: topindent "} else " (.cond ? "if (" cond ") ") "{" nl + indent [stmt: stmt] dedent] + topindent "}" nl; + +code:if_then_else_oneline = ; + +code:switch = + topindent "switch (" expr ") {" nl + indent [case: case] dedent + topindent "}" nl; + +code:switch_cases = + [case: case nl] + indent [stmt: stmt] dedent; + +code:switch_cases_oneline = ; + +code:switch_case_range = + [val{0:-2}: topindent "case " val ":" nl] + [val{-1}: topindent "case " val ":"]; + +code:switch_case_default = + topindent "default:"; + +code:loop = + topindent (.loop_label ? label ": ") "while (true) {" nl + indent [stmt: stmt] dedent + topindent "}" nl; + +code:continue = topindent "continue" (.loop_label ? " " label) nl; + +code:goto = ; + +code:fndecl = ; +code:fndef = ; +code:fncall = ; +code:tailcall = ; +code:recursive_functions = ; + +code:line_info = ; + +code:fingerprint = + "// Generated by re2js" (.version ? " " version) (.date ? 
" on " date) nl; + +code:abort = topindent "throw \"internal lexer error\"" nl; + +code:yydebug = + topindent (.api.generic + ? YYDEBUG + : (.api.record + ? YYDEBUG "(" yyrecord ");" + : YYDEBUG "(" state ", " yych ");" + )) nl; + +code:yypeek = + topindent (.code_model.recursive_functions ? YYCTYPE " ") yych " = " (.api.generic + ? YYPEEK + : YYINPUT "." YYPEEK "(" YYCURSOR ")" + ) nl; + +code:yyskip = + topindent (.api.generic + ? YYSKIP + : YYCURSOR " += 1;" + ) nl; + +code:yybackup = + topindent (.api.generic + ? YYBACKUP + : YYMARKER " = " YYCURSOR ";" + ) nl; + +code:yybackupctx = + topindent (.api.generic + ? YYBACKUPCTX + : YYCTXMARKER " = " YYCURSOR ";" + ) nl; + +code:yyskip_yypeek = ; +code:yypeek_yyskip = ; +code:yyskip_yybackup = ; +code:yybackup_yyskip = ; +code:yybackup_yypeek = ; +code:yyskip_yybackup_yypeek = ; +code:yybackup_yypeek_yyskip = ; + +code:yyrestore = + topindent (.api.generic + ? YYRESTORE + : YYCURSOR " = " YYMARKER ";" + ) nl; + +code:yyrestorectx = + topindent (.api.generic + ? YYRESTORECTX + : YYCURSOR " = " YYCTXMARKER ";" + ) nl; + +code:yyrestoretag = + topindent (.api.generic + ? YYRESTORETAG + : YYCURSOR " = " tag ";" + ) nl; + +code:yyshift = + topindent (.api.generic + ? YYSHIFT + : YYCURSOR " -= " offset ";" + ) nl; + +code:yyshiftstag = + topindent (.nested ? "if (" tag " != " neg ") ") (.api.generic + ? YYSHIFTSTAG + : tag " -= " offset ";" + ) nl; + +code:yyshiftmtag = + topindent YYSHIFTMTAG nl; + +code:yystagp = + topindent (.api.generic + ? YYSTAGP + : tag " = " YYCURSOR ";" + ) nl; + +code:yymtagp = + topindent YYMTAGP nl; + +code:yystagn = + topindent (.api.generic + ? YYSTAGN + : tag " = " neg ";" + ) nl; + +code:yymtagn = + topindent YYMTAGN nl; + +code:yycopystag = + topindent (.api.generic + ? YYCOPYSTAG + : lhs " = " rhs ";" + ) nl; + +code:yycopymtag = + topindent (.api.generic + ? YYCOPYMTAG + : lhs " = " rhs ";" + ) nl; + +code:yygetaccept = + (.api.generic + ? 
YYGETACCEPT + : (.api.record & .storable_state ? yyrecord ".") var); + +code:yysetaccept = + topindent (.api.generic + ? YYSETACCEPT + : (.api.record & .storable_state ? yyrecord ".") var " = " val ";" + ) nl; + +code:yygetcond = + (.api.generic + ? YYGETCOND + : (.api.record ? yyrecord ".") var); + +code:yysetcond = + topindent (.api.generic + ? YYSETCOND + : (.api.record ? yyrecord ".") var " = " val ";" + ) nl; + +code:yygetstate = + (.api.generic + ? YYGETSTATE + : (.api.record ? yyrecord ".") var); + +code:yysetstate = + topindent (.api.generic + ? YYSETSTATE + : (.api.record ? yyrecord ".") var " = " val ";" + ) nl; + +code:yylessthan = + (.api.generic + ? YYLESSTHAN + : (.many + ? "(" YYLIMIT " - " YYCURSOR ") < " need + : YYLIMIT " <= " YYCURSOR)); + +code:yybm_filter = yych " & ~0xFF"; + +code:yybm_match = "(" yybm "[" offset "+" yych "] & " mask ") !== 0"; diff --git a/run_tests.py.in b/run_tests.py.in index c3c6d676a..4efe136cf 100644 --- a/run_tests.py.in +++ b/run_tests.py.in @@ -299,7 +299,7 @@ def copy_tests(tests, dst): # type: (list, str) -> None def clean_test_tree(path): # type: (str) -> None """Clean the test tree from the not needed files.""" files = set([str(f) for f in Path(path).rglob('*.*')]) - for ext in ['re', 'c', 'h', 'd', 'go', 'hs', 'java', 'js', 'ml', 'py', 'rs', 'v', 'zig', 'inc']: + for ext in ['re', 'c', 'h', 'd', 'go', 'hs', 'java', 'js', 'ml', 'php', 'py', 'rs', 'v', 'zig', 'inc']: files -= set([str(f) for f in Path(path).rglob(f'*.{ext}')]) remove(*[f for f in files if os.path.isfile(f)]) diff --git a/src/constants.h b/src/constants.h index 04f5741bb..8fbab9895 100644 --- a/src/constants.h +++ b/src/constants.h @@ -20,6 +20,7 @@ enum class Lang: uint32_t { JAVA, JS, OCAML, + PHP, PYTHON, RUST, V, diff --git a/src/options/parse_opts.re b/src/options/parse_opts.re index c2631c26d..885231b6a 100644 --- a/src/options/parse_opts.re +++ b/src/options/parse_opts.re @@ -247,6 +247,7 @@ opt_lang: /*!local:re2c "java" end { *lang = 
Lang::JAVA; goto opt; } "js" end { *lang = Lang::JS; goto opt; } "ocaml" end { *lang = Lang::OCAML; goto opt; } + "php" end { *lang = Lang::PHP; goto opt; } "python" end { *lang = Lang::PYTHON; goto opt; } "rust" end { *lang = Lang::RUST; goto opt; } "v" end { *lang = Lang::V; goto opt; } diff --git a/src/parse/conf_parser.ypp b/src/parse/conf_parser.ypp index 1cf492d50..e856720d2 100644 --- a/src/parse/conf_parser.ypp +++ b/src/parse/conf_parser.ypp @@ -10,6 +10,7 @@ #include "src/default_syntax_java.h" #include "src/default_syntax_js.h" #include "src/default_syntax_ocaml.h" +#include "src/default_syntax_php.h" #include "src/default_syntax_python.h" #include "src/default_syntax_rust.h" #include "src/default_syntax_v.h" @@ -205,6 +206,7 @@ Ret Input::load_syntax_config(Opt& opts, Lang lang) { case Lang::JAVA: src = DEFAULT_SYNTAX_JAVA; break; case Lang::JS: src = DEFAULT_SYNTAX_JS; break; case Lang::OCAML: src = DEFAULT_SYNTAX_OCAML; break; + case Lang::PHP: src = DEFAULT_SYNTAX_PHP; break; case Lang::PYTHON: src = DEFAULT_SYNTAX_PYTHON; break; case Lang::RUST: src = DEFAULT_SYNTAX_RUST; break; case Lang::V: src = DEFAULT_SYNTAX_V; break;