Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 154 additions & 0 deletions kumir/KumirLexer.g4
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
// KumirLexer.g4
// ANTLR v4 Lexer Grammar for the Kumir language.
// Source: Refined based on official documentation and extensive testing
// against K.Y. Polyakov's examples. Developed collaboratively.
// Author: [Your Name/GitHub Handle]
// License: MIT License

lexer grammar KumirLexer;

// --- Keywords (Core Language) ---
// Keywords are case-insensitive (both lowercase and uppercase Cyrillic are matched).
MODULE : ('модуль' | 'МОДУЛЬ');
ENDMODULE : ('конец' WS 'модуля' | 'КОНЕЦ' WS 'МОДУЛЯ' | 'конецмодуля' | 'КОНЕЦМОДУЛЯ' | 'конец_модуля' | 'КОНЕЦ_МОДУЛЯ'); // Handles space, no space, underscore
ALG_HEADER : ('алг' | 'АЛГ');
ALG_BEGIN : ('нач' | 'НАЧ');
ALG_END : ('кон' | 'КОН');
PRE_CONDITION : ('дано' | 'ДАНО');
POST_CONDITION : ('надо' | 'НАДО');
ASSERTION : ('утв' | 'УТВ');
LOOP : ('нц' | 'НЦ');
ENDLOOP_COND : ('кц' WS 'при') | 'кц_при' | ('КЦ' WS 'ПРИ') | 'КЦ_ПРИ'; // Handles space and underscore for 'кц при'
ENDLOOP : ('кц' | 'КЦ');
IF : ('если' | 'ЕСЛИ');
THEN : ('то' | 'ТО');
ELSE : ('иначе' | 'ИНАЧЕ');
FI : ('все' | 'ВСЕ'); // End of 'if' or 'switch' block
SWITCH : ('выбор' | 'ВЫБОР');
CASE : ('при' | 'ПРИ'); // Used in 'switch' and 'endloop conditional'
INPUT : ('ввод' | 'ВВОД');
OUTPUT : ('вывод' | 'ВЫВОД');
ASSIGN : ':='; // Assignment operator
EXIT : ('выход' | 'ВЫХОД');
PAUSE : ('пауза' | 'ПАУЗА');
STOP : ('стоп' | 'СТОП');
IMPORT : ('использовать' | 'ИСПОЛЬЗОВАТЬ');
FOR : ('для' | 'ДЛЯ');
WHILE : ('пока' | 'ПОКА');
TIMES : ('раз' | 'РАЗ'); // Used in 'N times' loop
FROM : ('от' | 'ОТ'); // Used in 'for' loop
TO : ('до' | 'ДО'); // Used in 'for' loop
STEP : ('шаг' | 'ШАГ'); // Used in 'for' loop
NEWLINE_CONST : ('нс' | 'НС'); // Newline constant for I/O
NOT : ('не' | 'НЕ'); // Logical NOT
AND : ('и' | 'И'); // Logical AND
OR : ('или' | 'ИЛИ'); // Logical OR
OUT_PARAM : ('рез' | 'РЕЗ'); // Output parameter modifier
IN_PARAM : ('арг' | 'АРГ'); // Input parameter modifier
INOUT_PARAM : ('аргрез' | 'АРГРЕЗ' | 'арг' WS 'рез' | 'АРГ' WS 'РЕЗ' | 'арг_рез' | 'АРГ_РЕЗ'); // In/Out parameter modifier (various spellings)
RETURN_VALUE : ('знач' | 'ЗНАЧ'); // Special variable for function return value

// --- Data Types ---
INTEGER_TYPE : ('цел' | 'ЦЕЛ');
REAL_TYPE : ('вещ' | 'ВЕЩ');
BOOLEAN_TYPE : ('лог' | 'ЛОГ');
CHAR_TYPE : ('сим' | 'СИМ');
STRING_TYPE : ('лит' | 'ЛИТ');
TABLE_SUFFIX : 'таб' | 'ТАБ'; // Suffix for tables (e.g., "цел таб")
// Actor-specific types
KOMPL_TYPE : ('компл' | 'КОМПЛ'); // Complex numbers
COLOR_TYPE : ('цвет' | 'ЦВЕТ'); // Color type (Painter actor)
SCANCODE_TYPE : ('сканкод' | 'СКАНКОД'); // Key scancode (Keyboard actor)
FILE_TYPE : ('файл' | 'ФАЙЛ'); // File type
// Explicit array/table types (handle variations with space, no space, underscore)
INTEGER_ARRAY_TYPE : ('цел' WS? 'таб' | 'ЦЕЛ' WS? 'ТАБ' | 'цел_таб' | 'ЦЕЛ_ТАБ');
REAL_ARRAY_TYPE : ('вещ' WS? 'таб' | 'ВЕЩ' WS? 'ТАБ' | 'вещ_таб' | 'ВЕЩ_ТАБ');
CHAR_ARRAY_TYPE : ('сим' WS? 'таб' | 'СИМ' WS? 'ТАБ' | 'сим_таб' | 'СИМ_ТАБ');
STRING_ARRAY_TYPE : ('лит' WS? 'таб' | 'ЛИТ' WS? 'ТАБ' | 'лит_таб' | 'ЛИТ_ТАБ');
BOOLEAN_ARRAY_TYPE : ('лог' WS? 'таб' | 'ЛОГ' WS? 'ТАБ' | 'лог_таб' | 'ЛОГ_ТАБ');

// --- Constants ---
TRUE : ('да' | 'ДА');
FALSE : ('нет' | 'НЕТ');
// Color constants
PROZRACHNIY : ('прозрачный' | 'ПРОЗРАЧНЫЙ');
BELIY : ('белый' | 'БЕЛЫЙ');
CHERNIY : ('чёрный' | 'черный' | 'ЧЁРНЫЙ' | 'ЧЕРНЫЙ');
SERIY : ('серый' | 'СЕРЫЙ');
FIOLETOVIY : ('фиолетовый' | 'ФИОЛЕТОВЫЙ');
SINIY : ('синий' | 'СИНИЙ');
GOLUBOY : ('голубой' | 'ГОЛУБОЙ');
ZELENIY : ('зелёный' | 'зеленый' | 'ЗЕЛЁНЫЙ' | 'ЗЕЛЕНЫЙ');
ZHELTIY : ('жёлтый' | 'желтый' | 'ЖЁЛТЫЙ' | 'ЖЕЛТЫЙ');
ORANZHEVIY : ('оранжевый' | 'ОРАНЖЕВЫЙ');
KRASNIY : ('красный' | 'КРАСНЫЙ');

// --- Operators ---
POWER : '**'; // Power operator
GE : '>=' | '≥'; // Greater than or equal
LE : '<=' | '≤'; // Less than or equal
NE : '<>' | '≠'; // Not equal
PLUS : '+';
MINUS : '-';
MUL : '*';
DIV : '/';
EQ : '='; // Equal (used in comparisons and initialization)
LT : '<';
GT : '>';
LPAREN : '('; // Left parenthesis
RPAREN : ')'; // Right parenthesis
LBRACK : '['; // Left square bracket (arrays)
RBRACK : ']'; // Right square bracket (arrays)
LBRACE : '{'; // Left curly brace (array literals)
RBRACE : '}'; // Right curly brace (array literals)
COMMA : ','; // Comma
COLON : ':'; // Colon (array bounds, output format)
SEMICOLON : ';'; // Semicolon (statement separator)
ATAT : '@@'; // Double at-sign (special teacher functions)
AT : '@'; // At-sign (used in identifiers)

// --- Literals ---
// Character literal: single character in single quotes
CHAR_LITERAL : '\'' ( EscapeSequence | ~['\\\r\n] ) '\'' ;
// String literal: sequence of characters in double or single quotes
STRING : '"' ( EscapeSequence | ~["\\\r\n] )*? '"'
| '\'' ( EscapeSequence | ~['\\\r\n] )*? '\''
;
// Real literal: with decimal point or in exponential form
REAL : (DIGIT+ '.' DIGIT* | '.' DIGIT+) ExpFragment? // 123. , .5 , 123.45
| DIGIT+ ExpFragment // 123e4
;
// Integer literal: decimal or hexadecimal (starts with $)
INTEGER : DecInteger | HexInteger ;

// --- Identifier ---
// Variable, algorithm, module name, etc.
// Starts with a letter, followed by letters, digits, '_', or '@'.
ID : LETTER (LETTER | DIGIT | '_' | '@')* ;

// --- Comments ---
// Comments starting with '|' or '#' are ignored by the parser.
// The comment text is sent to the HIDDEN channel.
// Does NOT consume the trailing newline.
LINE_COMMENT : '|' ~[\r\n]* -> channel(HIDDEN); // Single-line comment
DOC_COMMENT : '#' ~[\r\n]* -> channel(HIDDEN); // Documentation comment

// --- Whitespace ---
// Spaces, tabs, newlines are skipped by the lexer.
WS : [ \t\r\n]+ -> skip;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use the previously introduced WS_FRAGMENT:

Suggested change
WS : [ \t\r\n]+ -> skip;
WS : WS_FRAGMENT+ -> skip;


// --- Fragments ---
// Fragments are helper rules for the lexer and do not produce tokens themselves.
fragment DIGIT : [0-9];
fragment HEX_DIGIT : [0-9a-fA-F];
// Letter (Latin or Cyrillic, including ё/Ё)
fragment LETTER : [a-zA-Zа-яА-ЯёЁ];
fragment DecInteger : DIGIT+;
fragment HexInteger : '$' HEX_DIGIT+;
// Exponential part of a real number (e.g., e+10, E-5, е-3)
fragment ExpFragment: [eEеЕ] [+-]? DIGIT+;
// Escape sequences within strings/chars (e.g., \n, \t, \')
fragment EscapeSequence
: '\\' [btnfr"'\\] // \b, \t, \n, \f, \r, \", \', \\
;
// End of KumirLexer.g4
Loading
Loading