|
2 | 2 | # See https://llvm.org/LICENSE.txt for license information. |
3 | 3 | # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
4 | 4 |
|
5 | | -from pygments.lexer import RegexLexer |
| 5 | +from pygments.lexer import RegexLexer, bygroups, include, using |
6 | 6 | from pygments.token import * |
| 7 | +import re |
7 | 8 |
|
8 | 9 |
|
class MlirLexer(RegexLexer):
    """Pygments lexer for MLIR.

    This lexer focuses on accurate tokenization of common MLIR constructs:
    - SSA values (%name, %0, %name:2)
    - attribute and type aliases (#name =, !name =)
    - types (builtin and dialect types, parametric types)
    - attribute dictionaries and nested containers to a reasonable depth
    - numbers (ints, floats with exponents, hex)
    - strings with common escapes
    - line comments (// ...)
    - block labels (^foo) and operations

    Rule order matters: within a state the first matching rule wins, so the
    line-anchored rules come first and the generic identifier / punctuation
    rules come last.
    """

    name = "MLIR"
    aliases = ["mlir"]
    filenames = ["*.mlir"]

    # "^" and "$" in the rules below must anchor at every line, not only at
    # the beginning/end of the whole input.
    flags = re.MULTILINE

    class VariableList(RegexLexer):
        """Lexer for lists of SSA variables separated by commas.

        Used through ``using(VariableList)`` to re-lex the left-hand side of
        an operation assignment (``%a, %b:2 = ...``).
        """

        tokens = {
            "root": [
                (r"\s+", Text),
                (r",", Punctuation),
                (r"%[_A-Za-z0-9\.\$\-:#]+", Name.Variable),
            ]
        }

    tokens = {
        "root": [
            # Comments
            (r"//.*?$", Comment.Single),
            # operation name with assignment: %... = op.name
            (
                r"^(\s*)(%[\%_A-Za-z0-9\:#\,\s]+)(=)(\s*)([A-Za-z0-9_\.\$\-]+)\b",
                bygroups(Text, using(VariableList), Operator, Text, Name.Builtin),
            ),
            # operation name without result
            (r"^(\s*)([A-Za-z0-9_\.\$\-]+)\b(?=[^<:])", bygroups(Text, Name.Builtin)),
            # Attribute alias definition: #name =
            (
                r"^(\s*)(#[_A-Za-z0-9\$\-\.]+)(\b)(\s*=)",
                bygroups(Text, Name.Constant, Text, Operator),
            ),
            # Type alias definition: !name =
            (
                r"^(\s*)(![_A-Za-z0-9\$\-\.]+)(\b)(\s*=)",
                bygroups(Text, Keyword.Type, Text, Operator),
            ),
            # Horizontal whitespace between tokens. Without this rule every
            # mid-line space or tab matches nothing and is emitted as an Error
            # token. It must come after the "^"-anchored rules (they consume
            # their own leading indentation) and must not consume "\n", so
            # that the next line still starts at a position where "^" matches.
            (r"[^\S\n]+", Text),
            # SSA values (uses)
            (r"%[_A-Za-z0-9\.\$\-:#]+", Name.Variable),
            # attribute refs, constants and named attributes
            (r"#[_A-Za-z0-9\$\-\.]+\b", Name.Constant),
            # symbol refs / function-like names
            (r"@[_A-Za-z][_A-Za-z0-9\$\-\.]*\b", Name.Function),
            # blocks
            (r"\^[A-Za-z0-9_\$\.\-]+", Name.Label),
            # types by exclamation or builtin names
            (r"![_A-Za-z0-9\$\-\.]+\b", Keyword.Type),
            # NOTE: please sync changes to corresponding builtin type rule in "angled-type"
            (r"\b(bf16|f16|f32|f64|f80|f128|index|none|(u|s)?i[0-9]+)\b", Keyword.Type),
            # container-like dialect types (tensor<...>, memref<...>, vector<...>)
            (
                r"\b(complex|memref|tensor|tuple|vector)\s*(<)",
                bygroups(Keyword.Type, Punctuation),
                "angled-type",
            ),
            # affine constructs
            (r"\b(affine_map|affine_set)\b", Keyword.Reserved),
            # common builtin operators / functions inside affine_map
            (r"\b(ceildiv|floordiv|mod|symbol)\b", Name.Other),
            # identifiers / bare words
            (r"\b[_A-Za-z][_A-Za-z0-9\.-]*\b", Name.Other),
            # numbers: hex, float (with exponent), integer
            (r"\b0x[0-9A-Fa-f]+\b", Number.Hex),
            # A float needs a "." or an exponent; otherwise plain integers
            # would match here first and never reach the Number.Integer rule.
            # The lookahead rejects a literal glued to a following word
            # character (e.g. "1.foo"), leaving "1" to the integer rule.
            (
                r"\b[0-9]+(?:\.[0-9]*(?:[eE][+-]?[0-9]+)?|[eE][+-]?[0-9]+)"
                r"(?![0-9A-Za-z_])",
                Number.Float,
            ),
            (r"\b[0-9]+\b", Number.Integer),
            # strings
            (r'"', String.Double, "string"),
            # punctuation and arrow-like tokens
            (r"->|>=|<=|=>", Operator),
            (r"[()\[\]{}<>,.:=]", Punctuation),
            # operators
            (r"[-+*/%]", Operator),
        ],
        # string state with common escapes
        "string": [
            (r'\\[ntr"\\]', String.Escape),
            # two-hex-digit byte escapes such as "\0A"
            (r"\\[0-9A-Fa-f]{2}", String.Escape),
            (r'[^"\\]+', String.Double),
            (r'"', String.Double, "#pop"),
        ],
        # angled-type content
        "angled-type": [
            # match nested '<' and '>'
            (r"<", Punctuation, "#push"),
            (r">", Punctuation, "#pop"),
            # dimensions (3x3x...) and standalone sizes
            (r"[0-9]+", Number.Integer),
            # dynamic dimension '?': it stands in for a dimension size, so it
            # gets the same standard token type as a numeric dimension
            (r"\?", Number.Integer),
            # the 'x' dimension separator (treat as punctuation)
            (r"x", Punctuation),
            # element / builtin types inside angle brackets (no word-boundary,
            # because the type directly follows the 'x' separator: 4xf32)
            # NOTE: please sync changes to corresponding builtin type rule in "root"
            (
                r"(?:bf16|f16|f32|f64|f80|f128|index|none|(?:[us]?i[0-9]+))",
                Keyword.Type,
            ),
            # also allow nested container-like types to be recognized
            (
                r"\b(complex|memref|tensor|tuple|vector)\s*(<)",
                bygroups(Keyword.Type, Punctuation),
                "angled-type",
            ),
            # fall back to root rules for anything else
            include("root"),
        ],
    }
0 commit comments