Skip to content

Commit 7ac6a95

Browse files
authored
[MLIR][Pygments] Refine the pygments MLIR lexer (#166406)
Recently, the MLIR website added API documentation for the Python bindings generated via Sphinx ([https://mlir.llvm.org/python-bindings/](https://mlir.llvm.org/python-bindings/)). In [https://github.com/llvm/mlir-www/pull/245](https://github.com/llvm/mlir-www/pull/245), I introduced the Pygments lexer from the MLIR repository to enable syntax highlighting for MLIR code blocks in these API docs. However, since the existing Pygments lexer was fairly minimal, it didn’t fully handle all aspects of the MLIR syntax, leading to imperfect highlighting in some cases. In this PR, I used ChatGPT to rewrite the lexer by combining it with the TextMate grammar for MLIR ([https://github.com/llvm/llvm-project/blob/main/mlir/utils/textmate/mlir.json](https://github.com/llvm/llvm-project/blob/main/mlir/utils/textmate/mlir.json)). After some manual adjustments, the results look good—so I’m submitting this to improve the syntax highlighting experience in the Python bindings API documentation.
1 parent cdc3cb2 commit 7ac6a95

File tree

1 file changed

+117
-22
lines changed

1 file changed

+117
-22
lines changed

mlir/utils/pygments/mlir_lexer.py

Lines changed: 117 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -2,37 +2,132 @@
22
# See https://llvm.org/LICENSE.txt for license information.
33
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
44

5-
from pygments.lexer import RegexLexer
5+
from pygments.lexer import RegexLexer, bygroups, include, using
66
from pygments.token import *
7+
import re
78

89

910
class MlirLexer(RegexLexer):
    """Pygments lexer for MLIR.

    This lexer focuses on accurate tokenization of common MLIR constructs:
    - SSA values (e.g. %0, %name, %res:2, %arg#1)
    - attribute and type aliases (#name =, !name =)
    - types (builtin and dialect types, parametric types)
    - attribute dictionaries and nested containers to a reasonable depth
    - numbers (ints, floats with exponents, hex)
    - strings with common escapes
    - line comments (// ...)
    - block labels (^foo) and operations

    Rules within a state are tried in order and the first match wins, so
    more specific rules must stay above the generic ones below them.
    """

    name = "MLIR"
    aliases = ["mlir"]
    filenames = ["*.mlir"]

    # '^' and '$' in the rules below are meant to match at every line.
    flags = re.MULTILINE

    class VariableList(RegexLexer):
        """Lexer for lists of SSA variables separated by commas.

        Used through ``using(VariableList)`` to re-lex the result list on
        the left-hand side of an operation assignment.
        """

        tokens = {
            "root": [
                (r"\s+", Text),
                (r",", Punctuation),
                (r"%[_A-Za-z0-9\.\$\-:#]+", Name.Variable),
            ]
        }

    tokens = {
        "root": [
            # Comments
            (r"//.*?$", Comment.Single),
            # operation name with assignment: %... = op.name
            (
                r"^(\s*)(%[\%_A-Za-z0-9\:#\,\s]+)(=)(\s*)([A-Za-z0-9_\.\$\-]+)\b",
                bygroups(Text, using(VariableList), Operator, Text, Name.Builtin),
            ),
            # operation name without result
            (r"^(\s*)([A-Za-z0-9_\.\$\-]+)\b(?=[^<:])", bygroups(Text, Name.Builtin)),
            # Attribute alias definition: #name =
            (
                r"^(\s*)(#[_A-Za-z0-9\$\-\.]+)(\b)(\s*=)",
                bygroups(Text, Name.Constant, Text, Operator),
            ),
            # Type alias definition: !name =
            (
                r"^(\s*)(![_A-Za-z0-9\$\-\.]+)(\b)(\s*=)",
                bygroups(Text, Keyword.Type, Text, Operator),
            ),
            # SSA values (uses)
            (r"%[_A-Za-z0-9\.\$\-:#]+", Name.Variable),
            # attribute refs, constants and named attributes
            (r"#[_A-Za-z0-9\$\-\.]+\b", Name.Constant),
            # symbol refs / function-like names
            (r"@[_A-Za-z][_A-Za-z0-9\$\-\.]*\b", Name.Function),
            # blocks
            (r"\^[A-Za-z0-9_\$\.\-]+", Name.Label),
            # types by exclamation or builtin names
            (r"![_A-Za-z0-9\$\-\.]+\b", Keyword.Type),
            # NOTE: please sync changes to corresponding builtin type rule in "angled-type"
            (r"\b(bf16|f16|f32|f64|f80|f128|index|none|(u|s)?i[0-9]+)\b", Keyword.Type),
            # container-like dialect types (tensor<...>, memref<...>, vector<...>)
            (
                r"\b(complex|memref|tensor|tuple|vector)\s*(<)",
                bygroups(Keyword.Type, Punctuation),
                "angled-type",
            ),
            # affine constructs
            (r"\b(affine_map|affine_set)\b", Keyword.Reserved),
            # common builtin operators / functions inside affine_map
            (r"\b(ceildiv|floordiv|mod|symbol)\b", Name.Other),
            # identifiers / bare words
            (r"\b[_A-Za-z][_A-Za-z0-9\.-]*\b", Name.Other),
            # numbers: hex, float (with exponent), integer
            (r"\b0x[0-9A-Fa-f]+\b", Number.Hex),
            # A float must contain a '.' or an exponent. If both were optional
            # this rule would also match plain integers and the Number.Integer
            # rule below could never fire.
            (
                r"\b[0-9]+(?:\.[0-9]*(?:[eE][+-]?[0-9]+)?|[eE][+-]?[0-9]+)"
                r"(?![A-Za-z0-9_])",
                Number.Float,
            ),
            (r"\b[0-9]+\b", Number.Integer),
            # strings
            (r'"', String.Double, "string"),
            # punctuation and arrow-like tokens
            (r"->|>=|<=|=>", Operator),
            (r"[()\[\]{}<>,.:=]", Punctuation),
            # operators
            (r"[-+*/%]", Operator),
        ],
        # string state with common escapes
        "string": [
            (r'\\[ntr"\\]', String.Escape),
            (r'[^"\\]+', String.Double),
            (r'"', String.Double, "#pop"),
        ],
        # angled-type content
        "angled-type": [
            # match nested '<' and '>'
            (r"<", Punctuation, "#push"),
            (r">", Punctuation, "#pop"),
            # dimensions like 3x or 3x3x... and standalone numbers (sizes);
            # the 'x' separator itself is handled by its own rule below
            (r"[0-9]+", Number.Integer),
            # dynamic dimension '?': it stands in for a dimension size, so it
            # gets the same (standard) token type as the sizes above
            (r"\?", Number.Integer),
            # the 'x' dimension separator (treat as punctuation)
            (r"x", Punctuation),
            # element / builtin types inside angle brackets (no word-boundary,
            # because the type directly follows the 'x' separator: 4xf32)
            # NOTE: please sync changes to corresponding builtin type rule in "root"
            (
                r"(?:bf16|f16|f32|f64|f80|f128|index|none|(?:[us]?i[0-9]+))",
                Keyword.Type,
            ),
            # also allow nested container-like types to be recognized
            (
                r"\b(complex|memref|tensor|tuple|vector)\s*(<)",
                bygroups(Keyword.Type, Punctuation),
                "angled-type",
            ),
            # fall back to root rules for anything else
            include("root"),
        ],
    }

0 commit comments

Comments
 (0)