feat: antlr4 profiling sparksql with java

liuxy0551 · liuxy0551 · commit 6085fdfd47ef · 2025-11-19T19:42:57.000+08:00
diff --git a/docker/README.md b/docker/README.md
@@ -0,0 +1,42 @@
+
+## 使用方式
+
+1. 构建镜像
+``` sh
+docker build -f ./docker/antlr4-profiling.Dockerfile -t antlr4-profiling .
+```
+
+2. 运行容器
+``` sh
+docker run -d -it --name antlr4-profiling -v "$(pwd)/src/grammar":/grammar antlr4-profiling
+```
+
+3. 进入容器
+``` sh
+docker exec -it antlr4-profiling bash
+```
+
+> **注意**
+> 1. 进行 ANTLR4 Profiling 时，需要先删除语法文件中与 TypeScript 相关的代码，否则生成或编译 Java 解析器时会报错。如下：
+>    - SparkSqlLexer.g4 需要删除 @members 的内容，以及 BRACKETED_COMMENT 规则中调用 markUnclosedComment 的动作；SparkSqlParser.g4 需要删除 superClass 选项、@header 和 shouldMatchEmpty 的内容
+>
+> 2. 以下命令需要在容器内的指定目录下执行，否则会因找不到 Java 类而报错
+
+``` sh
+cd /grammar/spark
+```
+
+4. 在容器中执行，生成 Java 版解析器
+``` sh
+antlr4 -Dlanguage=Java -visitor -no-listener ./SparkSqlLexer.g4 ./SparkSqlParser.g4
+```
+
+5. 编译 Java 文件
+``` sh
+javac -cp .:/usr/local/lib/antlr-4.13.1-complete.jar SparkSqlProfiling.java
+```
+
+6. 运行 Java 程序
+``` sh
+java -cp .:/usr/local/lib/antlr-4.13.1-complete.jar SparkSqlProfiling "SELECT * FROM a WHERE b = 1"
+```
diff --git a/docker/antlr4-profiling.Dockerfile b/docker/antlr4-profiling.Dockerfile
@@ -0,0 +1,19 @@
+FROM registry.cn-hangzhou.aliyuncs.com/liuxy0551/eclipse-temurin:17-jdk-jammy
+
+# Install the helper tools, then drop the apt caches and package index to keep the layer small
+RUN apt-get update && apt-get install -y curl unzip vim && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Download the ANTLR jar (-f: fail the build on HTTP errors instead of saving the error page; -L: follow redirects)
+RUN curl -fSL -o /usr/local/lib/antlr-4.13.1-complete.jar \
+    https://www.antlr.org/download/antlr-4.13.1-complete.jar
+
+# Environment: CLASSPATH is set via ENV so it also applies to non-interactive `docker exec`; the aliases are for interactive bash
+ENV CLASSPATH=".:/usr/local/lib/antlr-4.13.1-complete.jar"
+RUN echo 'alias antlr4="java -jar /usr/local/lib/antlr-4.13.1-complete.jar"' >> ~/.bashrc \
+    && echo 'alias grun="java org.antlr.v4.gui.TestRig"' >> ~/.bashrc
+
+# Working directory (the README mounts the grammar sources here)
+WORKDIR /grammar
+
+# Keep bash as the default command
+CMD ["bash"]
diff --git a/src/grammar/spark/SparkSqlLexer.g4 b/src/grammar/spark/SparkSqlLexer.g4
@@ -29,23 +29,6 @@ options {
     caseInsensitive= true;
 }
 
-@members {
-  /**
-   * When true, parser should throw ParseException for unclosed bracketed comment.
-   */
-  public has_unclosed_bracketed_comment = false;
-
-  /**
-   * This method will be called when the character stream ends and try to find out the
-   * unclosed bracketed comment.
-   * If the method be called, it means the end of the entire character stream match,
-   * and we set the flag and fail later.
-   */
-  public markUnclosedComment() {
-    this.has_unclosed_bracketed_comment = true;
-  }
-}
-
 SEMICOLON: ';';
 
 LEFT_PAREN    : '(';
@@ -478,8 +461,7 @@ fragment LETTER: [A-Z];
 
 LINE_COMMENT: '--' ('\\\n' | ~[\r\n])* '\r'? '\n'? -> channel(HIDDEN);
 
-BRACKETED_COMMENT:
-    '/*' (BRACKETED_COMMENT | .)*? ('*/' | {this.markUnclosedComment();} EOF) -> channel(HIDDEN);
+BRACKETED_COMMENT: '/*' (BRACKETED_COMMENT | .)*? '*/' -> channel(HIDDEN);
 
 WHITE_SPACE: (' ' | '\t' | '\r' | '\n') -> channel(HIDDEN);
 
diff --git a/src/grammar/spark/SparkSqlParser.g4 b/src/grammar/spark/SparkSqlParser.g4
@@ -27,11 +27,6 @@ parser grammar SparkSqlParser;
 options {
     tokenVocab=SparkSqlLexer;
     caseInsensitive= true;
-    superClass=SQLParserBase;
-}
-
-@header {
-import { SQLParserBase } from '../SQLParserBase';
 }
 
 program
@@ -415,7 +410,6 @@ viewName
 
 columnName
     : multipartIdentifier
-    | {this.shouldMatchEmpty()}?
     ;
 
 columnNamePath