feat: antlr4 profiling sparksql with java

liuxy0551 · liuxy0551 · commit 17c4aa2bb2e4 · 2025-11-19T19:32:49.000+08:00
diff --git a/docker/README.md b/docker/README.md
@@ -0,0 +1,38 @@
+
+## 使用方式
+
+1. 构建镜像
+``` sh
+docker build -f ./docker/antlr4-profiling.Dockerfile -t antlr4-profiling .
+```
+
+2. 运行容器
+``` sh
+docker run -d -it --name antlr4-profiling -v ./src/grammar:/grammar antlr4-profiling
+```
+
+3. 进入容器
+``` sh
+docker exec -it antlr4-profiling bash
+```
+
+> **注意**：以下命令（antlr4、javac、java）都需要先进入下面的目录再执行，否则会因找不到 Java 类而报错
+
+``` sh
+cd /grammar/spark
+```
+
+4. 在容器中执行，生成 Java 版解析器
+``` sh
+antlr4 -Dlanguage=Java -visitor -no-listener ./SparkSqlLexer.g4 ./SparkSqlParser.g4
+```
+
+5. 编译 Java 文件
+``` sh
+javac -cp .:/usr/local/lib/antlr-4.13.1-complete.jar SparkSqlProfiling.java
+```
+
+6. 运行 Java 程序
+``` sh
+java -cp .:/usr/local/lib/antlr-4.13.1-complete.jar SparkSqlProfiling "SELECT * FROM a WHERE b = 1"
+```
diff --git a/docker/antlr4-profiling.Dockerfile b/docker/antlr4-profiling.Dockerfile
@@ -0,0 +1,19 @@
+FROM registry.cn-hangzhou.aliyuncs.com/liuxy0551/eclipse-temurin:17-jdk-jammy
+
+# Install the tools used inside the container; remove the apt package lists
+# afterwards so they are not kept in the image layer.
+RUN apt-get update \
+    && apt-get install -y curl unzip vim \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Download the ANTLR "complete" jar straight to its final location.
+# -f makes curl exit non-zero on an HTTP error instead of saving the error page
+# as the jar; -L follows redirects; -sS hides the progress bar but keeps errors.
+RUN curl -fsSL -o /usr/local/lib/antlr-4.13.1-complete.jar \
+    https://www.antlr.org/download/antlr-4.13.1-complete.jar
+
+# Put the current directory and the ANTLR jar on the classpath. ENV (rather than
+# an export in ~/.bashrc) makes it visible to non-interactive commands as well.
+ENV CLASSPATH=".:/usr/local/lib/antlr-4.13.1-complete.jar"
+
+# Convenience aliases for interactive bash sessions.
+RUN echo 'alias antlr4="java -jar /usr/local/lib/antlr-4.13.1-complete.jar"' >> ~/.bashrc \
+    && echo 'alias grun="java org.antlr.v4.gui.TestRig"' >> ~/.bashrc
+
+# Working directory (the grammar sources are expected to be mounted here)
+WORKDIR /grammar
+
+# Keep bash as the default command
+CMD ["bash"]
diff --git a/src/grammar/spark/SparkSqlLexer.g4 b/src/grammar/spark/SparkSqlLexer.g4
@@ -29,23 +29,6 @@ options {
     caseInsensitive= true;
 }
 
-@members {
-  /**
-   * When true, parser should throw ParseException for unclosed bracketed comment.
-   */
-  public has_unclosed_bracketed_comment = false;
-
-  /**
-   * This method will be called when the character stream ends and try to find out the
-   * unclosed bracketed comment.
-   * If the method be called, it means the end of the entire character stream match,
-   * and we set the flag and fail later.
-   */
-  public markUnclosedComment() {
-    this.has_unclosed_bracketed_comment = true;
-  }
-}
-
 SEMICOLON: ';';
 
 LEFT_PAREN    : '(';
@@ -478,8 +461,7 @@ fragment LETTER: [A-Z];
 
 LINE_COMMENT: '--' ('\\\n' | ~[\r\n])* '\r'? '\n'? -> channel(HIDDEN);
 
-BRACKETED_COMMENT:
-    '/*' (BRACKETED_COMMENT | .)*? ('*/' | {this.markUnclosedComment();} EOF) -> channel(HIDDEN);
+// Matches a /* ... */ comment (bracketed comments may nest) and sends it to the
+// HIDDEN channel. The closing */ is mandatory: an unterminated comment is not
+// matched by this rule.
+BRACKETED_COMMENT: '/*' (BRACKETED_COMMENT | .)*? '*/' -> channel(HIDDEN);
 
 WHITE_SPACE: (' ' | '\t' | '\r' | '\n') -> channel(HIDDEN);
 
diff --git a/src/grammar/spark/SparkSqlParser.g4 b/src/grammar/spark/SparkSqlParser.g4
@@ -27,11 +27,6 @@ parser grammar SparkSqlParser;
 options {
     tokenVocab=SparkSqlLexer;
     caseInsensitive= true;
-    superClass=SQLParserBase;
-}
-
-@header {
-import { SQLParserBase } from '../SQLParserBase';
 }
 
 program
@@ -415,7 +410,6 @@ viewName
 
 columnName
     : multipartIdentifier
-    | {this.shouldMatchEmpty()}?
     ;
 
 columnNamePath
diff --git a/src/grammar/spark/SparkSqlProfiling.java b/src/grammar/spark/SparkSqlProfiling.java
@@ -0,0 +1,29 @@
+import org.antlr.v4.runtime.*;
+import org.antlr.v4.runtime.atn.PredictionMode;
+
+public class SparkSqlProfiling {
+    public static void main(String[] args) throws Exception {
+        if(args.length == 0){
+            System.out.println("请传入 SQL 测试语句，例如: java SparkSqlProfiling \"SELECT * FROM a WHERE b = 1\"");
+            return;
+        }
+
+        String sql = String.join(" ", args);
+
+        // 创建 Lexer & Parser
+        SparkSqlLexer lexer = new SparkSqlLexer(CharStreams.fromString(sql));
+        CommonTokenStream tokens = new CommonTokenStream(lexer);
+        SparkSqlParser parser = new SparkSqlParser(tokens);
+
+        // 开启 LL 回溯性能分析
+        parser.getInterpreter().setPredictionMode(PredictionMode.LL_EXACT_AMBIG_DETECTION);
+        parser.addErrorListener(new DiagnosticErrorListener(true));
+
+        // 入口规则
+        parser.singleStatement();
+
+        // 输出 profiling 信息
+        System.out.println(parser.getParseInfo());
+    }
+}
+