
Commit e0d1edc

enable compilation cache with grok (#395)
1 parent ec1f2b2 commit e0d1edc

3 files changed: +12 additions, -19 deletions

example/grok/README.md

Lines changed: 2 additions & 0 deletions
@@ -2,6 +2,8 @@
 
 Loading and running the Grok-1 open-weights model from [Grok-1](https://github.com/xai-org/grok-1)
 
+Running the Grok-1 model requires a GPU device with at least 8 tiles.
+
 ## 1. Install intel-extension-for-openxla
 
 Please go to the [main page](https://github.com/intel/intel-extension-for-openxla/blob/main/README.md#build-and-install) and follow the instructions to build and install intel-extension-for-openxla.
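The added requirement pins a hardware floor for this example. As a quick pre-flight check, the sketch below (not part of this commit) assumes each GPU tile is exposed to JAX as its own device:

import jax

# Grok-1 in this example is sharded across the tiles, so at least
# 8 JAX devices must be visible before launching a run.
devices = jax.devices()
print(f"{len(devices)} devices visible")
assert len(devices) >= 8, "Grok-1 needs at least an 8-tile GPU device"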

example/grok/inference.py

Lines changed: 8 additions & 0 deletions
@@ -3,6 +3,7 @@
 import time
 import json
 import os
+import jax
 
 from model import LanguageModelConfig, TransformerConfig
 from runners import InferenceRunner, ModelRunner, sample_from_model
@@ -12,9 +13,15 @@ def main(args):
     num_warmup = args.num_warmup
     input_tokens = args.input_tokens
     max_new_tokens = args.max_new_tokens
+    compilation_cache = args.compilation_cache
     input_len = int(input_tokens)
 
     current_path = str(os.path.dirname(__file__))
+
+    if compilation_cache:
+        COMPILATION_CACHE_PATH = current_path + "/compilation_cache/"
+        jax.config.update("jax_compilation_cache_dir", COMPILATION_CACHE_PATH)
+
     CKPT_PATH = current_path + "/checkpoints/"
     with open(current_path + "/prompt.json") as f:
         content = f.read()
@@ -86,5 +93,6 @@ def main(args):
     parser.add_argument("--num-warmup", default=1, type=int, help="num warmup")
     parser.add_argument("--input-tokens", default="32", choices=["32", "64", "128", "256", "512", "1024", "2016", "2017", "2048", "4096", "8192"], type=str, help="input tokens length if needed from prompt.json")
     parser.add_argument("--max-new-tokens", default=32, type=int, help="output max new tokens")
+    parser.add_argument("--compilation-cache", action="store_true", help="enable the JAX persistent compilation cache")
     args = parser.parse_args()
     main(args)
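The heart of the change is the jax_compilation_cache_dir setting, which enables JAX's persistent compilation cache: compiled XLA executables are serialized to disk and reused across processes, so the long warm-up compilation is paid only once. A minimal standalone sketch of that behavior (the function, shapes, and cache path are illustrative, not taken from the Grok example):

import jax
import jax.numpy as jnp

# Point the persistent compilation cache at a directory (hypothetical path).
jax.config.update("jax_compilation_cache_dir", "/tmp/jax_cache")

@jax.jit
def matmul(x):
    return jnp.dot(x, x.T)

x = jnp.ones((1024, 1024))
matmul(x)  # first call: compiles; large enough programs are persisted to disk
matmul(x)  # later calls in this process hit the in-memory cache

A fresh process pointed at the same cache directory deserializes the stored executable instead of recompiling, which is what makes repeated Grok-1 launches cheaper; note that recent JAX versions skip persisting compilations that fall below a minimum-compile-time threshold. With the flag defined as above, the cache is enabled by running, e.g., python inference.py --compilation-cache.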

third_party/openxla.patch

Lines changed: 2 additions & 19 deletions
@@ -1974,7 +1974,7 @@ index 0aa610fc9..3c4b34ace 100644
     MatrixIsColumnMajor(instr, gemm_backend_config));
 
 diff --git a/xla/service/gpu/gpu_compiler.cc b/xla/service/gpu/gpu_compiler.cc
-index d0c20aa1c..86ac26006 100644
+index d0c20aa1c..98ce30ebe 100644
 --- a/xla/service/gpu/gpu_compiler.cc
 +++ b/xla/service/gpu/gpu_compiler.cc
 @@ -209,6 +209,7 @@ limitations under the License.
@@ -2115,24 +2115,7 @@ index d0c20aa1c..86ac26006 100644
 }
 
 HloCostAnalysis::ShapeSizeFunction GpuCompiler::ShapeSizeBytesFunction() const {
-@@ -2148,6 +2175,7 @@ HloCostAnalysis::ShapeSizeFunction GpuCompiler::ShapeSizeBytesFunction() const {
-
- absl::StatusOr<std::unique_ptr<AotCompilationResult>> GpuCompiler::Export(
-     Executable* executable) const {
-+#if 0
-   auto* gpu_executable = tensorflow::down_cast<GpuExecutable*>(executable);
-   if (!gpu_executable) return Internal("GpuExecutable is null");
-
-@@ -2155,6 +2183,8 @@ absl::StatusOr<std::unique_ptr<AotCompilationResult>> GpuCompiler::Export(
-       &gpu_executable->module(), gpu_executable->buffer_assignment(),
-       gpu_executable->text(), gpu_executable->binary(),
-       gpu_executable->dnn_compiled_graphs());
-+#endif
-+  LOG(FATAL) << "GpuCompiler::Export is not implemented";
- }
-
- absl::Status GpuCompiler::RunPostSchedulingPipelines(
-@@ -2215,13 +2245,18 @@ absl::Status GpuCompiler::RunPostSchedulingPipelines(
+@@ -2215,13 +2242,18 @@ absl::Status GpuCompiler::RunPostSchedulingPipelines(
   auto driver_version = se::gpu::GpuDriver::GetDriverVersion();
 #if GOOGLE_CUDA
   constexpr int toolkit_version = CUDA_VERSION;
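This patch change is what lets the cache work on the Intel backend: the deleted lines had wrapped the body of GpuCompiler::Export in #if 0 and stubbed it with LOG(FATAL), and Export is the hook the compilation cache relies on to serialize a compiled GpuExecutable (presumably why this commit pairs the two changes), so restoring the stock implementation allows cache entries to be written. A hedged way to confirm the cache is being populated, reusing the directory layout from inference.py above:

import os

# Assumes inference.py was already run once with --compilation-cache;
# the path mirrors COMPILATION_CACHE_PATH from the diff above.
cache_dir = os.path.join(os.path.dirname(__file__), "compilation_cache")
entries = os.listdir(cache_dir)
print(f"{len(entries)} cache entries")  # should be non-empty after a cached run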

0 commit comments
